diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93cc614..235b25b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -127,3 +127,19 @@ if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
   set_cxx_options(one-sided-gpu)
   set_cxx_standard(one-sided-gpu)
 endif()
+
+if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
+  add_executable(persistent-gpu persistent_gpu.cpp)
+  target_link_libraries(persistent-gpu MPI::MPI_CXX)
+  target_link_libraries(persistent-gpu CUDA::cudart)
+  set_cxx_options(persistent-gpu)
+  set_cxx_standard(persistent-gpu)
+endif()
+
+if (MPI_FOUND AND CMAKE_CUDA_COMPILER)
+  add_executable(send-recv-gpu send_recv_gpu.cpp)
+  target_link_libraries(send-recv-gpu MPI::MPI_CXX)
+  target_link_libraries(send-recv-gpu CUDA::cudart)
+  set_cxx_options(send-recv-gpu)
+  set_cxx_standard(send-recv-gpu)
+endif()
diff --git a/README.md b/README.md
index 80a05ed..8bde800 100644
--- a/README.md
+++ b/README.md
@@ -4,19 +4,23 @@ Various standalone C++ MPI tests/examples/benchmarks.
 If CUDA is detected, additional binaries can be built.
 
-| name              | Kind      | Reqs.    | Ranks | Description |
-|-------------------|-----------|----------|-------|-------------|
-|[hello-world][1]   | Test      | MPI      | 1+    | an MPI hello-world |
-|[one-sided][2]     | Test      | MPI      | 2     | one-sided communication |
-|[one-sided-gpu][3] | Test      | MPI+CUDA | 2     | one-sided communication on GPU buffer |
-|[persistent][4]    | Benchmark | MPI      | 2     | ping-pong time for persistent Send/Recv |
-|[send-recv][5]     | Benchmark | MPI      | 2     | ping-pong time for Send/Recv |
+| name               | Kind      | Reqs.    | Ranks | Description |
+|--------------------|-----------|----------|-------|-------------|
+|[hello-world][1]    | Test      | MPI      | 1+    | an MPI hello-world |
+|[one-sided][2]      | Test      | MPI      | 2     | one-sided communication |
+|[one-sided-gpu][3]  | Test      | MPI+CUDA | 2     | one-sided communication on GPU buffer |
+|[persistent][4]     | Benchmark | MPI      | 2     | ping-pong time for persistent Send/Recv |
+|[persistent-gpu][5] | Benchmark | MPI+CUDA | 2     | ping-pong time for persistent Send/Recv on GPU buffer |
+|[send-recv][6]      | Benchmark | MPI      | 2     | ping-pong time for Send/Recv |
+|[send-recv-gpu][7]  | Benchmark | MPI+CUDA | 2     | ping-pong time for Send/Recv on GPU buffer |
 
 [1]: https://github.com/cwpearson/mpi_test/blob/master/hello_world.cpp
 [2]: https://github.com/cwpearson/mpi_test/blob/master/one_sided.cpp
 [3]: https://github.com/cwpearson/mpi_test/blob/master/one_sided_gpu.cpp
 [4]: https://github.com/cwpearson/mpi_test/blob/master/persistent.cpp
-[5]: https://github.com/cwpearson/mpi_test/blob/master/send_recv.cpp
+[5]: https://github.com/cwpearson/mpi_test/blob/master/persistent_gpu.cpp
+[6]: https://github.com/cwpearson/mpi_test/blob/master/send_recv.cpp
+[7]: https://github.com/cwpearson/mpi_test/blob/master/send_recv_gpu.cpp
 
 ## Build
 
 ```
@@ -54,8 +58,8 @@ Execute any binary you want using `mpirun`, or whatever is appropriate for your
 
 * 1 node: `jsrun -n 1 ./persistent`
 * 2 nodes: `jsrun -n 2 -r 1 -a 1 ./persistent`
-
+* 2 nodes w/GPU: `jsrun --smpi="-gpu" -n 2 -r 1 -g 1 ./send-recv-gpu`
 
 ## Notes on specific platforms
 
-Some Open MPIs use `long long` for their datatypes, which means we can't support ANSI C++ (`-ansi`).
+Some Open MPIs use `long long` for their datatypes, which is not a part of ANSI C++ (`-ansi`).
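The two new `*-gpu` benchmarks hand `cudaMalloc`-ed pointers directly to MPI calls, so they require a CUDA-aware MPI library (which is why the jsrun example above passes `--smpi="-gpu"`). As a minimal sketch, not part of this change and assuming Open MPI's `mpi-ext.h` extension (other MPI implementations may not provide it), CUDA-awareness can be checked at run time before launching the GPU binaries:

```cpp
// Sketch only: query CUDA-aware support, assuming Open MPI's mpi-ext.h extension.
#include <cstdio>
#include <mpi.h>
#if __has_include(<mpi-ext.h>)
#include <mpi-ext.h>
#endif

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
  // library was built with CUDA support; also check the run-time answer
  printf("run-time CUDA-aware support: %d\n", MPIX_Query_cuda_support());
#else
  printf("cannot determine CUDA-aware support from this MPI\n");
#endif
  MPI_Finalize();
  return 0;
}
```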
diff --git a/persistent_gpu.cpp b/persistent_gpu.cpp
new file mode 100644
index 0000000..084b22e
--- /dev/null
+++ b/persistent_gpu.cpp
@@ -0,0 +1,161 @@
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <mpi.h>
+
+inline void checkCuda(cudaError_t result, const char *file, const int line) {
+  if (result != cudaSuccess) {
+    fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), cudaGetErrorString(result));
+    exit(-1);
+  }
+}
+#define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__);
+
+const float sample_target = 200e-6;
+
+struct Sample {
+  double raw;
+  double norm;
+};
+
+static Sample get_sample(int perSample, MPI_Request *sreq, MPI_Request *rreq, int rank, MPI_Comm comm) {
+  Sample sample;
+  MPI_Barrier(comm);
+  double start = MPI_Wtime();
+  for (int i = 0; i < perSample; ++i) {
+    if (0 == rank) {
+      MPI_Start(sreq);
+      MPI_Wait(sreq, MPI_STATUS_IGNORE);
+      MPI_Start(rreq);
+      MPI_Wait(rreq, MPI_STATUS_IGNORE);
+    } else if (1 == rank) {
+      MPI_Start(rreq);
+      MPI_Wait(rreq, MPI_STATUS_IGNORE);
+      MPI_Start(sreq);
+      MPI_Wait(sreq, MPI_STATUS_IGNORE);
+    }
+  }
+  double stop = MPI_Wtime();
+  sample.raw = stop - start;
+  sample.norm = sample.raw / perSample;
+  return sample;
+}
+
+int main(int argc, char **argv) {
+  // Initialize the MPI environment
+  MPI_Init(&argc, &argv);
+
+  // Get the number of processes
+  int size, rank;
+  MPI_Comm_size(MPI_COMM_WORLD, &size);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  if (size < 2) {
+    printf("need at least 2 ranks!\n");
+    exit(1);
+  }
+
+  int src = (rank + 1) % 2;
+  int dst = (rank + 1) % 2;
+  int tag = 0;
+  MPI_Request sreq, rreq;
+  int numIters = 100;
+
+
+  std::vector<size_t> sweep{
+      1,
+      64,
+      128,
+      256,
+      512,
+      1 * 1024,
+      2 * 1024,
+      4 * 1024,
+      8 * 1024,
+      16 * 1024,
+      32 * 1024,
+      64 * 1024,
+      128 * 1024,
+      256 * 1024,
+      512 * 1024,
+      1 * 1024 * 1024,
+      2 * 1024 * 1024,
+      4 * 1024 * 1024,
+      8 * 1024 * 1024,
+      16 * 1024 * 1024,
+      32 * 1024 * 1024,
+      64 * 1024 * 1024,
+      128 * 1024 * 1024,
+      256 * 1024 * 1024,
+  };
+
+  if (0 == rank) {
+    printf("bytes,min,max,avg,med\n");
+  }
+
+  for (size_t bytes : sweep) {
+    std::vector<double> samples(numIters);
+    char *buf;
+    CUDA_RUNTIME(cudaMalloc(&buf, bytes));
+
+    MPI_Send_init(buf, bytes, MPI_BYTE, dst, tag, MPI_COMM_WORLD, &sreq);
+    MPI_Recv_init(buf, bytes, MPI_BYTE, src, tag, MPI_COMM_WORLD, &rreq);
+
+    // try to reach 200us / sample
+    int perSample = 1;
+    for (int i = 0; i < 10; ++i) {
+      double sample = get_sample(perSample, &sreq, &rreq, rank, MPI_COMM_WORLD).raw;
+      // estimate number of measurements per sample
+      int guess = sample_target / sample + /*rounding*/0.5;
+      // close half the distance to this estimate
+      perSample += (guess - perSample) * 0.5;
+      if (perSample < 1) perSample = 1;
+      MPI_Bcast(&perSample, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    }
+
+    if (0 == rank) {
+      fprintf(stderr, "sample averaged over %d iterations\n", perSample);
+    }
+
+    for (int i = 0; i < numIters; ++i) {
+      samples[i] = get_sample(perSample, &sreq, &rreq, rank, MPI_COMM_WORLD).norm;
+    }
+
+    // each sample is the max time observed
+    MPI_Allreduce(MPI_IN_PLACE, samples.data(), numIters, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+
+    // bubble sort
+    bool changed = true;
+    while (changed) {
+      changed = false;
+      for (int i = 0; i < numIters - 1; ++i) {
+        if (samples[i] > samples[i+1]) {
+          double tmp = samples[i+1];
+          samples[i+1] = samples[i];
+          samples[i] = tmp;
+          changed = true;
+        }
+      }
+    }
+
+    // average
+    double avg = 0;
+    for (int i = 0; i < numIters; ++i) {
+      avg += samples[i];
+    }
+    avg /= numIters;
+
+    if (0 == rank) {
+ printf("%lu,%e,%e,%e,%e\n", bytes, samples[0], samples[numIters-1], avg, samples[numIters/2]); + } + + MPI_Request_free(&sreq); + MPI_Request_free(&rreq); + CUDA_RUNTIME(cudaFree(buf)); + } + + MPI_Finalize(); + return 0; +} diff --git a/send_recv_gpu.cpp b/send_recv_gpu.cpp new file mode 100644 index 0000000..8fdef03 --- /dev/null +++ b/send_recv_gpu.cpp @@ -0,0 +1,149 @@ +#include +#include +#include + +#include +#include + +inline void checkCuda(cudaError_t result, const char *file, const int line) { + if (result != cudaSuccess) { + fprintf(stderr, "%s:%d: CUDA Runtime Error %d: %s\n", file, line, int(result), cudaGetErrorString(result)); + exit(-1); + } +} +#define CUDA_RUNTIME(stmt) checkCuda(stmt, __FILE__, __LINE__); + +const float sample_target = 200e-6; + +struct Sample { + double raw; + double norm; +}; + +static Sample get_sample(int perSample, void *buf, int bytes, int rank, int other, MPI_Comm comm) { + Sample sample; + int tag = 0; + MPI_Barrier(comm); + double start = MPI_Wtime(); + for (int i = 0; i < perSample; ++i) { + if (0 == rank) { + MPI_Send(buf, bytes, MPI_BYTE, other, tag, comm); + MPI_Recv(buf, bytes, MPI_BYTE, other, tag, comm, MPI_STATUS_IGNORE); + } else if (1 == rank) { + MPI_Recv(buf, bytes, MPI_BYTE, other, tag, comm, MPI_STATUS_IGNORE); + MPI_Send(buf, bytes, MPI_BYTE, other, tag, comm); + } + } + double stop = MPI_Wtime(); + sample.raw = stop-start; + sample.norm = sample.raw / perSample; + return sample; +} + +int main(int argc, char **argv) { + // Initialize the MPI environment + MPI_Init(&argc, &argv); + + // Get the number of processes + int size, rank; + MPI_Comm_size(MPI_COMM_WORLD, &size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + if (size < 2) { + printf("need at least 2 ranks!\n"); + exit(1); + } + + int other = (rank + 1) % 2; + int numIters = 100; + + std::vector sweep{ + 1, + 64, + 128, + 256, + 512, + 1 * 1024, + 2 * 1024, + 4 * 1024, + 8 * 1024, + 16 * 1024, + 32 * 1024, + 64 * 1024, + 128 * 1024, + 256 * 1024, + 512 * 1024, + 1 * 1024 * 1024, + 2 * 1024 * 1024, + 4 * 1024 * 1024, + 8 * 1024 * 1024, + 16 * 1024 * 1024, + 32 * 1024 * 1024, + 64 * 1024 * 1024, + 128 * 1024 * 1024, + 256 * 1024 * 1024, + }; + + if (0 == rank) { + printf("bytes,min,max,avg,med\n"); + } + + for (size_t bytes : sweep) { + std::vector samples(numIters); + char *buf; + CUDA_RUNTIME(cudaMalloc(&buf, bytes)); + + // try to reach 200us / sample + int perSample = 1; + for (int i = 0; i < 10; ++i) { + double sample = get_sample(perSample, buf, bytes, rank, other, MPI_COMM_WORLD).raw; + // estimate number of measurements per sample + int guess = sample_target / sample + /*rounding*/0.5; + // close half the distance to this estimate + perSample += (guess - perSample) * 0.5; + if (perSample < 1) perSample = 1; + MPI_Bcast(&perSample, 1, MPI_INT, 0, MPI_COMM_WORLD); + } + + if (0 == rank) { + fprintf(stderr, "sample averaged over %d iterations\n", perSample); + } + + for (int i = 0; i < numIters; ++i) { + samples[i] = get_sample(perSample, buf, bytes, rank, other, MPI_COMM_WORLD).norm; + } + + // each sample is the max time observed + MPI_Allreduce(MPI_IN_PLACE, samples.data(), numIters, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + + // bubble sort + bool changed = true; + while (changed) { + changed = false; + for (int i = 0; i < numIters - 1; ++i) { + if (samples[i] > samples[i+1]) { + double tmp = samples[i+1]; + samples[i+1] = samples[i]; + samples[i] = tmp; + changed = true; + } + } + } + + // average + double avg = 0; + for (int i = 0; i < numIters; ++i) { + avg += 
+    }
+    avg /= numIters;
+
+    if (0 == rank) {
+      printf("%lu,%e,%e,%e,%e\n", bytes, samples[0], samples[numIters-1], avg, samples[numIters/2]);
+    }
+
+    CUDA_RUNTIME(cudaFree(buf));
+  }
+
+  MPI_Finalize();
+  return 0;
+}
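Both new files compute the min/max/avg/med columns by bubble-sorting the per-iteration `samples`. A minimal sketch of the same statistics using the standard library (an optional simplification, not what the diff does) follows; the `Stats`/`summarize` names are hypothetical:

```cpp
// Sketch only: summarize a vector of per-iteration times as min/max/avg/median.
#include <algorithm>
#include <numeric>
#include <vector>

struct Stats {
  double min, max, avg, med;
};

static Stats summarize(std::vector<double> samples) {
  std::sort(samples.begin(), samples.end());  // ascending, like the bubble sort
  Stats s;
  s.min = samples.front();
  s.max = samples.back();
  s.avg = std::accumulate(samples.begin(), samples.end(), 0.0) / samples.size();
  s.med = samples[samples.size() / 2];  // same index the benchmarks print
  return s;
}
```

`std::sort` is O(n log n) versus the bubble sort's O(n²), though with 100 samples per message size either cost is negligible next to the communication time being measured.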