From 88e53dfa215e0178d4760d955d588308d72da9a5 Mon Sep 17 00:00:00 2001
From: jpekkila
Date: Sun, 5 Apr 2020 17:09:57 +0300
Subject: [PATCH] Added a little program for testing the bandwidths of
 different MPI comm styles on n nodes and processes

---
 samples/bwtest/CMakeLists.txt |   9 ++
 samples/bwtest/main.c         | 207 ++++++++++++++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 samples/bwtest/CMakeLists.txt
 create mode 100644 samples/bwtest/main.c

diff --git a/samples/bwtest/CMakeLists.txt b/samples/bwtest/CMakeLists.txt
new file mode 100644
index 0000000..db7066e
--- /dev/null
+++ b/samples/bwtest/CMakeLists.txt
@@ -0,0 +1,9 @@
+cmake_minimum_required(VERSION 3.17) # Required for modern CUDA::cudart linking
+
+find_package(MPI)
+find_package(OpenMP)
+find_package(CUDAToolkit)
+
+add_compile_options(-O3) # Must precede add_executable to apply to bwtest
+add_executable(bwtest main.c)
+target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart)
diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
new file mode 100644
index 0000000..d3e7303
--- /dev/null
+++ b/samples/bwtest/main.c
@@ -0,0 +1,207 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <mpi.h>
+
+#include <cuda_runtime_api.h>
+
+#include "timer_hires.h" // From src/common
+
+//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
+#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
+
+/*
+    Findings:
+    - MUST ALWAYS SET THE DEVICE. Performance is absolutely killed if the device is not set
+      explicitly
+    - Must use cudaMalloc'd memory for intranode communication, otherwise P2P does not trigger
+      with MPI
+    - For internode communication one should use pinned memory (RDMA is staged through pinned
+      buffers and gives full network speed iff the memory is pinned)
+*/
+
+static uint8_t*
+allocHost(const size_t bytes)
+{
+    uint8_t* arr = malloc(bytes);
+    assert(arr);
+    return arr;
+}
+
+static void
+freeHost(uint8_t* arr)
+{
+    free(arr);
+}
+
+static uint8_t*
+allocDevice(const size_t bytes)
+{
+    uint8_t* arr;
+    // Standard (20 GiB/s internode, 85 GiB/s intranode)
+    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
+    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
+    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
+    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
+    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
+    assert(retval == cudaSuccess);
+    return arr;
+}
+
+static void
+freeDevice(uint8_t* arr)
+{
+    // NOTE: must match the allocation in allocDevice: cudaFreeHost for cudaMallocHost,
+    // cudaFree for cudaMalloc/cudaMallocManaged
+    cudaFreeHost(arr);
+}
+
+static void
+sendrecv_blocking(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    int nfront = (pid + 1) % nprocs;
+    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;
+
+    if (!pid) {
+        MPI_Status status;
+        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
+        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
+    }
+    else {
+        MPI_Status status;
+        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
+        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
+    }
+}
+
+static void
+sendrecv_nonblocking(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    int nfront = (pid + 1) % nprocs;
+    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;
+
+    MPI_Request recv_request, send_request;
+    MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &recv_request);
+    MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD, &send_request);
+
+    MPI_Status status;
+    MPI_Wait(&recv_request, &status);
+    MPI_Wait(&send_request, &status);
+}
+
+static void
+sendrecv_twoway(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    int nfront = (pid + 1) % nprocs;
+    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;
+
+    MPI_Status status;
+    MPI_Sendrecv(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, dst, BLOCK_SIZE, MPI_BYTE, nback, nback,
+                 MPI_COMM_WORLD, &status);
+}
+
+#define PRINT                                                                                      \
+    if (!pid)                                                                                      \
+    printf
+
+static void
+measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*), uint8_t* src,
+          uint8_t* dst)
+{
+    const size_t num_samples = 10;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    PRINT("%s\n", msg);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    PRINT("\tWarming up... ");
+    for (size_t i = 0; i < num_samples / 10; ++i)
+        sendrecv(src, dst);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    PRINT("Done\n");
+
+    PRINT("\tBandwidth... ");
+    fflush(stdout);
+
+    Timer t;
+    MPI_Barrier(MPI_COMM_WORLD);
+    timer_reset(&t);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (size_t i = 0; i < num_samples; ++i)
+        sendrecv(src, dst);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
+    PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
+    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000);
+    MPI_Barrier(MPI_COMM_WORLD);
+}
+
+int
+main(void)
+{
+    // Disable stdout buffering
+    setbuf(stdout, NULL);
+
+    MPI_Init(NULL, NULL);
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    assert(nprocs >= 2); // Require at least one neighbor
+
+    int devices_per_node = -1;
+    cudaGetDeviceCount(&devices_per_node);
+    const int device_id = pid % devices_per_node;
+    cudaSetDevice(device_id);
+
+    printf("Process %d of %d running.\n", pid, nprocs);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    PRINT("Block size: %d MiB\n", BLOCK_SIZE / (1024 * 1024));
+
+    {
+        uint8_t* src = allocHost(BLOCK_SIZE);
+        uint8_t* dst = allocHost(BLOCK_SIZE);
+
+        measurebw("Unidirectional bandwidth, blocking (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
+        measurebw("Bidirectional bandwidth, async (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
+        measurebw("Bidirectional bandwidth, twoway (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
+
+        freeHost(src);
+        freeHost(dst);
+    }
+
+    {
+        uint8_t* src = allocDevice(BLOCK_SIZE);
+        uint8_t* dst = allocDevice(BLOCK_SIZE);
+
+        measurebw("Unidirectional bandwidth, blocking (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
+        measurebw("Bidirectional bandwidth, async (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
+        measurebw("Bidirectional bandwidth, twoway (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
+
+        freeDevice(src);
+        freeDevice(dst);
+    }
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
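
Note on the reported figures (not part of the patch): with the BLOCK_SIZE defined in main.c, one block is 256 * 256 * 3 * 8 * 8 bytes = 12,582,912 bytes = 12 MiB. Every sendrecv variant both sends and receives one block per rank, so measurebw() is called with bytes = 2 * BLOCK_SIZE = 24 MiB, and the printed bandwidth is num_samples * bytes / time_elapsed / 2^30, i.e. 240 MiB of traffic per rank over the 10-sample loop divided by its wall-clock time, in GiB/s.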
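
In main.c, cudaGetDeviceCount() and cudaSetDevice() are not error-checked, so a failed device query would leave devices_per_node at -1 and make pid % devices_per_node meaningless. A minimal sketch of a check that could be dropped in is below; the macro name ERRCHK_CUDA is illustrative and not part of the patch.

/* Sketch only (assumes the same headers as main.c above): abort with a readable
   message if a CUDA runtime call fails. */
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime_api.h>

#define ERRCHK_CUDA(call)                                                     \
    do {                                                                      \
        const cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                     \
                    cudaGetErrorString(err_), __FILE__, __LINE__);            \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/* Possible usage in main():
     int devices_per_node = -1;
     ERRCHK_CUDA(cudaGetDeviceCount(&devices_per_node));
     ERRCHK_CUDA(cudaSetDevice(pid % devices_per_node));
*/

The same check could also replace the assert() on the cudaMallocHost return value in allocDevice(), which silently compiles out under NDEBUG.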