From 88e53dfa215e0178d4760d955d588308d72da9a5 Mon Sep 17 00:00:00 2001
From: jpekkila
Date: Sun, 5 Apr 2020 17:09:57 +0300
Subject: [PATCH] Added a little program for testing the bandwidths of
 different MPI comm styles on n nodes and processes

---
 samples/bwtest/CMakeLists.txt |   9 ++
 samples/bwtest/main.c         | 207 ++++++++++++++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 samples/bwtest/CMakeLists.txt
 create mode 100644 samples/bwtest/main.c

diff --git a/samples/bwtest/CMakeLists.txt b/samples/bwtest/CMakeLists.txt
new file mode 100644
index 0000000..db7066e
--- /dev/null
+++ b/samples/bwtest/CMakeLists.txt
@@ -0,0 +1,9 @@
+cmake_minimum_required(VERSION 3.17) # Required for modern CUDA::cudart linking
+
+find_package(MPI)
+find_package(OpenMP)
+find_package(CUDAToolkit)
+
+add_compile_options(-O3) # Must precede add_executable to apply to bwtest
+add_executable(bwtest main.c)
+target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart)
diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
new file mode 100644
index 0000000..d3e7303
--- /dev/null
+++ b/samples/bwtest/main.c
@@ -0,0 +1,207 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <mpi.h>
+
+#include <cuda_runtime_api.h>
+
+#include "timer_hires.h" // From src/common
+
+//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
+#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
+
+/*
+    Findings:
+    - MUST ALWAYS SET THE DEVICE. Performance is absolutely killed if the device is not set
+      explicitly
+    - Must use cudaMalloc'd memory for intranode communication, otherwise P2P does not trigger
+      with MPI
+    - For internode communication one should use pinned memory (RDMA is staged through pinned
+      buffers and gives full network speed iff the memory is pinned)
+*/
+
+static uint8_t*
+allocHost(const size_t bytes)
+{
+    uint8_t* arr = malloc(bytes);
+    assert(arr);
+    return arr;
+}
+
+static void
+freeHost(uint8_t* arr)
+{
+    free(arr);
+}
+
+static uint8_t*
+allocDevice(const size_t bytes)
+{
+    uint8_t* arr;
+    // Standard (20 GiB/s internode, 85 GiB/s intranode)
+    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
+    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
+    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
+    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
+    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
+    assert(retval == cudaSuccess);
+    return arr;
+}
+
+static void
+freeDevice(uint8_t* arr)
+{
+    // NOTE: must match the allocation in allocDevice: cudaFreeHost for cudaMallocHost,
+    // cudaFree for cudaMalloc/cudaMallocManaged
+    cudaFreeHost(arr);
+}
+
+static void
+sendrecv_blocking(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    int nfront = (pid + 1) % nprocs;
+    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;
+
+    if (!pid) {
+        MPI_Status status;
+        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
+        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
+    }
+    else {
+        MPI_Status status;
+        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
+        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
+    }
+}
+
+static void
+sendrecv_nonblocking(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    int nfront = (pid + 1) % nprocs;
+    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;
+
+    MPI_Request recv_request, send_request;
+    MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &recv_request);
+    MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD, &send_request);
+
+    MPI_Status status;
+    MPI_Wait(&recv_request, &status);
+    MPI_Wait(&send_request, &status);
+}
+
+static void
+sendrecv_twoway(uint8_t* src, uint8_t* dst)
+{
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    int nfront = (pid + 1) % nprocs;
+    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;
+
+    MPI_Status status;
+    MPI_Sendrecv(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, dst, BLOCK_SIZE, MPI_BYTE, nback, nback,
+                 MPI_COMM_WORLD, &status);
+}
+
+#define PRINT                                                                                      \
+    if (!pid)                                                                                      \
+    printf
+
+static void
+measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*), uint8_t* src,
+          uint8_t* dst)
+{
+    const size_t num_samples = 10;
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    PRINT("%s\n", msg);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    PRINT("\tWarming up... ");
+    for (size_t i = 0; i < num_samples / 10; ++i)
+        sendrecv(src, dst);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    PRINT("Done\n");
+
+    PRINT("\tBandwidth... ");
+    fflush(stdout);
+
+    Timer t;
+    MPI_Barrier(MPI_COMM_WORLD);
+    timer_reset(&t);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (size_t i = 0; i < num_samples; ++i)
+        sendrecv(src, dst);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+    const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
+    PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
+    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000);
+    MPI_Barrier(MPI_COMM_WORLD);
+}
+
+int
+main(void)
+{
+    // Disable stdout buffering
+    setbuf(stdout, NULL);
+
+    MPI_Init(NULL, NULL);
+
+    int pid, nprocs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    assert(nprocs >= 2); // Require at least one neighbor
+
+    int devices_per_node = -1;
+    cudaGetDeviceCount(&devices_per_node);
+    const int device_id = pid % devices_per_node;
+    cudaSetDevice(device_id);
+
+    printf("Process %d of %d running.\n", pid, nprocs);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    PRINT("Block size: %d MiB\n", BLOCK_SIZE / (1024 * 1024));
+
+    {
+        uint8_t* src = allocHost(BLOCK_SIZE);
+        uint8_t* dst = allocHost(BLOCK_SIZE);
+
+        measurebw("Unidirectional bandwidth, blocking (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
+        measurebw("Bidirectional bandwidth, async (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
+        measurebw("Bidirectional bandwidth, twoway (Host)", //
+                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
+
+        freeHost(src);
+        freeHost(dst);
+    }
+
+    {
+        uint8_t* src = allocDevice(BLOCK_SIZE);
+        uint8_t* dst = allocDevice(BLOCK_SIZE);
+
+        measurebw("Unidirectional bandwidth, blocking (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
+        measurebw("Bidirectional bandwidth, async (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
+        measurebw("Bidirectional bandwidth, twoway (Device)", //
+                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
+
+        freeDevice(src);
+        freeDevice(dst);
+    }
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
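
Note on the reported figures (not part of the patch): with the BLOCK_SIZE defined in main.c, one block is 256 * 256 * 3 * 8 * 8 bytes = 12,582,912 bytes = 12 MiB. Every sendrecv variant both sends and receives one block per rank, so measurebw() is called with bytes = 2 * BLOCK_SIZE = 24 MiB, and the printed bandwidth is num_samples * bytes / time_elapsed / 2^30, i.e. 240 MiB of traffic per rank over the 10-sample loop divided by its wall-clock time, in GiB/s.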
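
In main.c, cudaGetDeviceCount() and cudaSetDevice() are not error-checked, so a failed device query would leave devices_per_node at -1 and make pid % devices_per_node meaningless. A minimal sketch of a check that could be dropped in is below; the macro name ERRCHK_CUDA is illustrative and not part of the patch.

/* Sketch only (assumes the same headers as main.c above): abort with a readable
   message if a CUDA runtime call fails. */
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime_api.h>

#define ERRCHK_CUDA(call)                                                     \
    do {                                                                      \
        const cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                     \
                    cudaGetErrorString(err_), __FILE__, __LINE__);            \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/* Possible usage in main():
     int devices_per_node = -1;
     ERRCHK_CUDA(cudaGetDeviceCount(&devices_per_node));
     ERRCHK_CUDA(cudaSetDevice(pid % devices_per_node));
*/

The same check could also replace the assert() on the cudaMallocHost return value in allocDevice(), which silently compiles out under NDEBUG.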