Merged mpi-to-master-merge-candidate-2020-06-01 here

2020-06-24 16:08:14 +03:00
parent 0030db01f3 0d1c5b3911
commit ff1a601f85
12 changed files with 373 additions and 239 deletions
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -39,8 +39,47 @@ typedef enum {
    NUM_TESTS,
 } TestType;

+#include <stdint.h>
+
+typedef struct {
+    uint64_t x, y, z; // Three unsigned 64-bit components
+} uint3_64;
+
+static uint3_64
+operator+(const uint3_64& a, const uint3_64& b) // Component-wise sum of a and b
+{
+    return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
+}
+
+static uint3_64
+morton3D(const uint64_t pid) // De-interleaves the bits of pid into three coordinates {i, j, k}
+{
+    uint64_t i, j, k;
+    i = j = k = 0;
+
+    for (int bit = 0; bit <= 21; ++bit) { // 3 * 21 = 63, so the last pass reaches the top bit
+        const uint64_t mask = (uint64_t)1 << 3 * bit; // unsigned 64-bit one, not a signed long
+        k |= ((pid & (mask << 0)) >> 2 * bit) >> 0; // pid bit 3*bit     -> k bit `bit`
+        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1; // pid bit 3*bit + 1 -> j bit `bit`
+        i |= ((pid & (mask << 2)) >> 2 * bit) >> 2; // pid bit 3*bit + 2 -> i bit `bit`
+    }
+
+    return (uint3_64){i, j, k};
+}
+
+static uint3_64
+decompose(const uint64_t target)
+{
+    // Splits `target` into three factors: de-interleave the bits of (target - 1) with
+    // morton3D and add one per axis. The product check below holds e.g. for powers of two.
+    uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
+
+    ERRCHK_ALWAYS(p.x * p.y * p.z == target);
+    return p;
+}
+
 int
-main(void)
+main(int argc, char** argv)
 {
    MPI_Init(NULL, NULL);
    int nprocs, pid;
@@ -51,9 +90,30 @@ main(void)
    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info);

+    if (argc > 1) {
+        if (argc == 4) {
+            const int nx           = atoi(argv[1]);
+            const int ny           = atoi(argv[2]);
+            const int nz           = atoi(argv[3]);
+            info.int_params[AC_nx] = nx;
+            info.int_params[AC_ny] = ny;
+            info.int_params[AC_nz] = nz;
+            acUpdateBuiltinParams(&info);
+            printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz);
+        }
+        else {
+            fprintf(stderr, "Could not parse arguments. Usage: ./benchmark <nx> <ny> <nz>.\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
    const TestType test = TEST_STRONG_SCALING;
-    if (test == TEST_WEAK_SCALING)
-        info.int_params[AC_nz] *= nprocs;
+    if (test == TEST_WEAK_SCALING) {
+        uint3_64 decomp = decompose(nprocs);
+        info.int_params[AC_nx] *= decomp.x;
+        info.int_params[AC_ny] *= decomp.y;
+        info.int_params[AC_nz] *= decomp.z;
+    }

    /*
    AcMesh model, candidate;
@@ -115,7 +175,7 @@ main(void)
    */

    // Percentiles
-    const size_t num_iters      = 100;
+    const size_t num_iters      = 1000;
    const double nth_percentile = 0.90;
    std::vector<double> results; // ms
    results.reserve(num_iters);
--- a/samples/bwtest/CMakeLists.txt
+++ b/samples/bwtest/CMakeLists.txt
@@ -5,5 +5,5 @@ find_package(OpenMP)
 find_package(CUDAToolkit)

 add_executable(bwtest main.c)
-target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
+target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver)
 target_compile_options(bwtest PRIVATE -O3)
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -6,6 +6,7 @@

 #include <mpi.h>

+#include <cuda.h> // CUDA driver API
 #include <cuda_runtime_api.h>

 #include "timer_hires.h" // From src/common
@@ -13,7 +14,13 @@
 //#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
 #define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)

-#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }}
+#define errchk(x)                                                                                  \
+    {                                                                                              \
+        if (!(x)) {                                                                                \
+            fprintf(stderr, "errchk(%s) failed\n", #x);                                            \
+            assert(0); /* abort without re-evaluating x */                                         \
+        }                                                                                          \
+    }

 /*
  Findings:
@@ -56,6 +63,18 @@ allocDevice(const size_t bytes)
 static uint8_t*
 allocDevicePinned(const size_t bytes)
 {
+#define USE_CUDA_DRIVER_PINNING (1)
+#if USE_CUDA_DRIVER_PINNING
+    uint8_t* arr = allocDevice(bytes);
+
+    unsigned int flag = 1;
+    CUresult retval   = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                            (CUdeviceptr)arr);
+
+    errchk(retval == CUDA_SUCCESS);
+    return arr;
+
+#else
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
@@ -65,8 +84,24 @@ allocDevicePinned(const size_t bytes)
    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    errchk(retval == cudaSuccess);
    return arr;
+#endif
 }

+/*
+static uint8_t*
+allocDevicePinned(const size_t bytes)
+{
+    uint8_t* arr;
+    // Standard (20 GiB/s internode, 85 GiB/s intranode)
+    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
+    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
+    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
+    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
+    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
+    errchk(retval == cudaSuccess);
+    return arr;
+}*/
+
 static void
 freeDevice(uint8_t* arr)
 {
@@ -239,7 +274,6 @@ send_h2d(uint8_t* src, uint8_t* dst)
    cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
 }

-
 static void
 sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
 {
@@ -299,10 +333,10 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
    MPI_Barrier(MPI_COMM_WORLD);
 }

-
 static void
-measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
-                                                                                                            uint8_t* hsrc, uint8_t* ddst)
+measurebw2(const char* msg, const size_t bytes,
+           void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
+           uint8_t* hsrc, uint8_t* ddst)
 {
    const size_t num_samples = 100;

@@ -386,8 +420,8 @@ main(void)
        measurebw("Bidirectional bandwidth, twoway (Host)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Host)", //
-                  2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
-        //measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        // measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
        //          2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);

        freeHost(src);
@@ -406,11 +440,12 @@ main(void)
        measurebw("Bidirectional bandwidth, twoway (Device)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
-        //measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        // measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
        //          2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
+                  dst);

        freeDevice(src);
        freeDevice(dst);
@@ -428,7 +463,7 @@ main(void)
        measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);

        freeDevice(src);
        freeDevice(dst);
@@ -444,7 +479,8 @@ main(void)
        measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
        measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);

-        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
+        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
+                   ddst);

        freeDevice(dsrc);
        freeDevice(ddst);
@@ -462,7 +498,8 @@ main(void)
        measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
        measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);

-        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
+        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
+                   hsrc, ddst);

        freeDevice(dsrc);
        freeDevice(ddst);
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -29,6 +29,7 @@ main(void)
        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
        fprintf(fp, "#SBATCH -n %d\n", nprocs);
        fprintf(fp, "#SBATCH -N %d\n", nodes);
+        fprintf(fp, "#SBATCH --exclusive\n");

        // Modules
        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
@@ -36,8 +37,14 @@ main(void)

        // Profile and run
        fprintf(fp, "mkdir -p profile_%d\n", nprocs);
-        fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
-                nprocs);
+
+        const int nx = 256; // max size 1792;
+        const int ny = nx;
+        const int nz = nx;
+        fprintf(fp,
+                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
+                //"%d\n",
+                "srun ./benchmark %d %d %d\n", nx, ny, nz);

        fclose(fp);
    }