Merged mpi-to-master-merge-candidate-2020-06-01 here

This commit is contained in:
jpekkila
2020-06-24 16:08:14 +03:00
12 changed files with 373 additions and 239 deletions

View File

@@ -39,8 +39,47 @@ typedef enum {
NUM_TESTS,
} TestType;
#include <stdint.h>
// Triple of unsigned 64-bit integers addressed as x, y and z.
typedef struct {
    uint64_t x;
    uint64_t y;
    uint64_t z;
} uint3_64;
// Component-wise sum of two uint3_64 values (unsigned arithmetic, so each
// component wraps modulo 2^64 on overflow).
static uint3_64
operator+(const uint3_64& a, const uint3_64& b)
{
    uint3_64 sum;
    sum.x = a.x + b.x;
    sum.y = a.y + b.y;
    sum.z = a.z + b.z;
    return sum;
}
// De-interleaves the bits of `pid` into three components.
//
// For every b in [0, 21]:
//   bit 3*b + 0 of pid becomes bit b of the returned z component,
//   bit 3*b + 1 of pid becomes bit b of the returned y component,
//   bit 3*b + 2 of pid becomes bit b of the returned x component.
//
// Returns the triple as {x, y, z}.
static uint3_64
morton3D(const uint64_t pid)
{
    uint64_t i = 0;
    uint64_t j = 0;
    uint64_t k = 0;

    for (int bit = 0; bit <= 21; ++bit) {
        // Build the mask from a 64-bit unsigned one. A `long` literal may be
        // only 32 bits wide and is signed, which makes shifts of up to 63
        // bits undefined or implementation-specific.
        const uint64_t mask = (uint64_t)1 << (3 * bit);

        // At bit == 21 the mask is bit 63; `mask << 1` and `mask << 2` then
        // wrap to zero, so only k receives a contribution from that round.
        k |= ((pid & (mask << 0)) >> (2 * bit)) >> 0;
        j |= ((pid & (mask << 1)) >> (2 * bit)) >> 1;
        i |= ((pid & (mask << 2)) >> (2 * bit)) >> 2;
    }

    uint3_64 out;
    out.x = i;
    out.y = j;
    out.z = k;
    return out;
}
// Splits `target` into three factors (x, y, z) by de-interleaving the bits of
// (target - 1) with morton3D and adding one to every component.
//
// The result is passed through ERRCHK_ALWAYS, which checks that the three
// factors multiply back to `target`. For example, target == 3 yields
// {1, 2, 1}, whose product is 2, so that check does not hold for every input.
// NOTE(review): what ERRCHK_ALWAYS does on failure is defined elsewhere.
static uint3_64
decompose(const uint64_t target)
{
    const uint3_64 ones = {1, 1, 1};

    uint3_64 p = morton3D(target - 1) + ones;
    ERRCHK_ALWAYS(p.x * p.y * p.z == target);

    return p;
}
int
main(void)
main(int argc, char** argv)
{
MPI_Init(NULL, NULL);
int nprocs, pid;
@@ -51,9 +90,30 @@ main(void)
AcMeshInfo info;
acLoadConfig(AC_DEFAULT_CONFIG, &info);
if (argc > 1) {
if (argc == 4) {
const int nx = atoi(argv[1]);
const int ny = atoi(argv[2]);
const int nz = atoi(argv[3]);
info.int_params[AC_nx] = nx;
info.int_params[AC_ny] = ny;
info.int_params[AC_nz] = nz;
acUpdateBuiltinParams(&info);
printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz);
}
else {
fprintf(stderr, "Could not parse arguments. Usage: ./benchmark <nx> <ny> <nz>.\n");
exit(EXIT_FAILURE);
}
}
const TestType test = TEST_STRONG_SCALING;
if (test == TEST_WEAK_SCALING)
info.int_params[AC_nz] *= nprocs;
if (test == TEST_WEAK_SCALING) {
uint3_64 decomp = decompose(nprocs);
info.int_params[AC_nx] *= decomp.x;
info.int_params[AC_ny] *= decomp.y;
info.int_params[AC_nz] *= decomp.z;
}
/*
AcMesh model, candidate;
@@ -115,7 +175,7 @@ main(void)
*/
// Percentiles
const size_t num_iters = 100;
const size_t num_iters = 1000;
const double nth_percentile = 0.90;
std::vector<double> results; // ms
results.reserve(num_iters);

View File

@@ -5,5 +5,5 @@ find_package(OpenMP)
find_package(CUDAToolkit)
add_executable(bwtest main.c)
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver)
target_compile_options(bwtest PRIVATE -O3)

View File

@@ -6,6 +6,7 @@
#include <mpi.h>
#include <cuda.h> // CUDA driver API
#include <cuda_runtime_api.h>
#include "timer_hires.h" // From src/common
@@ -13,7 +14,13 @@
//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }}
// Runtime check: when `x` evaluates to false, prints the failed expression to
// stderr and then trips assert(x).
//
// The body is wrapped in do { } while (0) so that `errchk(...);` expands to
// exactly one statement and can be used in an unbraced if/else without the
// trailing semicolon detaching the else branch. Call sites must end with a
// semicolon.
//
// NOTE: on failure `x` is evaluated a second time by assert, and assert
// expands to nothing when NDEBUG is defined, in which case only the message
// is printed and execution continues.
#define errchk(x)                                                              \
    do {                                                                       \
        if (!(x)) {                                                            \
            fprintf(stderr, "errchk(%s) failed", #x);                          \
            assert(x);                                                         \
        }                                                                      \
    } while (0)
/*
Findings:
@@ -56,6 +63,18 @@ allocDevice(const size_t bytes)
static uint8_t*
allocDevicePinned(const size_t bytes)
{
#define USE_CUDA_DRIVER_PINNING (1)
#if USE_CUDA_DRIVER_PINNING
uint8_t* arr = allocDevice(bytes);
unsigned int flag = 1;
CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
(CUdeviceptr)arr);
errchk(retval == CUDA_SUCCESS);
return arr;
#else
uint8_t* arr;
// Standard (20 GiB/s internode, 85 GiB/s intranode)
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
@@ -65,8 +84,24 @@ allocDevicePinned(const size_t bytes)
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
errchk(retval == cudaSuccess);
return arr;
#endif
}
/*
static uint8_t*
allocDevicePinned(const size_t bytes)
{
uint8_t* arr;
// Standard (20 GiB/s internode, 85 GiB/s intranode)
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
// Unified mem (5 GiB/s internode, 6 GiB/s intranode)
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
errchk(retval == cudaSuccess);
return arr;
}*/
static void
freeDevice(uint8_t* arr)
{
@@ -239,7 +274,6 @@ send_h2d(uint8_t* src, uint8_t* dst)
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
}
static void
sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
{
@@ -299,10 +333,10 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
MPI_Barrier(MPI_COMM_WORLD);
}
static void
measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
uint8_t* hsrc, uint8_t* ddst)
measurebw2(const char* msg, const size_t bytes,
void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
uint8_t* hsrc, uint8_t* ddst)
{
const size_t num_samples = 100;
@@ -386,8 +420,8 @@ main(void)
measurebw("Bidirectional bandwidth, twoway (Host)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Host)", //
2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
//measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
// measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
// 2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
freeHost(src);
@@ -406,11 +440,12 @@ main(void)
measurebw("Bidirectional bandwidth, twoway (Device)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device)", //
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
//measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
// measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
// 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
dst);
freeDevice(src);
freeDevice(dst);
@@ -428,7 +463,7 @@ main(void)
measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
freeDevice(src);
freeDevice(dst);
@@ -444,7 +479,8 @@ main(void)
measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
ddst);
freeDevice(dsrc);
freeDevice(ddst);
@@ -462,7 +498,8 @@ main(void)
measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
hsrc, ddst);
freeDevice(dsrc);
freeDevice(ddst);

View File

@@ -29,6 +29,7 @@ main(void)
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
fprintf(fp, "#SBATCH -n %d\n", nprocs);
fprintf(fp, "#SBATCH -N %d\n", nodes);
fprintf(fp, "#SBATCH --exclusive\n");
// Modules
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
@@ -36,8 +37,14 @@ main(void)
// Profile and run
fprintf(fp, "mkdir -p profile_%d\n", nprocs);
fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
nprocs);
const int nx = 256; // max size 1792;
const int ny = nx;
const int nz = nx;
fprintf(fp,
//"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
//"%d\n",
"srun ./benchmark %d %d %d\n", nx, ny, nz);
fclose(fp);
}