Merged mpi-to-master-merge-candidate-2020-06-01 here
This commit is contained in:
@@ -39,8 +39,47 @@ typedef enum {
|
||||
NUM_TESTS,
|
||||
} TestType;
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
typedef struct {
|
||||
uint64_t x, y, z;
|
||||
} uint3_64;
|
||||
|
||||
static uint3_64
|
||||
operator+(const uint3_64& a, const uint3_64& b)
|
||||
{
|
||||
return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
|
||||
}
|
||||
|
||||
static uint3_64
|
||||
morton3D(const uint64_t pid)
|
||||
{
|
||||
uint64_t i, j, k;
|
||||
i = j = k = 0;
|
||||
|
||||
for (int bit = 0; bit <= 21; ++bit) {
|
||||
const uint64_t mask = 0x1l << 3 * bit;
|
||||
k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
|
||||
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
|
||||
i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
|
||||
}
|
||||
|
||||
return (uint3_64){i, j, k};
|
||||
}
|
||||
|
||||
static uint3_64
|
||||
decompose(const uint64_t target)
|
||||
{
|
||||
// This is just so beautifully elegant. Complex and efficient decomposition
|
||||
// in just one line of code.
|
||||
uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
|
||||
|
||||
ERRCHK_ALWAYS(p.x * p.y * p.z == target);
|
||||
return p;
|
||||
}
|
||||
|
||||
int
|
||||
main(void)
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
MPI_Init(NULL, NULL);
|
||||
int nprocs, pid;
|
||||
@@ -51,9 +90,30 @@ main(void)
|
||||
AcMeshInfo info;
|
||||
acLoadConfig(AC_DEFAULT_CONFIG, &info);
|
||||
|
||||
if (argc > 1) {
|
||||
if (argc == 4) {
|
||||
const int nx = atoi(argv[1]);
|
||||
const int ny = atoi(argv[2]);
|
||||
const int nz = atoi(argv[3]);
|
||||
info.int_params[AC_nx] = nx;
|
||||
info.int_params[AC_ny] = ny;
|
||||
info.int_params[AC_nz] = nz;
|
||||
acUpdateBuiltinParams(&info);
|
||||
printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz);
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Could not parse arguments. Usage: ./benchmark <nx> <ny> <nz>.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
const TestType test = TEST_STRONG_SCALING;
|
||||
if (test == TEST_WEAK_SCALING)
|
||||
info.int_params[AC_nz] *= nprocs;
|
||||
if (test == TEST_WEAK_SCALING) {
|
||||
uint3_64 decomp = decompose(nprocs);
|
||||
info.int_params[AC_nx] *= decomp.x;
|
||||
info.int_params[AC_ny] *= decomp.y;
|
||||
info.int_params[AC_nz] *= decomp.z;
|
||||
}
|
||||
|
||||
/*
|
||||
AcMesh model, candidate;
|
||||
@@ -115,7 +175,7 @@ main(void)
|
||||
*/
|
||||
|
||||
// Percentiles
|
||||
const size_t num_iters = 100;
|
||||
const size_t num_iters = 1000;
|
||||
const double nth_percentile = 0.90;
|
||||
std::vector<double> results; // ms
|
||||
results.reserve(num_iters);
|
||||
|
@@ -5,5 +5,5 @@ find_package(OpenMP)
|
||||
find_package(CUDAToolkit)
|
||||
|
||||
add_executable(bwtest main.c)
|
||||
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
|
||||
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver)
|
||||
target_compile_options(bwtest PRIVATE -O3)
|
||||
|
@@ -6,6 +6,7 @@
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include <cuda.h> // CUDA driver API
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
#include "timer_hires.h" // From src/common
|
||||
@@ -13,7 +14,13 @@
|
||||
//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
|
||||
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
|
||||
|
||||
#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }}
|
||||
#define errchk(x) \
|
||||
{ \
|
||||
if (!(x)) { \
|
||||
fprintf(stderr, "errchk(%s) failed", #x); \
|
||||
assert(x); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
|
||||
Findings:
|
||||
@@ -56,6 +63,18 @@ allocDevice(const size_t bytes)
|
||||
static uint8_t*
|
||||
allocDevicePinned(const size_t bytes)
|
||||
{
|
||||
#define USE_CUDA_DRIVER_PINNING (1)
|
||||
#if USE_CUDA_DRIVER_PINNING
|
||||
uint8_t* arr = allocDevice(bytes);
|
||||
|
||||
unsigned int flag = 1;
|
||||
CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
|
||||
(CUdeviceptr)arr);
|
||||
|
||||
errchk(retval == CUDA_SUCCESS);
|
||||
return arr;
|
||||
|
||||
#else
|
||||
uint8_t* arr;
|
||||
// Standard (20 GiB/s internode, 85 GiB/s intranode)
|
||||
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
|
||||
@@ -65,8 +84,24 @@ allocDevicePinned(const size_t bytes)
|
||||
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
|
||||
errchk(retval == cudaSuccess);
|
||||
return arr;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
static uint8_t*
|
||||
allocDevicePinned(const size_t bytes)
|
||||
{
|
||||
uint8_t* arr;
|
||||
// Standard (20 GiB/s internode, 85 GiB/s intranode)
|
||||
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
|
||||
// Unified mem (5 GiB/s internode, 6 GiB/s intranode)
|
||||
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
|
||||
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
|
||||
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
|
||||
errchk(retval == cudaSuccess);
|
||||
return arr;
|
||||
}*/
|
||||
|
||||
static void
|
||||
freeDevice(uint8_t* arr)
|
||||
{
|
||||
@@ -239,7 +274,6 @@ send_h2d(uint8_t* src, uint8_t* dst)
|
||||
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
|
||||
{
|
||||
@@ -299,10 +333,10 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
|
||||
uint8_t* hsrc, uint8_t* ddst)
|
||||
measurebw2(const char* msg, const size_t bytes,
|
||||
void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
|
||||
uint8_t* hsrc, uint8_t* ddst)
|
||||
{
|
||||
const size_t num_samples = 100;
|
||||
|
||||
@@ -386,8 +420,8 @@ main(void)
|
||||
measurebw("Bidirectional bandwidth, twoway (Host)", //
|
||||
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
|
||||
measurebw("Bidirectional bandwidth, async multiple (Host)", //
|
||||
2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
||||
//measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
|
||||
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
||||
// measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
|
||||
// 2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
|
||||
|
||||
freeHost(src);
|
||||
@@ -406,11 +440,12 @@ main(void)
|
||||
measurebw("Bidirectional bandwidth, twoway (Device)", //
|
||||
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
|
||||
measurebw("Bidirectional bandwidth, async multiple (Device)", //
|
||||
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
||||
//measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
|
||||
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
||||
// measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
|
||||
// 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
|
||||
measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
|
||||
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
|
||||
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
|
||||
dst);
|
||||
|
||||
freeDevice(src);
|
||||
freeDevice(dst);
|
||||
@@ -428,7 +463,7 @@ main(void)
|
||||
measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
|
||||
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
|
||||
measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
|
||||
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
||||
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
||||
|
||||
freeDevice(src);
|
||||
freeDevice(dst);
|
||||
@@ -444,7 +479,8 @@ main(void)
|
||||
measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
|
||||
measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
|
||||
|
||||
measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
|
||||
measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
|
||||
ddst);
|
||||
|
||||
freeDevice(dsrc);
|
||||
freeDevice(ddst);
|
||||
@@ -462,7 +498,8 @@ main(void)
|
||||
measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
|
||||
measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
|
||||
|
||||
measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
|
||||
measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
|
||||
hsrc, ddst);
|
||||
|
||||
freeDevice(dsrc);
|
||||
freeDevice(ddst);
|
||||
|
@@ -29,6 +29,7 @@ main(void)
|
||||
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
|
||||
fprintf(fp, "#SBATCH -n %d\n", nprocs);
|
||||
fprintf(fp, "#SBATCH -N %d\n", nodes);
|
||||
fprintf(fp, "#SBATCH --exclusive\n");
|
||||
|
||||
// Modules
|
||||
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
|
||||
@@ -36,8 +37,14 @@ main(void)
|
||||
|
||||
// Profile and run
|
||||
fprintf(fp, "mkdir -p profile_%d\n", nprocs);
|
||||
fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
|
||||
nprocs);
|
||||
|
||||
const int nx = 256; // max size 1792;
|
||||
const int ny = nx;
|
||||
const int nz = nx;
|
||||
fprintf(fp,
|
||||
//"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
|
||||
//"%d\n",
|
||||
"srun ./benchmark %d %d %d\n", nx, ny, nz);
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
|
Reference in New Issue
Block a user