latest results to MPI Pack post
@@ -1,6 +1,5 @@
+++
title = "Improving MPI_Pack performance in CUDA-aware MPI"
date = 2020-10-06T00:00:00
lastmod = 2020-10-06T00:00:00
draft = false
@@ -41,11 +40,11 @@ categories = []
+++

*this post is still a work in progress*

# Abstract

Roll your own MPI packing and unpacking on GPUs for up to a **10,000x** speedup.
I am working on an MPI wrapper library to integrate these changes (and other research results) into existing MPI code with no source changes.
Please contact me if you are interested in trying it out.

# Background

@@ -185,46 +184,68 @@ Each contiguous row is a block, and we provide the library function with an arra

**kernel**

The final comparison is a custom GPU kernel that can directly read from the source memory and compute the proper offsets in the device buffer, consistent with `MPI_Pack`.

A key component of the kernel's performance is the choice of block and grid dimensions.
The block is sized to distribute a chosen number of threads across the X, Y, and Z dimensions, filling X, then Y, then Z until the available threads are exhausted.
The grid dimension is then used to tile the blocks across the entire 3D region to be packed.
Without such sizing, each thread would execute many loop iterations, drastically reducing performance.
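A minimal sketch of that sizing strategy is below. The helper name, the 512-thread budget, and passing the extents as plain integers are illustrative assumptions, not the exact code used here; for the kernel shown later, the extents would be `blockLength / N`, `count0`, and `count1`.

```c++
#include <algorithm>
#include <cuda_runtime.h>

// Sketch: fill the block's X dimension first, then Y, then Z, from a fixed
// thread budget, then tile the grid over the whole 3D region. Extents are
// assumed to be >= 1; the 512-thread budget is an arbitrary illustrative choice.
static void make_launch_dims(unsigned nx, unsigned ny, unsigned nz,
                             dim3 &dimBlock, dim3 &dimGrid,
                             unsigned threads = 512) {
  dimBlock.x = std::min(nx, threads);
  threads /= dimBlock.x;
  dimBlock.y = std::min(ny, threads);
  threads /= dimBlock.y;
  dimBlock.z = std::min(std::min(nz, threads), 64u); // CUDA caps blockDim.z at 64
  // one block per block-sized tile of the region, rounded up
  dimGrid.x = (nx + dimBlock.x - 1) / dimBlock.x;
  dimGrid.y = (ny + dimBlock.y - 1) / dimBlock.y;
  dimGrid.z = (nz + dimBlock.z - 1) / dimBlock.z;
}
```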

Further improvement is gained by specializing the kernel call according to the `blockLength`.
Groups of sequential X-threads collaborate to load the `blockLength`-long chunks.
If the `blockLength` can be evenly divided by a larger word size, that size is used.

```c++
#include <cassert>
#include <cstdint>

template <unsigned N>
__global__ static void pack_bytes_n(
    void *__restrict__ outbuf, int position, const void *__restrict__ inbuf,
    const int incount,
    unsigned blockLength, // block length (B)
    unsigned count0,      // count of inner blocks in a group
    unsigned stride0,     // stride (B) between start of inner blocks in group
    unsigned count1,      // number of block groups
    unsigned stride1      // stride (B) between start of block groups
) {

  assert(blockLength % N == 0); // N should evenly divide block length

  const unsigned int tz = blockDim.z * blockIdx.z + threadIdx.z;
  const unsigned int ty = blockDim.y * blockIdx.y + threadIdx.y;
  const unsigned int tx = blockDim.x * blockIdx.x + threadIdx.x;

  char *__restrict__ op = reinterpret_cast<char *>(outbuf);
  const char *__restrict__ ip = reinterpret_cast<const char *>(inbuf);

  for (int i = 0; i < incount; ++i) {
    char *__restrict__ dst = op + position + i * count1 * count0 * blockLength;
    const char *__restrict__ src = ip + i * stride1 * count1 * stride0 * count0;

    for (unsigned z = tz; z < count1; z += gridDim.z * blockDim.z) {
      for (unsigned y = ty; y < count0; y += gridDim.y * blockDim.y) {
        for (unsigned x = tx; x < blockLength / N;
             x += gridDim.x * blockDim.x) {
          unsigned bo = z * count0 * blockLength + y * blockLength + x * N;
          unsigned bi = z * stride1 + y * stride0 + x * N;
          // printf("%u -> %u\n", bi, bo);

          if (N == 1) {
            dst[bo] = src[bi];
          } else if (N == 2) {
            uint16_t *__restrict__ d = reinterpret_cast<uint16_t *>(dst + bo);
            const uint16_t *__restrict__ s =
                reinterpret_cast<const uint16_t *>(src + bi);
            *d = *s;
          } else if (N == 4) {
            uint32_t *__restrict__ d = reinterpret_cast<uint32_t *>(dst + bo);
            const uint32_t *__restrict__ s =
                reinterpret_cast<const uint32_t *>(src + bi);
            *d = *s;
          } else if (N == 8) {
            uint64_t *__restrict__ d = reinterpret_cast<uint64_t *>(dst + bo);
            const uint64_t *__restrict__ s =
                reinterpret_cast<const uint64_t *>(src + bi);
            *d = *s;
          }
        }
      }
    }
  }
}
```
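
To show how the `blockLength` specialization might be chosen at launch time, here is a sketch of a dispatch helper; the helper name and the launch-configuration plumbing are illustrative assumptions, not code from the library.

```c++
// Sketch: pick the largest word size N that evenly divides blockLength and
// launch the matching instantiation of pack_bytes_n. dimGrid/dimBlock are
// assumed to come from the block/grid sizing described above, and `stream`
// is whatever CUDA stream the pack is ordered on.
static void launch_pack_bytes(dim3 dimGrid, dim3 dimBlock, cudaStream_t stream,
                              void *outbuf, int position, const void *inbuf,
                              int incount, unsigned blockLength,
                              unsigned count0, unsigned stride0,
                              unsigned count1, unsigned stride1) {
  if (blockLength % 8 == 0) {
    pack_bytes_n<8><<<dimGrid, dimBlock, 0, stream>>>(
        outbuf, position, inbuf, incount, blockLength, count0, stride0, count1, stride1);
  } else if (blockLength % 4 == 0) {
    pack_bytes_n<4><<<dimGrid, dimBlock, 0, stream>>>(
        outbuf, position, inbuf, incount, blockLength, count0, stride0, count1, stride1);
  } else if (blockLength % 2 == 0) {
    pack_bytes_n<2><<<dimGrid, dimBlock, 0, stream>>>(
        outbuf, position, inbuf, incount, blockLength, count0, stride0, count1, stride1);
  } else {
    pack_bytes_n<1><<<dimGrid, dimBlock, 0, stream>>>(
        outbuf, position, inbuf, incount, blockLength, count0, stride0, count1, stride1);
  }
}
```
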
@@ -237,7 +258,7 @@ __device__ void grid_pack(void *__restrict__ dst, const cudaPitchedPtr src,

Each operation is executed 5 times, and the trimean is reported.
For MPI_Pack, the measured time begins when `MPI_Pack` is called and ends after a subsequent `cudaDeviceSynchronize`.
This is necessary because a CUDA-aware `MPI_Pack` may be asynchronous with respect to the CPU: the implementation may insert future CUDA operations into the same stream to ensure their ordering.
For the GPU kernel, the measured time begins at the kernel invocation and ends after a subsequent `cudaDeviceSynchronize`.
The GPU time includes any time required to set up the kernel launch.
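
A minimal sketch of that measurement, assuming device buffers, a strided `datatype` set up elsewhere, and the trimean computed over the five sorted samples; this is illustrative, not the exact benchmark harness.

```c++
#include <algorithm>
#include <array>
#include <chrono>
#include <cuda_runtime.h>
#include <mpi.h>

// Sketch: time MPI_Pack plus the cudaDeviceSynchronize that exposes any
// asynchronous GPU work it launched. devSrc, devDst, packedSize, and the
// strided datatype are assumed to be created elsewhere.
double time_mpi_pack(const void *devSrc, int count, MPI_Datatype datatype,
                     void *devDst, int packedSize) {
  std::array<double, 5> samples{};
  for (double &s : samples) {
    int position = 0;
    auto start = std::chrono::steady_clock::now();
    MPI_Pack(devSrc, count, datatype, devDst, packedSize, &position,
             MPI_COMM_WORLD);
    cudaDeviceSynchronize(); // the pack may be asynchronous w.r.t. the CPU
    auto stop = std::chrono::steady_clock::now();
    s = std::chrono::duration<double>(stop - start).count();
  }
  std::sort(samples.begin(), samples.end());
  // trimean of the 5 sorted samples: (Q1 + 2 * median + Q3) / 4
  return (samples[1] + 2.0 * samples[2] + samples[3]) / 4.0;
}
```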

The following table describes the evaluation system.

@@ -279,14 +300,14 @@ And for openMPI-4.0.5:

# Discussion

In most cases, the GPU kernel is **100-10,000x** faster than OpenMPI and **30-3,000x** faster than mvapich.

It matches the underlying MPI_Pack performance when the region to be packed is already contiguous.

The two images below are taken from the Nsight Systems profiling tool, which shows GPU activity and CUDA calls on a timeline.
For mvapich, each contiguous block is transferred using a call to `cudaMemcpyAsync`.
This means that one `cudaMemcpyAsync` call is generated for each `blockLength` chunk.
As we can see, that actual call takes much more time than the corresponding GPU activity, causing the slow performance.

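In effect, the strided profile looks as if the library were doing something like the following; this is a hypothetical illustration of the observed call pattern, not mvapich's actual source, with parameter names borrowed from the pack kernel above.

```c++
#include <cuda_runtime.h>

// Hypothetical illustration of the call pattern visible in the profile:
// one cudaMemcpyAsync per blockLength-sized chunk of the strided region.
// The device-to-device copy kind is an assumption for this sketch.
static void pack_with_memcpys(char *dst, const char *src, unsigned blockLength,
                              unsigned count0, unsigned stride0,
                              unsigned count1, unsigned stride1,
                              cudaStream_t stream) {
  for (unsigned z = 0; z < count1; ++z) {
    for (unsigned y = 0; y < count0; ++y) {
      cudaMemcpyAsync(dst + (z * count0 + y) * blockLength,
                      src + z * stride1 + y * stride0, blockLength,
                      cudaMemcpyDeviceToDevice, stream);
    }
  }
}
```
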
**strided mvapich**
@@ -297,9 +318,7 @@ The OpenMPI situation is much worse - for some reason, the CPU is forced to sync

**strided openmpi**



When the nature of the copy causes the source memory to be contiguous, there is a single `cudaMemcpyAsync` call, which makes as effective use of the GPU bandwidth as the GPU kernel does.

**contiguous mvapich**


@@ -310,7 +329,6 @@ Here, we can also see that OpenMPI's unnecessary synchronizations degrade perfor

The GPU kernel performance is limited by inefficient use of DRAM bandwidth.
For example, when loading a single byte separated by hundreds of bytes, most of each 128B cache line is wasted, so at best roughly 1/128 of the data moved from DRAM is useful.
For the contiguous 1 MiB transfers, where the kernel is slower, performance is limited by each 32-thread warp collaboratively loading 32B.

# Conclusion
