Merged the newest MPI changes

This commit is contained in:
jpekkila
2020-05-30 19:18:46 +03:00
12 changed files with 1065 additions and 351 deletions

View File

@@ -89,6 +89,8 @@ if (BUILD_SAMPLES)
add_subdirectory(samples/cpptest)
add_subdirectory(samples/mpitest)
add_subdirectory(samples/benchmark)
add_subdirectory(samples/bwtest)
add_subdirectory(samples/genbenchmarkscripts)
endif()
if (BUILD_STANDALONE)

View File

@@ -89,19 +89,45 @@ main(void)
}
}*/
/*
// Basic
const size_t num_iters = 100;
// Warmup
for (size_t i = 0; i < 10; ++i)
for (size_t i = 0; i < num_iters / 10; ++i)
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
// Benchmark
Timer t;
const AcReal dt = FLT_EPSILON;
const AcReal dt = FLT_EPSILON;
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
acGridSynchronizeStream(STREAM_ALL);
for (size_t i = 0; i < num_iters; ++i)
acGridIntegrate(STREAM_DEFAULT, dt);
acGridSynchronizeStream(STREAM_ALL);
if (!pid)
timer_diff_print(t);
acGridSynchronizeStream(STREAM_ALL);
*/
// Percentiles
const size_t num_iters = 100;
const double nth_percentile = 0.90;
std::vector<double> results; // ms
results.reserve(num_iters);
// Warmup
for (size_t i = 0; i < num_iters / 10; ++i)
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
// Benchmark
Timer t;
const AcReal dt = FLT_EPSILON;
for (size_t i = 0; i < num_iters; ++i) {
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
@@ -109,9 +135,9 @@ main(void)
acGridIntegrate(STREAM_DEFAULT, dt);
acGridSynchronizeStream(STREAM_ALL);
results.push_back(timer_diff_nsec(t) / 1e6);
acGridSynchronizeStream(STREAM_ALL);
}
// Write benchmark to file
if (!pid) {
std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; });
@@ -137,6 +163,48 @@ main(void)
fclose(fp);
}
/*
const size_t num_iters = 100;
const double nth_percentile = 0.90;
std::vector<double> results; // ms
results.reserve(num_iters);
for (size_t i = 0; i < num_iters; ++i) {
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
acGridSynchronizeStream(STREAM_ALL);
acGridIntegrate(STREAM_DEFAULT, dt);
acGridSynchronizeStream(STREAM_ALL);
results.push_back(timer_diff_nsec(t) / 1e6);
}
// Write benchmark to file
if (!pid) {
std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; });
fprintf(stdout,
"Integration step time %g ms (%gth "
"percentile)--------------------------------------\n",
results[nth_percentile * num_iters], 100 * nth_percentile);
char path[4096] = "";
if (test == TEST_STRONG_SCALING)
strncpy(path, "strong_scaling.csv", sizeof(path));
else if (test == TEST_WEAK_SCALING)
strncpy(path, "weak_scaling.csv", sizeof(path));
else
ERROR("Invalid test type");
FILE* fp = fopen(path, "a");
ERRCHK_ALWAYS(fp);
// Format
// nprocs, measured (ms)
fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
fclose(fp);
}*/
acGridQuit();
MPI_Finalize();
return EXIT_SUCCESS;

View File

@@ -0,0 +1,9 @@
# CMake 3.17+ is required for modern target-based CUDA::cudart linking
# (FindCUDAToolkit module). Fixed typo: "moder" -> "modern".
cmake_minimum_required(VERSION 3.17)

# REQUIRED makes configuration fail fast with a clear message instead of
# deferring to a cryptic "unknown target" error at generation/link time.
find_package(MPI REQUIRED)
find_package(OpenMP REQUIRED)
find_package(CUDAToolkit REQUIRED)

add_executable(bwtest main.c)
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
target_compile_options(bwtest PRIVATE -O3)

486
samples/bwtest/main.c Normal file
View File

@@ -0,0 +1,486 @@
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime_api.h>
#include "timer_hires.h" // From src/common
//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
/*
Findings:
- MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly
- Need to use cudaMalloc for intranode comm for P2P to trigger with MPI
- For internode one should use pinned memory (RDMA is staged through pinned, gives full
network speed iff pinned)
- Both the sending and receiving arrays must be pinned to see performance improvement
in internode comm
*/
/* Allocates `bytes` of ordinary (pageable) host memory with malloc.
 * Aborts via assert if the allocation fails. Pair with freeHost(). */
static uint8_t*
allocHost(const size_t bytes)
{
    uint8_t* const buf = malloc(bytes);
    assert(buf != NULL);
    return buf;
}
/* Releases a buffer obtained from allocHost(). Safe on NULL (free semantics). */
static void
freeHost(uint8_t* arr)
{
    free(arr);
}
// Allocates `bytes` of device global memory via cudaMalloc; aborts on failure.
// The commented-out alternatives preserve measured bandwidths for managed and
// pinned allocations (see the "Findings" note at the top of this file).
static uint8_t*
allocDevice(const size_t bytes)
{
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
    // const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    // NOTE(review): assert is compiled out under NDEBUG, leaving retval unchecked
    assert(retval == cudaSuccess);
    return arr;
}
// Allocates `bytes` of pinned (page-locked) host memory via cudaMallocHost;
// aborts on failure. Pinned memory is what enables full-speed RDMA for
// internode MPI transfers (see "Findings" at the top of this file).
static uint8_t*
allocDevicePinned(const size_t bytes)
{
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    // NOTE(review): assert is compiled out under NDEBUG, leaving retval unchecked
    assert(retval == cudaSuccess);
    return arr;
}
// Releases memory obtained from allocDevice().
// NOTE(review): buffers from allocDevicePinned() are also released with this
// (cudaFree on cudaMallocHost memory) — confirm this is intended; the
// matching call would be cudaFreeHost.
static void
freeDevice(uint8_t* arr)
{
    cudaFree(arr);
}
// Blocking ring exchange: every rank sends BLOCK_SIZE bytes to the next rank
// (pid+1 mod nprocs) and receives from the previous one. Rank 0 sends first
// while every other rank receives first, which breaks the cyclic dependency
// and avoids deadlock with blocking MPI_Send/MPI_Recv.
// Message tags carry the sender's rank; receives match on the sender (nback).
static void
sendrecv_blocking(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    int nfront = (pid + 1) % nprocs;
    int nback = (((pid - 1) % nprocs) + nprocs) % nprocs; // double mod wraps negatives

    if (!pid) {
        MPI_Status status;
        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
    }
    else {
        MPI_Status status;
        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
    }
}
static void
sendrecv_nonblocking(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;
MPI_Request recv_request, send_request;
MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &recv_request);
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD, &send_request);
MPI_Status status;
MPI_Wait(&recv_request, &status);
MPI_Wait(&send_request, &status);
}
static void
sendrecv_twoway(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;
MPI_Status status;
MPI_Sendrecv(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, dst, BLOCK_SIZE, MPI_BYTE, nback, nback,
MPI_COMM_WORLD, &status);
}
// Nonblocking exchange with every other rank: for each offset i in
// 1..nprocs-1, send BLOCK_SIZE bytes to pid+i and receive from pid-i, then
// wait for all requests. Tags carry the receiver's rank on both sides.
static void
sendrecv_nonblocking_multiple(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    MPI_Request recv_requests[nprocs], send_requests[nprocs]; // index 0 unused (no self-send)
    for (int i = 1; i < nprocs; ++i) {
        int nfront = (pid + i) % nprocs;
        int nback = (((pid - i) % nprocs) + nprocs) % nprocs;

        // NOTE(review): every Irecv targets the same dst buffer; acceptable
        // for a bandwidth probe where payload content is irrelevant
        MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);
        MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
    }
    for (int i = 1; i < nprocs; ++i) {
        MPI_Status status;
        MPI_Wait(&recv_requests[i], &status);
        MPI_Wait(&send_requests[i], &status);
    }
}
// Like sendrecv_nonblocking_multiple, but the receive side uses an
// OpenMP-parallel loop of blocking MPI_Recv calls. Only valid when MPI is
// initialized with MPI_THREAD_MULTIPLE — hence the runtime warning below.
static void
sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // NOTE(review): recv_requests is unused in this variant (receives are blocking)
    MPI_Request recv_requests[nprocs], send_requests[nprocs];
    for (int i = 1; i < nprocs; ++i) {
        int nfront = (pid + i) % nprocs;
        MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
    }

    // One-shot warning; `static` keeps it from repeating across calls
    static bool error_shown = false;
    if (!pid && !error_shown) {
        fprintf(stderr, "\tWARNING: make sure you init MPI_Init_thread for OpenMP support (no "
                        "supported on puhti atm "
                        "2020-04-05\n");
        error_shown = true;
    }

#pragma omp parallel for
    for (int i = 1; i < nprocs; ++i) {
        int nback = (((pid - i) % nprocs) + nprocs) % nprocs;
        MPI_Status status;
        // NOTE(review): all threads receive into the same dst buffer; fine
        // for a bandwidth probe where payload content is irrelevant
        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &status);
    }

    for (int i = 1; i < nprocs; ++i) {
        MPI_Status status;
        MPI_Wait(&send_requests[i], &status);
    }
}
// Variant of sendrecv_nonblocking_multiple that routes internode traffic
// through pinned (page-locked) staging buffers at runtime: peers on the same
// node communicate via the device buffers directly, while off-node peers use
// the pinned buffers for full network bandwidth (see "Findings" above).
// Assumes one process per GPU and contiguous ranks per node.
static void
sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // Lazily allocated pinned staging buffers, reused across calls
    static uint8_t* src_pinned = NULL;
    static uint8_t* dst_pinned = NULL;
    if (!src_pinned)
        src_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed
    if (!dst_pinned)
        dst_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed

    int devices_per_node = -1;
    cudaGetDeviceCount(&devices_per_node);
    const int node_id = pid / devices_per_node; // NOTE(review): unused below

    MPI_Request recv_requests[nprocs], send_requests[nprocs]; // index 0 unused (no self-send)
    for (int i = 1; i < nprocs; ++i) {
        int nfront = (pid + i) % nprocs;
        int nback = (((pid - i) % nprocs) + nprocs) % nprocs;

        if (nback / devices_per_node != pid / devices_per_node) // Not on the same node
            MPI_Irecv(dst_pinned, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD,
                      &recv_requests[i]);
        else
            MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);

        if (nfront / devices_per_node != pid / devices_per_node) // Not on the same node
            MPI_Isend(src_pinned, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD,
                      &send_requests[i]);
        else
            MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
    }
    for (int i = 1; i < nprocs; ++i) {
        MPI_Status status;
        MPI_Wait(&recv_requests[i], &status);
        MPI_Wait(&send_requests[i], &status);
    }
}
// Copies BLOCK_SIZE bytes device -> host with a synchronous cudaMemcpy.
static void
send_d2h(const uint8_t* src, uint8_t* dst)
{
    cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost);
}
// Copies BLOCK_SIZE bytes host -> device with a synchronous cudaMemcpy.
static void
send_h2d(const uint8_t* src, uint8_t* dst)
{
    cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
}
/*
 * Concurrent bidirectional transfer: the device->host and host->device
 * copies (BLOCK_SIZE bytes each) are issued on two separate streams so they
 * can overlap, then both streams are drained and destroyed.
 */
static void
sendrecv_d2h2d(const uint8_t* dsrc, uint8_t* hdst, const uint8_t* hsrc, uint8_t* ddst)
{
    cudaStream_t down, up;
    cudaStreamCreate(&down);
    cudaStreamCreate(&up);

    cudaMemcpyAsync(hdst, dsrc, BLOCK_SIZE, cudaMemcpyDeviceToHost, down);
    cudaMemcpyAsync(ddst, hsrc, BLOCK_SIZE, cudaMemcpyHostToDevice, up);

    cudaStreamSynchronize(down);
    cudaStreamSynchronize(up);

    cudaStreamDestroy(down);
    cudaStreamDestroy(up);
}
#define PRINT \
if (!pid) \
printf
// Measures the effective bandwidth of `sendrecv` over num_samples timed
// invocations after a short warmup. `bytes` is the payload moved per
// invocation (callers account for direction/peer multiplicity). Barriers
// bracket the timed region so all ranks start and stop together; output is
// printed from rank 0 only via the PRINT macro.
static void
measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*), uint8_t* src,
          uint8_t* dst)
{
    const size_t num_samples = 100;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    PRINT("%s\n", msg);
    MPI_Barrier(MPI_COMM_WORLD);

    PRINT("\tWarming up... ");
    for (size_t i = 0; i < num_samples / 10; ++i) // warmup: 1/10th of the samples
        sendrecv(src, dst);
    MPI_Barrier(MPI_COMM_WORLD);
    PRINT("Done\n");

    PRINT("\tBandwidth... ");
    fflush(stdout);

    Timer t;
    MPI_Barrier(MPI_COMM_WORLD);
    timer_reset(&t);
    MPI_Barrier(MPI_COMM_WORLD);
    for (size_t i = 0; i < num_samples; ++i)
        sendrecv(src, dst);
    MPI_Barrier(MPI_COMM_WORLD);
    const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds

    PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
    MPI_Barrier(MPI_COMM_WORLD);
}
// Same measurement protocol as measurebw, but for four-buffer callbacks
// (simultaneous D2H and H2D, see sendrecv_d2h2d). NOTE(review): duplicates
// measurebw except for the callback signature — a candidate for unification.
static void
measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(const uint8_t*, uint8_t*, const uint8_t*, uint8_t*), const uint8_t* dsrc, uint8_t* hdst,
           const uint8_t* hsrc, uint8_t* ddst)
{
    const size_t num_samples = 100;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    PRINT("%s\n", msg);
    MPI_Barrier(MPI_COMM_WORLD);

    PRINT("\tWarming up... ");
    for (size_t i = 0; i < num_samples / 10; ++i) // warmup: 1/10th of the samples
        sendrecv(dsrc, hdst, hsrc, ddst);
    MPI_Barrier(MPI_COMM_WORLD);
    PRINT("Done\n");

    PRINT("\tBandwidth... ");
    fflush(stdout);

    Timer t;
    MPI_Barrier(MPI_COMM_WORLD);
    timer_reset(&t);
    MPI_Barrier(MPI_COMM_WORLD);
    for (size_t i = 0; i < num_samples; ++i)
        sendrecv(dsrc, hdst, hsrc, ddst);
    MPI_Barrier(MPI_COMM_WORLD);
    const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds

    PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
    MPI_Barrier(MPI_COMM_WORLD);
}
// Entry point: runs the full bandwidth benchmark suite over host, device,
// and pinned buffers. Requires >= 2 MPI ranks; binds each rank to a GPU by
// pid modulo the per-node device count.
int
main(void)
{
    MPI_Init(NULL, NULL);
    // int provided;
    // MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
    // assert(provided >= MPI_THREAD_MULTIPLE);

    // Disable stdout buffering
    setbuf(stdout, NULL);

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    assert(nprocs >= 2); // Require at least one neighbor

    MPI_Barrier(MPI_COMM_WORLD);
    // Quick sanity check: interleaved output means OpenMP threads are live
    if (!pid) {
        printf("Do we have threads? The following should not be ordered (unless very lucky)\n");
#pragma omp parallel for
        for (int i = 0; i < 10; ++i)
            printf("%d, ", i);
        printf("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

    // One process per GPU; the device MUST be set explicitly (see "Findings")
    int devices_per_node = -1;
    cudaGetDeviceCount(&devices_per_node);
    const int device_id = pid % devices_per_node;
    cudaSetDevice(device_id);

    printf("Process %d of %d running.\n", pid, nprocs);
    MPI_Barrier(MPI_COMM_WORLD);
    PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));

#if 1
    { // Pageable host memory
        uint8_t* src = allocHost(BLOCK_SIZE);
        uint8_t* dst = allocHost(BLOCK_SIZE);

        measurebw("Unidirectional bandwidth, blocking (Host)", //
                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
        measurebw("Bidirectional bandwidth, async (Host)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
        measurebw("Bidirectional bandwidth, twoway (Host)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Host)", //
                  2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
        //measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
        //          2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);

        freeHost(src);
        freeHost(dst);
    }
    PRINT("\n------------------------\n");
    { // Device global memory
        uint8_t* src = allocDevice(BLOCK_SIZE);
        uint8_t* dst = allocDevice(BLOCK_SIZE);

        measurebw("Unidirectional bandwidth, blocking (Device)", //
                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
        measurebw("Bidirectional bandwidth, async (Device)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
        measurebw("Bidirectional bandwidth, twoway (Device)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device)", //
                  2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
        //measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
        //          2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
                  2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);

        freeDevice(src);
        freeDevice(dst);
    }
    PRINT("\n------------------------\n");
    { // Pinned (page-locked) host memory
        uint8_t* src = allocDevicePinned(BLOCK_SIZE);
        uint8_t* dst = allocDevicePinned(BLOCK_SIZE);

        measurebw("Unidirectional bandwidth, blocking (Device, pinned)", //
                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
        measurebw("Bidirectional bandwidth, async (Device, pinned)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
        measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
                  2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);

        freeDevice(src); // NOTE(review): pinned buffers freed with cudaFree, not cudaFreeHost — confirm
        freeDevice(dst);
    }
    PRINT("\n------------------------\n");
    { // Local D2H / H2D copies (pageable host side)
        uint8_t* hsrc = allocHost(BLOCK_SIZE);
        uint8_t* hdst = allocHost(BLOCK_SIZE);
        uint8_t* dsrc = allocDevice(BLOCK_SIZE);
        uint8_t* ddst = allocDevice(BLOCK_SIZE);

        measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
        measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);

        freeDevice(dsrc);
        freeDevice(ddst);
        freeHost(hsrc);
        freeHost(hdst);
    }
    PRINT("\n------------------------\n");
    { // Local D2H / H2D copies (pinned device-side buffers)
        uint8_t* hsrc = allocHost(BLOCK_SIZE);
        uint8_t* hdst = allocHost(BLOCK_SIZE);
        uint8_t* dsrc = allocDevicePinned(BLOCK_SIZE);
        uint8_t* ddst = allocDevicePinned(BLOCK_SIZE);

        measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
        measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);

        freeDevice(dsrc);
        freeDevice(ddst);
        freeHost(hsrc);
        freeHost(hdst);
    }
    PRINT("\n------------------------\n");
#else
    { // Final run for easy identification with the profiler
        uint8_t* src = allocDevice(BLOCK_SIZE);
        uint8_t* dst = allocDevice(BLOCK_SIZE);

        measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);

        freeDevice(src);
        freeDevice(dst);
    }
#endif

    MPI_Finalize();
    return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,8 @@
# Build the benchmark-script generator and run it after every build so the
# generated SLURM scripts land in the project binary directory.
add_executable(genbenchmarkscripts main.c)
add_custom_command(
    TARGET genbenchmarkscripts POST_BUILD
    COMMAND genbenchmarkscripts
    WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
    COMMENT "Generating benchmark scripts"
)

View File

@@ -0,0 +1,46 @@
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/**
 * Generates SLURM batch scripts benchmark_<nprocs>.sh for nprocs in
 * {1, 2, 4, ..., 128}. Each script requests the right node/GPU counts,
 * loads the build modules, and runs ./benchmark under nvprof with MPI
 * annotations.
 *
 * Returns EXIT_SUCCESS; aborts via assert if a script cannot be opened.
 */
int
main(void)
{
    const int max_nprocs = 128;
    for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
        char filename[4096];
        // snprintf: bounded, unlike the original sprintf
        snprintf(filename, sizeof(filename), "benchmark_%d.sh", nprocs);
        FILE* fp = fopen(filename, "w");
        assert(fp);

        // Boilerplate
        fprintf(fp, "#!/bin/bash\n");
        // BUGFIX: was "#BATCH --job-name=..." — SLURM silently ignores
        // directives that are not spelled exactly "#SBATCH"
        fprintf(fp, "#SBATCH --job-name=astaroth\n");
        fprintf(fp, "#SBATCH --account=project_2000403\n");
        fprintf(fp, "#SBATCH --time=00:14:59\n");
        fprintf(fp, "#SBATCH --mem=32000\n");
        fprintf(fp, "#SBATCH --partition=gpu\n");

        // nprocs, nodes, gpus: one rank per GPU, up to 4 GPUs per node
        const int max_gpus_per_node = 4;
        const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
        const int nodes = (int)ceil((double)nprocs / max_gpus_per_node);
        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
        fprintf(fp, "#SBATCH -n %d\n", nprocs);
        fprintf(fp, "#SBATCH -N %d\n", nodes);

        // Modules
        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
        fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");

        // Profile and run (%%p expands to the per-process PID in nvprof)
        fprintf(fp, "mkdir -p profile_%d\n", nprocs);
        fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
                nprocs);

        fclose(fp);
    }
    return EXIT_SUCCESS;
}

View File

@@ -74,7 +74,8 @@ main(void)
int
main(void)
{
printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with cmake -DMPI_ENABLED=ON .. to enable.\n");
printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with "
"cmake -DMPI_ENABLED=ON .. to enable.\n");
return EXIT_FAILURE;
}
#endif // AC_MPI_ENABLED

View File

@@ -0,0 +1,80 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
Running: mpirun -np <num processes> <executable>
*/
#include "astaroth.h"
#include "astaroth_utils.h"
#include <mpi.h>
// MPI integration test: rank 0 builds a randomized model mesh, the grid API
// distributes it across ranks, integrates one step on the GPUs, gathers the
// result back, and rank 0 verifies it against a host-side model integration.
int
main(void)
{
    MPI_Init(NULL, NULL);
    int nprocs, pid;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);

    // CPU alloc
    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info);
    // Derived parameters: inverse grid spacings and squared sound speed
    info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx];
    info.real_params[AC_inv_dsy] = AcReal(1.0) / info.real_params[AC_dsy];
    info.real_params[AC_inv_dsz] = AcReal(1.0) / info.real_params[AC_dsz];
    info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];

    // Full meshes exist only on rank 0; other ranks hold their subdomain
    AcMesh model, candidate;
    if (pid == 0) {
        acMeshCreate(info, &model);
        acMeshCreate(info, &candidate);
        acMeshRandomize(&model);
        acMeshRandomize(&candidate);
    }

    // GPU alloc & compute
    // NOTE(review): FLT_EPSILON needs <cfloat>/<float.h>, not visible in this
    // chunk — confirm it is included upstream
    Grid grid;
    acGridCreateMPI(info, &grid);

    acGridLoadMeshMPI(grid, STREAM_DEFAULT, model);
    acGridSynchronizeStreamMPI(grid, STREAM_ALL);

    acGridIntegrateMPI(grid, FLT_EPSILON);
    acGridSynchronizeStreamMPI(grid, STREAM_ALL);
    acGridSynchronizeMeshMPI(grid, STREAM_DEFAULT);
    acGridSynchronizeStreamMPI(grid, STREAM_ALL);

    acGridStoreMeshMPI(grid, STREAM_DEFAULT, &candidate);
    acGridSynchronizeStreamMPI(grid, STREAM_ALL);

    acGridDestroyMPI(grid);

    // Verify against a host-side reference integration on rank 0
    if (pid == 0) {
        acModelIntegrateStep(model, FLT_EPSILON);
        acMeshApplyPeriodicBounds(&model);

        acVerifyMesh(model, candidate);

        acMeshDestroy(&model);
        acMeshDestroy(&candidate);
    }

    MPI_Finalize();
    return EXIT_SUCCESS;
}

View File

@@ -105,7 +105,8 @@ operator+(const int3& a, const int3& b)
return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
static HOST_DEVICE_INLINE int3
operator*(const int3& a, const int3& b)
{
return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
}
@@ -144,12 +145,20 @@ operator-=(AcReal3& lhs, const AcReal3& rhs)
lhs.z -= rhs.z;
}
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
static HOST_DEVICE_INLINE int3
operator*(const int& a, const int3& b)
{
return (int3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal& a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal3& b, const AcReal& a)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}

View File

@@ -1,11 +1,14 @@
find_package(CUDAToolkit)
## Astaroth Core
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart)
## Options
if (MPI_ENABLED)
find_package(MPI)
target_link_libraries(astaroth_core MPI::MPI_CXX)
find_package(OpenMP)
target_link_libraries(astaroth_core MPI::MPI_CXX OpenMP::OpenMP_CXX)
endif()
if (MULTIGPU_ENABLED)

View File

@@ -460,64 +460,139 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
}
#if AC_MPI_ENABLED
/**
Quick overview of the MPI implementation:
The halo is partitioned into segments. The first coordinate of a segment is b0.
The array containing multiple b0s is called... "b0s".
Each b0 maps to an index in the computational domain of some neighboring process a0.
We have a0 = mod(b0 - nghost, nn) + nghost.
Intuitively, we
1) Transform b0 into a coordinate system where (0, 0, 0) is the first index in
the comp domain.
2) Wrap the transformed b0 around nn (comp domain)
3) Transform b0 back to a coordinate system where (0, 0, 0) is the first index
in the ghost zone
struct PackedData is used for packing and unpacking. Holds the actual data in
the halo partition
struct CommData holds multiple PackedDatas for sending and receiving halo
partitions
struct Grid contains information about the local GPU device, decomposition, the
total mesh dimensions and CommDatas
Basic steps:
1) Distribute the mesh among ranks
2) Integrate & communicate
- start inner integration and at the same time, pack halo data and send it to neighbors
- once all halo data has been received, unpack and do outer integration
- sync and start again
3) Gather the mesh to rank 0 for postprocessing
*/
#include <mpi.h>
static int
#include <stdint.h>
typedef struct {
uint64_t x, y, z;
} uint3_64;
static uint3_64
operator+(const uint3_64& a, const uint3_64& b)
{
return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
}
static int3
make_int3(const uint3_64 a)
{
return (int3){(int)a.x, (int)a.y, (int)a.z};
}
static uint64_t
mod(const int a, const int b)
{
const int r = a % b;
return r < 0 ? r + b : r;
}
static int
getPid(const int3 pid, const int3 decomposition)
static uint3_64
morton3D(const uint64_t pid)
{
return mod(pid.x, decomposition.x) + //
mod(pid.y, decomposition.y) * decomposition.x + //
mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
uint64_t i, j, k;
i = j = k = 0;
for (int bit = 0; bit <= 21; ++bit) {
const uint64_t mask = 0x1l << 3 * bit;
i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
}
return (uint3_64){i, j, k};
}
static int3
getPid3D(const int pid, const int3 decomposition)
static uint64_t
morton1D(const uint3_64 pid)
{
const int3 pid3d = (int3){
mod(pid, decomposition.x),
mod(pid / decomposition.x, decomposition.y),
(pid / (decomposition.x * decomposition.y)),
uint64_t i = 0;
for (int bit = 0; bit <= 21; ++bit) {
const uint64_t mask = 0x1l << bit;
i |= ((pid.x & mask) << 0) << 2 * bit;
i |= ((pid.y & mask) << 1) << 2 * bit;
i |= ((pid.z & mask) << 2) << 2 * bit;
}
return i;
}
static uint3_64
decompose(const uint64_t target)
{
// This is just so beautifully elegant. Complex and efficient decomposition
// in just one line of code.
uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
ERRCHK_ALWAYS(p.x * p.y * p.z == target);
return p;
}
static uint3_64
wrap(const int3 i, const uint3_64 n)
{
return (uint3_64){
mod(i.x, n.x),
mod(i.y, n.y),
mod(i.z, n.z),
};
return pid3d;
}
static int
getPid(const int3 pid_raw, const uint3_64 decomp)
{
const uint3_64 pid = wrap(pid_raw, decomp);
return (int)morton1D(pid);
}
static int3
decompose(const int target)
getPid3D(const uint64_t pid, const uint3_64 decomp)
{
if (target == 16)
return (int3){4, 2, 2};
if (target == 32)
return (int3){4, 4, 2};
if (target == 128)
return (int3){8, 4, 4};
if (target == 256)
return (int3){8, 8, 4};
const uint3_64 pid3D = morton3D(pid);
ERRCHK_ALWAYS(getPid(make_int3(pid3D), decomp) == (int)pid);
return (int3){(int)pid3D.x, (int)pid3D.y, (int)pid3D.z};
}
int decomposition[] = {1, 1, 1};
/** Assumes that contiguous pids are on the same node and there is one process per GPU. */
static inline bool
onTheSameNode(const uint64_t pid_a, const uint64_t pid_b)
{
int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);
int axis = 0;
while (decomposition[0] * decomposition[1] * decomposition[2] < target) {
++decomposition[axis];
axis = (axis + 1) % 3;
}
const uint64_t node_a = pid_a / devices_per_node;
const uint64_t node_b = pid_b / devices_per_node;
const int found = decomposition[0] * decomposition[1] * decomposition[2];
if (found != target) {
fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n");
fprintf(stderr, "Target nprocs: %d. Next allowed: %d\n", target, found);
ERROR("Invalid nprocs");
return (int3){-1, -1, -1};
}
else {
return (int3){decomposition[0], decomposition[1], decomposition[2]};
}
return node_a == node_b;
}
static PackedData
@@ -530,12 +605,18 @@ acCreatePackedData(const int3 dims)
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
// slower than pinned (38 ms vs. 125 ms)
return data;
}
static AcResult
acDestroyPackedData(PackedData* data)
{
cudaFree(data->data_pinned);
data->dims = (int3){-1, -1, -1};
cudaFree(data->data);
data->data = NULL;
@@ -558,6 +639,16 @@ acCreatePackedDataHost(const int3 dims)
return data;
}
static AcResult
acDestroyPackedDataHost(PackedData* data)
{
data->dims = (int3){-1, -1, -1};
free(data->data);
data->data = NULL;
return AC_SUCCESS;
}
static void
acTransferPackedDataToHost(const Device device, const cudaStream_t stream, const PackedData ddata,
PackedData* hdata)
@@ -579,21 +670,38 @@ acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, con
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice, stream));
}
static AcResult
acDestroyPackedDataHost(PackedData* data)
{
data->dims = (int3){-1, -1, -1};
free(data->data);
data->data = NULL;
return AC_SUCCESS;
}
#endif // MPI_GPUDIRECT_DISABLED
static void
acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
{
cudaSetDevice(device->id);
// TODO sync stream
ddata->pinned = true;
const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDefault, stream));
}
static void
acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
{
if (!ddata->pinned) // Unpin iff the data was pinned previously
return;
cudaSetDevice(device->id);
// TODO sync stream
ddata->pinned = false;
const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream));
}
// TODO: do with packed data
static AcResult
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
acDeviceDistributeMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Distributing mesh...\n");
@@ -669,7 +777,7 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds
// TODO: do with packed data
static AcResult
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
acDeviceGatherMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Gathering mesh...\n");
@@ -803,7 +911,9 @@ acCreateCommData(const Device device, const int3 dims, const size_t count)
data.dsts_host[i] = acCreatePackedDataHost(dims);
#endif
cudaStreamCreate(&data.streams[i]);
int low_prio, high_prio;
cudaDeviceGetStreamPriorityRange(&low_prio, &high_prio);
cudaStreamCreateWithPriority(&data.streams[i], cudaStreamNonBlocking, high_prio);
}
return data;
@@ -849,12 +959,28 @@ acSyncCommData(const CommData data)
cudaStreamSynchronize(data.streams[i]);
}
static int3
mod(const int3 a, const int3 n)
{
return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)};
}
static void
acPackCommData(const Device device, const int3* a0s, CommData* data)
acPackCommData(const Device device, const int3* b0s, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i)
acKernelPackData(data->streams[i], device->vba, a0s[i], data->srcs[i]);
const int3 nn = (int3){
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
};
const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};
for (size_t i = 0; i < data->count; ++i) {
const int3 a0 = mod(b0s[i] - nghost, nn) + nghost;
acKernelPackData(data->streams[i], device->vba, a0, data->srcs[i]);
}
}
static void
@@ -884,10 +1010,31 @@ acTransferCommDataToDevice(const Device device, CommData* data)
}
#endif
static inline void
acPinCommData(const Device device, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i)
acPinPackedData(device, data->streams[i], &data->srcs[i]);
}
static void
acUnpinCommData(const Device device, CommData* data)
{
cudaSetDevice(device->id);
// Clear pin flags from src
for (size_t i = 0; i < data->count; ++i)
data->srcs[i].pinned = false;
// Transfer from pinned to gmem
for (size_t i = 0; i < data->count; ++i)
acUnpinPackedData(device, data->streams[i], &data->dsts[i]);
}
static AcResult
acTransferCommData(const Device device, //
const int3* a0s, // Src idx inside comp. domain
const int3* b0s, // Dst idx inside bound zone
const int3* b0s, // Halo partition coordinates
CommData* data)
{
cudaSetDevice(device->id);
@@ -899,7 +1046,7 @@ acTransferCommData(const Device device, //
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
const int3 decomp = decompose(nprocs);
const uint3_64 decomp = decompose(nprocs);
const int3 nn = (int3){
device->local_config.int_params[AC_nx],
@@ -907,80 +1054,55 @@ acTransferCommData(const Device device, //
device->local_config.int_params[AC_nz],
};
const int3 pid3d = getPid3D(pid, decomp);
const int3 dims = data->dims;
const size_t blockcount = data->count;
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};
for (int k = -1; k <= 1; ++k) {
for (int j = -1; j <= 1; ++j) {
for (int i = -1; i <= 1; ++i) {
if (i == 0 && j == 0 && k == 0)
continue;
for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
const int3 neighbor = (int3){i, j, k};
const int3 b0 = b0s[b0_idx];
const int3 neighbor = (int3){
b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0,
b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0,
b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0,
};
const int npid = getPid(pid3d + neighbor, decomp);
const int3 a0 = a0s[a_idx];
// const int3 a1 = a0 + dims;
const int3 b0 = a0 - neighbor * nn;
// const int3 b1 = a1 - neighbor * nn;
if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
#if MPI_GPUDIRECT_DISABLED
PackedData dst = data->dsts_host[b_idx];
#else
PackedData dst = data->dsts[b_idx];
#endif
const int3 pid3d = getPid3D(pid, decomp);
MPI_Irecv(dst.data, count, datatype, getPid(pid3d - neighbor, decomp),
b_idx, MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
}
}
}
}
PackedData* dst = &data->dsts[b0_idx];
if (onTheSameNode(pid, npid)) {
MPI_Irecv(dst->data, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
dst->pinned = false;
}
else {
MPI_Irecv(dst->data_pinned, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
dst->pinned = true;
}
}
for (int k = -1; k <= 1; ++k) {
for (int j = -1; j <= 1; ++j) {
for (int i = -1; i <= 1; ++i) {
if (i == 0 && j == 0 && k == 0)
continue;
for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
const int3 b0 = b0s[b0_idx];
const int3 neighbor = (int3){
b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0,
b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0,
b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0,
};
const int npid = getPid(pid3d - neighbor, decomp);
for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
const int3 neighbor = (int3){i, j, k};
const int3 a0 = a0s[a_idx];
// const int3 a1 = a0 + dims;
const int3 b0 = a0 - neighbor * nn;
// const int3 b1 = a1 - neighbor * nn;
if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
#if MPI_GPUDIRECT_DISABLED
PackedData src = data->srcs_host[a_idx];
#else
PackedData src = data->srcs[a_idx];
#endif
const int3 pid3d = getPid3D(pid, decomp);
cudaStreamSynchronize(data->streams[a_idx]);
MPI_Isend(src.data, count, datatype, getPid(pid3d + neighbor, decomp),
b_idx, MPI_COMM_WORLD, &data->send_reqs[b_idx]);
}
}
}
}
PackedData* src = &data->srcs[b0_idx];
if (onTheSameNode(pid, npid)) {
cudaStreamSynchronize(data->streams[b0_idx]);
MPI_Isend(src->data, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
}
else {
acPinPackedData(device, data->streams[b0_idx], src);
cudaStreamSynchronize(data->streams[b0_idx]);
MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
}
}
@@ -999,7 +1121,7 @@ acTransferCommDataWait(const CommData data)
typedef struct {
Device device;
AcMesh submesh;
int3 decomposition;
uint3_64 decomposition;
bool initialized;
int3 nn;
@@ -1040,11 +1162,11 @@ acGridInit(const AcMeshInfo info)
printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
// Decompose
AcMeshInfo submesh_info = info;
const int3 decomposition = decompose(nprocs);
const int3 pid3d = getPid3D(pid, decomposition);
AcMeshInfo submesh_info = info;
const uint3_64 decomposition = decompose(nprocs);
const int3 pid3d = getPid3D(pid, decomposition);
printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
printf("Decomposition: %lu, %lu, %lu\n", decomposition.x, decomposition.y, decomposition.z);
printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
@@ -1089,79 +1211,19 @@ acGridInit(const AcMeshInfo info)
device->local_config.int_params[AC_nz],
};
// Corners
const int3 corner_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
(int3){nn.x, nn.y, nn.z},
};
// Edges X
const int3 edgex_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
};
// Edges Y
const int3 edgey_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
};
// Edges Z
const int3 edgez_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
};
// Sides XY
const int3 sidexy_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
};
// Sides XZ
const int3 sidexz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
};
// Sides YZ
const int3 sideyz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
};
const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST};
const int3 edgex_dims = (int3){nn.x, NGHOST, NGHOST};
const int3 edgey_dims = (int3){NGHOST, nn.y, NGHOST};
const int3 edgez_dims = (int3){NGHOST, NGHOST, nn.z};
const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST};
const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z};
const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z};
grid.nn = nn;
grid.corner_data = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s));
grid.edgex_data = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s));
grid.edgey_data = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s));
grid.edgez_data = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s));
grid.sidexy_data = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s));
grid.sidexz_data = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s));
grid.sideyz_data = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s));
// Create CommData
// We have 8 corners, 12 edges, and 6 sides
//
// For simplicity's sake all data blocks inside a single CommData struct
// have the same dimensions.
grid.nn = nn;
grid.corner_data = acCreateCommData(device, (int3){NGHOST, NGHOST, NGHOST}, 8);
grid.edgex_data = acCreateCommData(device, (int3){nn.x, NGHOST, NGHOST}, 4);
grid.edgey_data = acCreateCommData(device, (int3){NGHOST, nn.y, NGHOST}, 4);
grid.edgez_data = acCreateCommData(device, (int3){NGHOST, NGHOST, nn.z}, 4);
grid.sidexy_data = acCreateCommData(device, (int3){nn.x, nn.y, NGHOST}, 2);
grid.sidexz_data = acCreateCommData(device, (int3){nn.x, NGHOST, nn.z}, 2);
grid.sideyz_data = acCreateCommData(device, (int3){NGHOST, nn.y, nn.z}, 2);
acGridSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
@@ -1182,7 +1244,7 @@ acGridQuit(void)
acDestroyCommData(grid.device, &grid.sideyz_data);
grid.initialized = false;
grid.decomposition = (int3){-1, -1, -1};
grid.decomposition = (uint3_64){0, 0, 0};
acMeshDestroy(&grid.submesh);
acDeviceDestroy(grid.device);
@@ -1220,7 +1282,7 @@ AcResult
acGridIntegrate(const Stream stream, const AcReal dt)
{
ERRCHK(grid.initialized);
//acGridSynchronizeStream(stream);
// acGridSynchronizeStream(stream);
const Device device = grid.device;
const int3 nn = grid.nn;
@@ -1231,41 +1293,23 @@ acGridIntegrate(const Stream stream, const AcReal dt)
CommData sidexy_data = grid.sidexy_data;
CommData sidexz_data = grid.sidexz_data;
CommData sideyz_data = grid.sideyz_data;
acDeviceSynchronizeStream(device, stream);
acDeviceSynchronizeStream(device, stream);
// Corners
const int3 corner_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
(int3){nn.x, nn.y, nn.z},
};
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){0, 0, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};
// Edges X
const int3 edgex_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
};
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
@@ -1275,13 +1319,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
};
// Edges Y
const int3 edgey_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
};
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
@@ -1291,13 +1328,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
};
// Edges Z
const int3 edgez_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
};
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
@@ -1307,64 +1337,32 @@ acGridIntegrate(const Stream stream, const AcReal dt)
};
// Sides XY
const int3 sidexy_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
};
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};
// Sides XZ
const int3 sidexz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
};
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};
// Sides YZ
const int3 sideyz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
};
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acPackCommData(device, corner_a0s, &corner_data);
acPackCommData(device, edgex_a0s, &edgex_data);
acPackCommData(device, edgey_a0s, &edgey_data);
acPackCommData(device, edgez_a0s, &edgez_data);
acPackCommData(device, sidexy_a0s, &sidexy_data);
acPackCommData(device, sidexz_a0s, &sidexz_data);
acPackCommData(device, sideyz_a0s, &sideyz_data);
// acPackCommData(device, corner_b0s, &corner_data);
acPackCommData(device, edgex_b0s, &edgex_data);
acPackCommData(device, edgey_b0s, &edgey_data);
acPackCommData(device, edgez_b0s, &edgez_data);
acPackCommData(device, sidexy_b0s, &sidexy_data);
acPackCommData(device, sidexz_b0s, &sidexz_data);
acPackCommData(device, sideyz_b0s, &sideyz_data);
MPI_Barrier(MPI_COMM_WORLD);
#if MPI_GPUDIRECT_DISABLED
acTransferCommDataToHost(device, &corner_data);
acTransferCommDataToHost(device, &edgex_data);
acTransferCommDataToHost(device, &edgey_data);
acTransferCommDataToHost(device, &edgez_data);
acTransferCommDataToHost(device, &sidexy_data);
acTransferCommDataToHost(device, &sidexz_data);
acTransferCommDataToHost(device, &sideyz_data);
#endif
acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
//////////// INNER INTEGRATION //////////////
{
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
@@ -1373,7 +1371,27 @@ acGridIntegrate(const Stream stream, const AcReal dt)
}
////////////////////////////////////////////
acTransferCommDataWait(corner_data);
MPI_Barrier(MPI_COMM_WORLD);
#if MPI_GPUDIRECT_DISABLED
// acTransferCommDataToHost(device, &corner_data);
acTransferCommDataToHost(device, &edgex_data);
acTransferCommDataToHost(device, &edgey_data);
acTransferCommDataToHost(device, &edgez_data);
acTransferCommDataToHost(device, &sidexy_data);
acTransferCommDataToHost(device, &sidexz_data);
acTransferCommDataToHost(device, &sideyz_data);
#endif
// acTransferCommData(device, corner_b0s, &corner_data);
acTransferCommData(device, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_b0s, &sideyz_data);
// acTransferCommDataWait(corner_data);
acTransferCommDataWait(edgex_data);
acTransferCommDataWait(edgey_data);
acTransferCommDataWait(edgez_data);
@@ -1382,7 +1400,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
acTransferCommDataWait(sideyz_data);
#if MPI_GPUDIRECT_DISABLED
acTransferCommDataToDevice(device, &corner_data);
// acTransferCommDataToDevice(device, &corner_data);
acTransferCommDataToDevice(device, &edgex_data);
acTransferCommDataToDevice(device, &edgey_data);
acTransferCommDataToDevice(device, &edgez_data);
@@ -1391,7 +1409,15 @@ acGridIntegrate(const Stream stream, const AcReal dt)
acTransferCommDataToDevice(device, &sideyz_data);
#endif
acUnpackCommData(device, corner_b0s, &corner_data);
// acUnpinCommData(device, &corner_data);
acUnpinCommData(device, &edgex_data);
acUnpinCommData(device, &edgey_data);
acUnpinCommData(device, &edgez_data);
acUnpinCommData(device, &sidexy_data);
acUnpinCommData(device, &sidexz_data);
acUnpinCommData(device, &sideyz_data);
// acUnpackCommData(device, corner_b0s, &corner_data);
acUnpackCommData(device, edgex_b0s, &edgex_data);
acUnpackCommData(device, edgey_b0s, &edgey_data);
acUnpackCommData(device, edgez_b0s, &edgez_data);
@@ -1401,7 +1427,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
//////////// OUTER INTEGRATION //////////////
// Wait for unpacking
acSyncCommData(corner_data);
// acSyncCommData(corner_data);
acSyncCommData(edgex_data);
acSyncCommData(edgey_data);
acSyncCommData(edgez_data);
@@ -1463,37 +1489,19 @@ acGridPeriodicBoundconds(const Stream stream)
CommData sideyz_data = grid.sideyz_data;
// Corners
const int3 corner_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
(int3){nn.x, nn.y, nn.z},
};
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){0, 0, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};
// Edges X
const int3 edgex_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
};
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
@@ -1503,13 +1511,6 @@ acGridPeriodicBoundconds(const Stream stream)
};
// Edges Y
const int3 edgey_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
};
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
@@ -1519,13 +1520,6 @@ acGridPeriodicBoundconds(const Stream stream)
};
// Edges Z
const int3 edgez_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
};
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
@@ -1535,42 +1529,32 @@ acGridPeriodicBoundconds(const Stream stream)
};
// Sides XY
const int3 sidexy_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
};
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};
// Sides XZ
const int3 sidexz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
};
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};
// Sides YZ
const int3 sideyz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
};
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};
acPackCommData(device, corner_a0s, &corner_data);
acPackCommData(device, edgex_a0s, &edgex_data);
acPackCommData(device, edgey_a0s, &edgey_data);
acPackCommData(device, edgez_a0s, &edgez_data);
acPackCommData(device, sidexy_a0s, &sidexy_data);
acPackCommData(device, sidexz_a0s, &sidexz_data);
acPackCommData(device, sideyz_a0s, &sideyz_data);
acPackCommData(device, corner_b0s, &corner_data);
acPackCommData(device, edgex_b0s, &edgex_data);
acPackCommData(device, edgey_b0s, &edgey_data);
acPackCommData(device, edgez_b0s, &edgez_data);
acPackCommData(device, sidexy_b0s, &sidexy_data);
acPackCommData(device, sidexz_b0s, &sidexz_data);
acPackCommData(device, sideyz_b0s, &sideyz_data);
MPI_Barrier(MPI_COMM_WORLD);
#if MPI_GPUDIRECT_DISABLED
acTransferCommDataToHost(device, &corner_data);
@@ -1582,13 +1566,13 @@ acGridPeriodicBoundconds(const Stream stream)
acTransferCommDataToHost(device, &sideyz_data);
#endif
acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
acTransferCommData(device, corner_b0s, &corner_data);
acTransferCommData(device, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_b0s, &sideyz_data);
acTransferCommDataWait(corner_data);
acTransferCommDataWait(edgex_data);
@@ -1608,6 +1592,14 @@ acGridPeriodicBoundconds(const Stream stream)
acTransferCommDataToDevice(device, &sideyz_data);
#endif
acUnpinCommData(device, &corner_data);
acUnpinCommData(device, &edgex_data);
acUnpinCommData(device, &edgey_data);
acUnpinCommData(device, &edgez_data);
acUnpinCommData(device, &sidexy_data);
acUnpinCommData(device, &sidexz_data);
acUnpinCommData(device, &sideyz_data);
acUnpackCommData(device, corner_b0s, &corner_data);
acUnpackCommData(device, edgex_b0s, &edgex_data);
acUnpackCommData(device, edgey_b0s, &edgey_data);

View File

@@ -1,9 +1,19 @@
#pragma once
#include "astaroth.h"
#if AC_MPI_ENABLED
#include <mpi.h>
#include <stdbool.h>
#define MPI_GPUDIRECT_DISABLED (0)
#endif // AC_MPI_ENABLED
typedef struct {
int3 dims;
AcReal* data;
AcReal* data_pinned;
bool pinned = false; // Set if data was received to pinned memory
} PackedData;
typedef struct {