Merged the newest MPI changes
@@ -89,6 +89,8 @@ if (BUILD_SAMPLES)
add_subdirectory(samples/cpptest)
add_subdirectory(samples/mpitest)
add_subdirectory(samples/benchmark)
add_subdirectory(samples/bwtest)
add_subdirectory(samples/genbenchmarkscripts)
endif()

if (BUILD_STANDALONE)
@@ -89,19 +89,45 @@ main(void)
}
}*/

/*
// Basic
const size_t num_iters = 100;

// Warmup
for (size_t i = 0; i < 10; ++i)
for (size_t i = 0; i < num_iters / 10; ++i)
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);

// Benchmark
Timer t;
const AcReal dt = FLT_EPSILON;

acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
acGridSynchronizeStream(STREAM_ALL);

for (size_t i = 0; i < num_iters; ++i)
acGridIntegrate(STREAM_DEFAULT, dt);

acGridSynchronizeStream(STREAM_ALL);
if (!pid)
timer_diff_print(t);
acGridSynchronizeStream(STREAM_ALL);
*/

// Percentiles
const size_t num_iters = 100;
const double nth_percentile = 0.90;

std::vector<double> results; // ms
results.reserve(num_iters);

// Warmup
for (size_t i = 0; i < num_iters / 10; ++i)
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);

// Benchmark
Timer t;
const AcReal dt = FLT_EPSILON;

for (size_t i = 0; i < num_iters; ++i) {
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
@@ -109,9 +135,9 @@ main(void)
acGridIntegrate(STREAM_DEFAULT, dt);
acGridSynchronizeStream(STREAM_ALL);
results.push_back(timer_diff_nsec(t) / 1e6);
acGridSynchronizeStream(STREAM_ALL);
}

// Write benchmark to file
if (!pid) {
std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; });
@@ -137,6 +163,48 @@ main(void)
fclose(fp);
}
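For reference, the percentile reported here is read straight off the sorted samples: with num_iters = 100 and nth_percentile = 0.90 as above, results[nth_percentile * num_iters] selects results[90], the 91st-smallest of the 100 measured integration times, i.e. a 90th-percentile estimate without interpolation.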

/*
const size_t num_iters = 100;
const double nth_percentile = 0.90;

std::vector<double> results; // ms
results.reserve(num_iters);

for (size_t i = 0; i < num_iters; ++i) {
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
acGridSynchronizeStream(STREAM_ALL);
acGridIntegrate(STREAM_DEFAULT, dt);
acGridSynchronizeStream(STREAM_ALL);
results.push_back(timer_diff_nsec(t) / 1e6);
}

// Write benchmark to file
if (!pid) {
std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; });
fprintf(stdout,
"Integration step time %g ms (%gth "
"percentile)--------------------------------------\n",
results[nth_percentile * num_iters], 100 * nth_percentile);

char path[4096] = "";
if (test == TEST_STRONG_SCALING)
strncpy(path, "strong_scaling.csv", sizeof(path));
else if (test == TEST_WEAK_SCALING)
strncpy(path, "weak_scaling.csv", sizeof(path));
else
ERROR("Invalid test type");

FILE* fp = fopen(path, "a");
ERRCHK_ALWAYS(fp);
// Format
// nprocs, measured (ms)
fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);

fclose(fp);
}*/

acGridQuit();
MPI_Finalize();
return EXIT_SUCCESS;
samples/bwtest/CMakeLists.txt (new file, 9 lines)
@@ -0,0 +1,9 @@
cmake_minimum_required(VERSION 3.17) # Required for modern CUDA::cudart linking

find_package(MPI)
find_package(OpenMP)
find_package(CUDAToolkit)

add_executable(bwtest main.c)
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
target_compile_options(bwtest PRIVATE -O3)
samples/bwtest/main.c (new file, 486 lines)
@@ -0,0 +1,486 @@
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <mpi.h>

#include <cuda_runtime_api.h>

#include "timer_hires.h" // From src/common

//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)

/*
Findings:
- MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly
- Need to use cudaMalloc for intranode comm for P2P to trigger with MPI
- For internode one should use pinned memory (RDMA is staged through pinned, gives full
network speed iff pinned)
- Both the sending and receiving arrays must be pinned to see performance improvement
in internode comm
*/
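A minimal standalone sketch of what these findings imply for buffer setup (illustrative only; the helper name alloc_comm_buffer and the internode flag are not part of this commit): pick the device from the MPI rank before any transfer, use cudaMalloc for intranode peer-to-peer, and pinned cudaMallocHost memory for internode RDMA.

// Illustrative sketch of the findings above; not part of the committed file.
static uint8_t*
alloc_comm_buffer(const int pid, const size_t bytes, const bool internode)
{
    int devices_per_node = -1;
    cudaGetDeviceCount(&devices_per_node);
    cudaSetDevice(pid % devices_per_node); // Always set the device explicitly

    uint8_t* buf = NULL;
    // Intranode: plain device memory so MPI can use P2P.
    // Internode: pinned host memory so RDMA staging runs at full network speed.
    const cudaError_t retval = internode ? cudaMallocHost((void**)&buf, bytes)
                                         : cudaMalloc((void**)&buf, bytes);
    assert(retval == cudaSuccess);
    return buf;
}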

static uint8_t*
allocHost(const size_t bytes)
{
uint8_t* arr = malloc(bytes);
assert(arr);
return arr;
}

static void
freeHost(uint8_t* arr)
{
free(arr);
}

static uint8_t*
allocDevice(const size_t bytes)
{
uint8_t* arr;
// Standard (20 GiB/s internode, 85 GiB/s intranode)
const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
// Unified mem (5 GiB/s internode, 6 GiB/s intranode)
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
// const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
assert(retval == cudaSuccess);
return arr;
}

static uint8_t*
allocDevicePinned(const size_t bytes)
{
uint8_t* arr;
// Standard (20 GiB/s internode, 85 GiB/s intranode)
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
// Unified mem (5 GiB/s internode, 6 GiB/s intranode)
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
assert(retval == cudaSuccess);
return arr;
}

static void
freeDevice(uint8_t* arr)
{
cudaFree(arr);
}

static void
sendrecv_blocking(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;

if (!pid) {
MPI_Status status;
MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
}
else {
MPI_Status status;
MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
}
}

static void
sendrecv_nonblocking(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;

MPI_Request recv_request, send_request;
MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &recv_request);
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD, &send_request);

MPI_Status status;
MPI_Wait(&recv_request, &status);
MPI_Wait(&send_request, &status);
}

static void
sendrecv_twoway(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;

MPI_Status status;
MPI_Sendrecv(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, dst, BLOCK_SIZE, MPI_BYTE, nback, nback,
MPI_COMM_WORLD, &status);
}

static void
sendrecv_nonblocking_multiple(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

MPI_Request recv_requests[nprocs], send_requests[nprocs];
for (int i = 1; i < nprocs; ++i) {
int nfront = (pid + i) % nprocs;
int nback = (((pid - i) % nprocs) + nprocs) % nprocs;

MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
}

for (int i = 1; i < nprocs; ++i) {
MPI_Status status;
MPI_Wait(&recv_requests[i], &status);
MPI_Wait(&send_requests[i], &status);
}
}

static void
sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

MPI_Request recv_requests[nprocs], send_requests[nprocs];
for (int i = 1; i < nprocs; ++i) {
int nfront = (pid + i) % nprocs;
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
}

static bool error_shown = false;
if (!pid && !error_shown) {
fprintf(stderr, "\tWARNING: make sure you init MPI with MPI_Init_thread for OpenMP support (not "
"supported on puhti atm "
"2020-04-05)\n");
error_shown = true;
}
#pragma omp parallel for
for (int i = 1; i < nprocs; ++i) {
int nback = (((pid - i) % nprocs) + nprocs) % nprocs;

MPI_Status status;
MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &status);
}

for (int i = 1; i < nprocs; ++i) {
MPI_Status status;
MPI_Wait(&send_requests[i], &status);
}
}

static void
sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

static uint8_t* src_pinned = NULL;
static uint8_t* dst_pinned = NULL;
if (!src_pinned)
src_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed
if (!dst_pinned)
dst_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed

int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);

const int node_id = pid / devices_per_node;

MPI_Request recv_requests[nprocs], send_requests[nprocs];
for (int i = 1; i < nprocs; ++i) {
int nfront = (pid + i) % nprocs;
int nback = (((pid - i) % nprocs) + nprocs) % nprocs;

if (nback / devices_per_node != pid / devices_per_node) // Not on the same node
MPI_Irecv(dst_pinned, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD,
&recv_requests[i]);
else
MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);

if (nfront / devices_per_node != pid / devices_per_node) // Not on the same node
MPI_Isend(src_pinned, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD,
&send_requests[i]);
else
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
}

for (int i = 1; i < nprocs; ++i) {
MPI_Status status;
MPI_Wait(&recv_requests[i], &status);
MPI_Wait(&send_requests[i], &status);
}
}

static void
send_d2h(const uint8_t* src, uint8_t* dst)
{
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost);
}

static void
send_h2d(const uint8_t* src, uint8_t* dst)
{
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
}

static void
sendrecv_d2h2d(const uint8_t* dsrc, uint8_t* hdst, const uint8_t* hsrc, uint8_t* ddst)
{
cudaStream_t d2h, h2d;
cudaStreamCreate(&d2h);
cudaStreamCreate(&h2d);

cudaMemcpyAsync(hdst, dsrc, BLOCK_SIZE, cudaMemcpyDeviceToHost, d2h);
cudaMemcpyAsync(ddst, hsrc, BLOCK_SIZE, cudaMemcpyHostToDevice, h2d);

cudaStreamSynchronize(d2h);
cudaStreamSynchronize(h2d);

cudaStreamDestroy(d2h);
cudaStreamDestroy(h2d);
}

#define PRINT \
if (!pid) \
printf

static void
measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*), uint8_t* src,
uint8_t* dst)
{
const size_t num_samples = 100;

int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

PRINT("%s\n", msg);
MPI_Barrier(MPI_COMM_WORLD);

PRINT("\tWarming up... ");
for (size_t i = 0; i < num_samples / 10; ++i)
sendrecv(src, dst);

MPI_Barrier(MPI_COMM_WORLD);
PRINT("Done\n");

PRINT("\tBandwidth... ");
fflush(stdout);

Timer t;
MPI_Barrier(MPI_COMM_WORLD);
timer_reset(&t);
MPI_Barrier(MPI_COMM_WORLD);

for (size_t i = 0; i < num_samples; ++i)
sendrecv(src, dst);

MPI_Barrier(MPI_COMM_WORLD);
const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
MPI_Barrier(MPI_COMM_WORLD);
}
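As a rough worked example of the bandwidth formula above (the timing is illustrative): BLOCK_SIZE is 256 * 256 * 3 * 8 * 8 bytes = 12 MiB, so a bidirectional test passes bytes = 2 * BLOCK_SIZE = 24 MiB; if the 100 timed iterations take 0.1 s in total, the printout is 100 * 24 MiB / 0.1 s / 1024^3 ≈ 23.4 GiB/s with a per-transfer time of 1 ms.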

static void
measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(const uint8_t*, uint8_t*, const uint8_t*, uint8_t*), const uint8_t* dsrc, uint8_t* hdst,
const uint8_t* hsrc, uint8_t* ddst)
{
const size_t num_samples = 100;

int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

PRINT("%s\n", msg);
MPI_Barrier(MPI_COMM_WORLD);

PRINT("\tWarming up... ");
for (size_t i = 0; i < num_samples / 10; ++i)
sendrecv(dsrc, hdst, hsrc, ddst);

MPI_Barrier(MPI_COMM_WORLD);
PRINT("Done\n");

PRINT("\tBandwidth... ");
fflush(stdout);

Timer t;
MPI_Barrier(MPI_COMM_WORLD);
timer_reset(&t);
MPI_Barrier(MPI_COMM_WORLD);

for (size_t i = 0; i < num_samples; ++i)
sendrecv(dsrc, hdst, hsrc, ddst);

MPI_Barrier(MPI_COMM_WORLD);
const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
MPI_Barrier(MPI_COMM_WORLD);
}

int
main(void)
{
MPI_Init(NULL, NULL);
// int provided;
// MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
// assert(provided >= MPI_THREAD_MULTIPLE);

// Disable stdout buffering
setbuf(stdout, NULL);

int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
assert(nprocs >= 2); // Require at least one neighbor

MPI_Barrier(MPI_COMM_WORLD);
if (!pid) {
printf("Do we have threads? The following should not be ordered (unless very lucky)\n");
#pragma omp parallel for
for (int i = 0; i < 10; ++i)
printf("%d, ", i);
printf("\n");
}
MPI_Barrier(MPI_COMM_WORLD);

int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);
const int device_id = pid % devices_per_node;
cudaSetDevice(device_id);

printf("Process %d of %d running.\n", pid, nprocs);
MPI_Barrier(MPI_COMM_WORLD);

PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));

#if 1
{
uint8_t* src = allocHost(BLOCK_SIZE);
uint8_t* dst = allocHost(BLOCK_SIZE);

measurebw("Unidirectional bandwidth, blocking (Host)", //
2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
measurebw("Bidirectional bandwidth, async (Host)", //
2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
measurebw("Bidirectional bandwidth, twoway (Host)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Host)", //
2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
//measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
//          2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);

freeHost(src);
freeHost(dst);
}
PRINT("\n------------------------\n");

{
uint8_t* src = allocDevice(BLOCK_SIZE);
uint8_t* dst = allocDevice(BLOCK_SIZE);

measurebw("Unidirectional bandwidth, blocking (Device)", //
2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
measurebw("Bidirectional bandwidth, async (Device)", //
2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
measurebw("Bidirectional bandwidth, twoway (Device)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device)", //
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
//measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
//          2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);

freeDevice(src);
freeDevice(dst);
}
PRINT("\n------------------------\n");

{
uint8_t* src = allocDevicePinned(BLOCK_SIZE);
uint8_t* dst = allocDevicePinned(BLOCK_SIZE);

measurebw("Unidirectional bandwidth, blocking (Device, pinned)", //
2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
measurebw("Bidirectional bandwidth, async (Device, pinned)", //
2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);

freeDevice(src);
freeDevice(dst);
}
PRINT("\n------------------------\n");

{
uint8_t* hsrc = allocHost(BLOCK_SIZE);
uint8_t* hdst = allocHost(BLOCK_SIZE);
uint8_t* dsrc = allocDevice(BLOCK_SIZE);
uint8_t* ddst = allocDevice(BLOCK_SIZE);

measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);

measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);

freeDevice(dsrc);
freeDevice(ddst);
freeHost(hsrc);
freeHost(hdst);
}
PRINT("\n------------------------\n");

{
uint8_t* hsrc = allocHost(BLOCK_SIZE);
uint8_t* hdst = allocHost(BLOCK_SIZE);
uint8_t* dsrc = allocDevicePinned(BLOCK_SIZE);
uint8_t* ddst = allocDevicePinned(BLOCK_SIZE);

measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);

measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);

freeDevice(dsrc);
freeDevice(ddst);
freeHost(hsrc);
freeHost(hdst);
}
PRINT("\n------------------------\n");
#else
{ // Final run for easy identification with the profiler
uint8_t* src = allocDevice(BLOCK_SIZE);
uint8_t* dst = allocDevice(BLOCK_SIZE);

measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);

freeDevice(src);
freeDevice(dst);
}
#endif

MPI_Finalize();
return EXIT_SUCCESS;
}
samples/genbenchmarkscripts/CMakeLists.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
add_executable(genbenchmarkscripts main.c)

add_custom_command(
TARGET genbenchmarkscripts POST_BUILD
COMMAND genbenchmarkscripts
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Generating benchmark scripts"
)
samples/genbenchmarkscripts/main.c (new file, 46 lines)
@@ -0,0 +1,46 @@
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
const int max_nprocs = 128;
for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
char filename[4096];
sprintf(filename, "benchmark_%d.sh", nprocs);

FILE* fp = fopen(filename, "w");
assert(fp);

// Boilerplate
fprintf(fp, "#!/bin/bash\n");
fprintf(fp, "#SBATCH --job-name=astaroth\n");
fprintf(fp, "#SBATCH --account=project_2000403\n");
fprintf(fp, "#SBATCH --time=00:14:59\n");
fprintf(fp, "#SBATCH --mem=32000\n");
fprintf(fp, "#SBATCH --partition=gpu\n");

// nprocs, nodes, gpus
const int max_gpus_per_node = 4;
const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
const int nodes = (int)ceil((double)nprocs / max_gpus_per_node);
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
fprintf(fp, "#SBATCH -n %d\n", nprocs);
fprintf(fp, "#SBATCH -N %d\n", nodes);

// Modules
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");

// Profile and run
fprintf(fp, "mkdir -p profile_%d\n", nprocs);
fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
nprocs);

fclose(fp);
}

return EXIT_SUCCESS;
}
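As a worked example of the request arithmetic above: for nprocs = 32 the generated benchmark_32.sh asks for gpus_per_node = min(32, 4) = 4 and nodes = ceil(32 / 4) = 8, i.e. #SBATCH --gres=gpu:v100:4, -n 32, -N 8; for nprocs = 2 it asks for a single node with two GPUs.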
@@ -74,7 +74,8 @@ main(void)
int
main(void)
{
printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with cmake -DMPI_ENABLED=ON .. to enable.\n");
printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with "
"cmake -DMPI_ENABLED=ON .. to enable.\n");
return EXIT_FAILURE;
}
#endif // AC_MPI_ENABLED
samples/mpitest/main_old.cc (new file, 80 lines)
@@ -0,0 +1,80 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.

This file is part of Astaroth.

Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
Running: mpirun -np <num processes> <executable>
*/
#include "astaroth.h"
#include "astaroth_utils.h"

#include <mpi.h>

int
main(void)
{
MPI_Init(NULL, NULL);
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);

// CPU alloc
AcMeshInfo info;
acLoadConfig(AC_DEFAULT_CONFIG, &info);
info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx];
info.real_params[AC_inv_dsy] = AcReal(1.0) / info.real_params[AC_dsy];
info.real_params[AC_inv_dsz] = AcReal(1.0) / info.real_params[AC_dsz];
info.real_params[AC_cs2_sound] = info.real_params[AC_cs_sound] * info.real_params[AC_cs_sound];

AcMesh model, candidate;
if (pid == 0) {
acMeshCreate(info, &model);
acMeshCreate(info, &candidate);
acMeshRandomize(&model);
acMeshRandomize(&candidate);
}

// GPU alloc & compute
Grid grid;
acGridCreateMPI(info, &grid);

acGridLoadMeshMPI(grid, STREAM_DEFAULT, model);
acGridSynchronizeStreamMPI(grid, STREAM_ALL);

acGridIntegrateMPI(grid, FLT_EPSILON);
acGridSynchronizeStreamMPI(grid, STREAM_ALL);
acGridSynchronizeMeshMPI(grid, STREAM_DEFAULT);
acGridSynchronizeStreamMPI(grid, STREAM_ALL);

acGridStoreMeshMPI(grid, STREAM_DEFAULT, &candidate);
acGridSynchronizeStreamMPI(grid, STREAM_ALL);

acGridDestroyMPI(grid);

// Verify
if (pid == 0) {
acModelIntegrateStep(model, FLT_EPSILON);
acMeshApplyPeriodicBounds(&model);

acVerifyMesh(model, candidate);
acMeshDestroy(&model);
acMeshDestroy(&candidate);
}

MPI_Finalize();
return EXIT_SUCCESS;
}
@@ -105,7 +105,8 @@ operator+(const int3& a, const int3& b)
return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
}

static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
static HOST_DEVICE_INLINE int3
operator*(const int3& a, const int3& b)
{
return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
}
@@ -144,12 +145,20 @@ operator-=(AcReal3& lhs, const AcReal3& rhs)
lhs.z -= rhs.z;
}

static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
static HOST_DEVICE_INLINE int3
operator*(const int& a, const int3& b)
{
return (int3){a * b.x, a * b.y, a * b.z};
}

static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal& a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}

static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal3& b, const AcReal& a)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
@@ -1,11 +1,14 @@
find_package(CUDAToolkit)

## Astaroth Core
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart)

## Options
if (MPI_ENABLED)
find_package(MPI)
target_link_libraries(astaroth_core MPI::MPI_CXX)
find_package(OpenMP)
target_link_libraries(astaroth_core MPI::MPI_CXX OpenMP::OpenMP_CXX)
endif()

if (MULTIGPU_ENABLED)
@@ -460,64 +460,139 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
}

#if AC_MPI_ENABLED
/**
Quick overview of the MPI implementation:

The halo is partitioned into segments. The first coordinate of a segment is b0.
The array containing multiple b0s is called... "b0s".

Each b0 maps to an index in the computational domain of some neighboring process a0.
We have a0 = mod(b0 - nghost, nn) + nghost.
Intuitively, we
1) Transform b0 into a coordinate system where (0, 0, 0) is the first index in
the comp domain.
2) Wrap the transformed b0 around nn (comp domain)
3) Transform b0 back to a coordinate system where (0, 0, 0) is the first index
in the ghost zone

struct PackedData is used for packing and unpacking. Holds the actual data in
the halo partition
struct CommData holds multiple PackedDatas for sending and receiving halo
partitions
struct Grid contains information about the local GPU device, decomposition, the
total mesh dimensions and CommDatas

Basic steps:
1) Distribute the mesh among ranks
2) Integrate & communicate
- start inner integration and at the same time, pack halo data and send it to neighbors
- once all halo data has been received, unpack and do outer integration
- sync and start again
3) Gather the mesh to rank 0 for postprocessing
*/
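A small self-contained sketch of the b0 -> a0 mapping described above (the helper name halo_to_domain is illustrative, not part of this commit); it reuses the non-negative mod() defined below in this file.

// Illustrative sketch of a0 = mod(b0 - nghost, nn) + nghost; not part of the commit.
static int3
halo_to_domain(const int3 b0, const int3 nn, const int nghost)
{
    return (int3){
        (int)mod(b0.x - nghost, nn.x) + nghost,
        (int)mod(b0.y - nghost, nn.y) + nghost,
        (int)mod(b0.z - nghost, nn.z) + nghost,
    };
}
// Example with NGHOST = 3 and nn = (32, 32, 32): the halo block starting at
// b0 = (0, 3, 35) maps to a0 = (32, 3, 3), i.e. the low-x ghost zone is filled
// from the last interior cells in x and the high-z ghost zone from the first
// interior cells in z.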
#include <mpi.h>

static int
#include <stdint.h>

typedef struct {
uint64_t x, y, z;
} uint3_64;

static uint3_64
operator+(const uint3_64& a, const uint3_64& b)
{
return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
}

static int3
make_int3(const uint3_64 a)
{
return (int3){(int)a.x, (int)a.y, (int)a.z};
}

static uint64_t
mod(const int a, const int b)
{
const int r = a % b;
return r < 0 ? r + b : r;
}

static int
getPid(const int3 pid, const int3 decomposition)
static uint3_64
morton3D(const uint64_t pid)
{
return mod(pid.x, decomposition.x) + //
mod(pid.y, decomposition.y) * decomposition.x + //
mod(pid.z, decomposition.z) * decomposition.x * decomposition.y;
uint64_t i, j, k;
i = j = k = 0;
for (int bit = 0; bit <= 21; ++bit) {
const uint64_t mask = 0x1l << 3 * bit;
i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
}

return (uint3_64){i, j, k};
}

static int3
getPid3D(const int pid, const int3 decomposition)
static uint64_t
morton1D(const uint3_64 pid)
{
const int3 pid3d = (int3){
mod(pid, decomposition.x),
mod(pid / decomposition.x, decomposition.y),
(pid / (decomposition.x * decomposition.y)),
uint64_t i = 0;
for (int bit = 0; bit <= 21; ++bit) {
const uint64_t mask = 0x1l << bit;
i |= ((pid.x & mask) << 0) << 2 * bit;
i |= ((pid.y & mask) << 1) << 2 * bit;
i |= ((pid.z & mask) << 2) << 2 * bit;
}
return i;
}

static uint3_64
decompose(const uint64_t target)
{
// This is just so beautifully elegant. Complex and efficient decomposition
// in just one line of code.
uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};

ERRCHK_ALWAYS(p.x * p.y * p.z == target);
return p;
}
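As a quick check of the decomposition above (these values follow from morton3D and agree with the hard-coded table that this code replaces below): decompose(8) gives (2, 2, 2), decompose(16) gives (4, 2, 2), and decompose(32) gives (4, 4, 2); a process count that cannot be built this way, e.g. 24, fails the ERRCHK_ALWAYS check.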

static uint3_64
wrap(const int3 i, const uint3_64 n)
{
return (uint3_64){
mod(i.x, n.x),
mod(i.y, n.y),
mod(i.z, n.z),
};
return pid3d;
}

static int
getPid(const int3 pid_raw, const uint3_64 decomp)
{
const uint3_64 pid = wrap(pid_raw, decomp);
return (int)morton1D(pid);
}

static int3
decompose(const int target)
getPid3D(const uint64_t pid, const uint3_64 decomp)
{
if (target == 16)
return (int3){4, 2, 2};
if (target == 32)
return (int3){4, 4, 2};
if (target == 128)
return (int3){8, 4, 4};
if (target == 256)
return (int3){8, 8, 4};
const uint3_64 pid3D = morton3D(pid);
ERRCHK_ALWAYS(getPid(make_int3(pid3D), decomp) == (int)pid);
return (int3){(int)pid3D.x, (int)pid3D.y, (int)pid3D.z};
}

int decomposition[] = {1, 1, 1};
/** Assumes that contiguous pids are on the same node and there is one process per GPU. */
static inline bool
onTheSameNode(const uint64_t pid_a, const uint64_t pid_b)
{
int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);

int axis = 0;
while (decomposition[0] * decomposition[1] * decomposition[2] < target) {
++decomposition[axis];
axis = (axis + 1) % 3;
}
const uint64_t node_a = pid_a / devices_per_node;
const uint64_t node_b = pid_b / devices_per_node;

const int found = decomposition[0] * decomposition[1] * decomposition[2];
if (found != target) {
fprintf(stderr, "Invalid number of processes! Cannot decompose the problem domain!\n");
fprintf(stderr, "Target nprocs: %d. Next allowed: %d\n", target, found);
ERROR("Invalid nprocs");
return (int3){-1, -1, -1};
}
else {
return (int3){decomposition[0], decomposition[1], decomposition[2]};
}
return node_a == node_b;
}
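For example, with four visible devices per node, ranks 0-3 land on node 0 and ranks 4-7 on node 1, so onTheSameNode(3, 4) is false while onTheSameNode(4, 7) is true; acTransferCommData below uses this to keep intranode halo exchange in device memory and to route internode exchange through the pinned staging buffers.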

static PackedData
@@ -530,12 +605,18 @@ acCreatePackedData(const int3 dims)
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));

ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
// slower than pinned (38 ms vs. 125 ms)

return data;
}

static AcResult
acDestroyPackedData(PackedData* data)
{
cudaFree(data->data_pinned);

data->dims = (int3){-1, -1, -1};
cudaFree(data->data);
data->data = NULL;
@@ -558,6 +639,16 @@ acCreatePackedDataHost(const int3 dims)
return data;
}

static AcResult
acDestroyPackedDataHost(PackedData* data)
{
data->dims = (int3){-1, -1, -1};
free(data->data);
data->data = NULL;

return AC_SUCCESS;
}

static void
acTransferPackedDataToHost(const Device device, const cudaStream_t stream, const PackedData ddata,
PackedData* hdata)
@@ -579,21 +670,38 @@ acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, con
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice, stream));
}

static AcResult
acDestroyPackedDataHost(PackedData* data)
{
data->dims = (int3){-1, -1, -1};
free(data->data);
data->data = NULL;

return AC_SUCCESS;
}
#endif // MPI_GPUDIRECT_DISABLED

static void
acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
{
cudaSetDevice(device->id);
// TODO sync stream
ddata->pinned = true;

const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDefault, stream));
}

static void
acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
{
if (!ddata->pinned) // Unpin iff the data was pinned previously
return;

cudaSetDevice(device->id);
// TODO sync stream
ddata->pinned = false;

const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream));
}

// TODO: do with packed data
static AcResult
acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
acDeviceDistributeMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Distributing mesh...\n");
@@ -669,7 +777,7 @@ acDeviceDistributeMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* ds

// TODO: do with packed data
static AcResult
acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
acDeviceGatherMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Gathering mesh...\n");
@@ -803,7 +911,9 @@ acCreateCommData(const Device device, const int3 dims, const size_t count)
data.dsts_host[i] = acCreatePackedDataHost(dims);
#endif

cudaStreamCreate(&data.streams[i]);
int low_prio, high_prio;
cudaDeviceGetStreamPriorityRange(&low_prio, &high_prio);
cudaStreamCreateWithPriority(&data.streams[i], cudaStreamNonBlocking, high_prio);
}

return data;
@@ -849,12 +959,28 @@ acSyncCommData(const CommData data)
cudaStreamSynchronize(data.streams[i]);
}

static int3
mod(const int3 a, const int3 n)
{
return (int3){mod(a.x, n.x), mod(a.y, n.y), mod(a.z, n.z)};
}

static void
acPackCommData(const Device device, const int3* a0s, CommData* data)
acPackCommData(const Device device, const int3* b0s, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i)
acKernelPackData(data->streams[i], device->vba, a0s[i], data->srcs[i]);

const int3 nn = (int3){
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
};
const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};

for (size_t i = 0; i < data->count; ++i) {
const int3 a0 = mod(b0s[i] - nghost, nn) + nghost;
acKernelPackData(data->streams[i], device->vba, a0, data->srcs[i]);
}
}

static void
@@ -884,10 +1010,31 @@ acTransferCommDataToDevice(const Device device, CommData* data)
}
#endif

static inline void
acPinCommData(const Device device, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i)
acPinPackedData(device, data->streams[i], &data->srcs[i]);
}

static void
acUnpinCommData(const Device device, CommData* data)
{
cudaSetDevice(device->id);

// Clear pin flags from src
for (size_t i = 0; i < data->count; ++i)
data->srcs[i].pinned = false;

// Transfer from pinned to gmem
for (size_t i = 0; i < data->count; ++i)
acUnpinPackedData(device, data->streams[i], &data->dsts[i]);
}

static AcResult
acTransferCommData(const Device device, //
const int3* a0s, // Src idx inside comp. domain
const int3* b0s, // Dst idx inside bound zone
const int3* b0s, // Halo partition coordinates
CommData* data)
{
cudaSetDevice(device->id);
@@ -899,7 +1046,7 @@ acTransferCommData(const Device device, //
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
const int3 decomp = decompose(nprocs);
const uint3_64 decomp = decompose(nprocs);

const int3 nn = (int3){
device->local_config.int_params[AC_nx],
@@ -907,80 +1054,55 @@ acTransferCommData(const Device device, //
device->local_config.int_params[AC_nz],
};

const int3 pid3d = getPid3D(pid, decomp);
const int3 dims = data->dims;
const size_t blockcount = data->count;

for (int k = -1; k <= 1; ++k) {
for (int j = -1; j <= 1; ++j) {
for (int i = -1; i <= 1; ++i) {
if (i == 0 && j == 0 && k == 0)
continue;

for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
const int3 neighbor = (int3){i, j, k};

const int3 a0 = a0s[a_idx];
// const int3 a1 = a0 + dims;

const int3 b0 = a0 - neighbor * nn;
// const int3 b1 = a1 - neighbor * nn;

if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {

const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};

#if MPI_GPUDIRECT_DISABLED
PackedData dst = data->dsts_host[b_idx];
#else
PackedData dst = data->dsts[b_idx];
#endif
for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {

const int3 pid3d = getPid3D(pid, decomp);
MPI_Irecv(dst.data, count, datatype, getPid(pid3d - neighbor, decomp),
b_idx, MPI_COMM_WORLD, &data->recv_reqs[b_idx]);
}
}
}
const int3 b0 = b0s[b0_idx];
const int3 neighbor = (int3){
b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0,
b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0,
b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0,
};
const int npid = getPid(pid3d + neighbor, decomp);

PackedData* dst = &data->dsts[b0_idx];
if (onTheSameNode(pid, npid)) {
MPI_Irecv(dst->data, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
dst->pinned = false;
}
else {
MPI_Irecv(dst->data_pinned, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
dst->pinned = true;
}
}

for (int k = -1; k <= 1; ++k) {
for (int j = -1; j <= 1; ++j) {
for (int i = -1; i <= 1; ++i) {
if (i == 0 && j == 0 && k == 0)
continue;
for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
const int3 b0 = b0s[b0_idx];
const int3 neighbor = (int3){
b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0,
b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0,
b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0,
};
const int npid = getPid(pid3d - neighbor, decomp);

for (size_t a_idx = 0; a_idx < blockcount; ++a_idx) {
for (size_t b_idx = 0; b_idx < blockcount; ++b_idx) {
const int3 neighbor = (int3){i, j, k};

const int3 a0 = a0s[a_idx];
// const int3 a1 = a0 + dims;

const int3 b0 = a0 - neighbor * nn;
// const int3 b1 = a1 - neighbor * nn;

if (b0s[b_idx].x == b0.x && b0s[b_idx].y == b0.y && b0s[b_idx].z == b0.z) {

const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;

#if MPI_GPUDIRECT_DISABLED
PackedData src = data->srcs_host[a_idx];
#else
PackedData src = data->srcs[a_idx];
#endif

const int3 pid3d = getPid3D(pid, decomp);

cudaStreamSynchronize(data->streams[a_idx]);
MPI_Isend(src.data, count, datatype, getPid(pid3d + neighbor, decomp),
b_idx, MPI_COMM_WORLD, &data->send_reqs[b_idx]);
}
}
}
PackedData* src = &data->srcs[b0_idx];
if (onTheSameNode(pid, npid)) {
cudaStreamSynchronize(data->streams[b0_idx]);
MPI_Isend(src->data, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
}
else {
acPinPackedData(device, data->streams[b0_idx], src);
cudaStreamSynchronize(data->streams[b0_idx]);
MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
}
}
@@ -999,7 +1121,7 @@ acTransferCommDataWait(const CommData data)
typedef struct {
Device device;
AcMesh submesh;
int3 decomposition;
uint3_64 decomposition;
bool initialized;

int3 nn;
@@ -1041,10 +1163,10 @@ acGridInit(const AcMeshInfo info)

// Decompose
AcMeshInfo submesh_info = info;
const int3 decomposition = decompose(nprocs);
const uint3_64 decomposition = decompose(nprocs);
const int3 pid3d = getPid3D(pid, decomposition);

printf("Decomposition: %d, %d, %d\n", decomposition.x, decomposition.y, decomposition.z);
printf("Decomposition: %lu, %lu, %lu\n", decomposition.x, decomposition.y, decomposition.z);
printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
@@ -1089,79 +1211,19 @@ acGridInit(const AcMeshInfo info)
device->local_config.int_params[AC_nz],
};

// Corners
const int3 corner_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
(int3){nn.x, nn.y, nn.z},
};

// Edges X
const int3 edgex_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
};

// Edges Y
const int3 edgey_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
};

// Edges Z
const int3 edgez_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //

(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
};

// Sides XY
const int3 sidexy_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
};

// Sides XZ
const int3 sidexz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
};

// Sides YZ
const int3 sideyz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
};

const int3 corner_dims = (int3){NGHOST, NGHOST, NGHOST};
const int3 edgex_dims = (int3){nn.x, NGHOST, NGHOST};
const int3 edgey_dims = (int3){NGHOST, nn.y, NGHOST};
const int3 edgez_dims = (int3){NGHOST, NGHOST, nn.z};
const int3 sidexy_dims = (int3){nn.x, nn.y, NGHOST};
const int3 sidexz_dims = (int3){nn.x, NGHOST, nn.z};
const int3 sideyz_dims = (int3){NGHOST, nn.y, nn.z};
// Create CommData
// We have 8 corners, 12 edges, and 6 sides
//
// For simplicity's sake all data blocks inside a single CommData struct
// have the same dimensions.
grid.nn = nn;
grid.corner_data = acCreateCommData(device, corner_dims, ARRAY_SIZE(corner_a0s));
grid.edgex_data = acCreateCommData(device, edgex_dims, ARRAY_SIZE(edgex_a0s));
grid.edgey_data = acCreateCommData(device, edgey_dims, ARRAY_SIZE(edgey_a0s));
grid.edgez_data = acCreateCommData(device, edgez_dims, ARRAY_SIZE(edgez_a0s));
grid.sidexy_data = acCreateCommData(device, sidexy_dims, ARRAY_SIZE(sidexy_a0s));
grid.sidexz_data = acCreateCommData(device, sidexz_dims, ARRAY_SIZE(sidexz_a0s));
grid.sideyz_data = acCreateCommData(device, sideyz_dims, ARRAY_SIZE(sideyz_a0s));
grid.corner_data = acCreateCommData(device, (int3){NGHOST, NGHOST, NGHOST}, 8);
grid.edgex_data = acCreateCommData(device, (int3){nn.x, NGHOST, NGHOST}, 4);
grid.edgey_data = acCreateCommData(device, (int3){NGHOST, nn.y, NGHOST}, 4);
grid.edgez_data = acCreateCommData(device, (int3){NGHOST, NGHOST, nn.z}, 4);
grid.sidexy_data = acCreateCommData(device, (int3){nn.x, nn.y, NGHOST}, 2);
grid.sidexz_data = acCreateCommData(device, (int3){nn.x, NGHOST, nn.z}, 2);
grid.sideyz_data = acCreateCommData(device, (int3){NGHOST, nn.y, nn.z}, 2);

acGridSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
@@ -1182,7 +1244,7 @@ acGridQuit(void)
acDestroyCommData(grid.device, &grid.sideyz_data);

grid.initialized = false;
grid.decomposition = (int3){-1, -1, -1};
grid.decomposition = (uint3_64){0, 0, 0};
acMeshDestroy(&grid.submesh);
acDeviceDestroy(grid.device);

@@ -1220,7 +1282,7 @@ AcResult
acGridIntegrate(const Stream stream, const AcReal dt)
{
ERRCHK(grid.initialized);
//acGridSynchronizeStream(stream);
// acGridSynchronizeStream(stream);

const Device device = grid.device;
const int3 nn = grid.nn;
@@ -1235,37 +1297,19 @@ acGridIntegrate(const Stream stream, const AcReal dt)
acDeviceSynchronizeStream(device, stream);

// Corners
const int3 corner_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
(int3){nn.x, nn.y, nn.z},
};
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},

(int3){0, 0, NGHOST + nn.z},

(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};

// Edges X
const int3 edgex_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){NGHOST, nn.y, nn.z}, //
};
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
@@ -1275,13 +1319,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
};

// Edges Y
const int3 edgey_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //

(int3){NGHOST, NGHOST, nn.z}, //
(int3){nn.x, NGHOST, nn.z}, //
};
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
@@ -1291,13 +1328,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
};

// Edges Z
const int3 edgez_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //

(int3){NGHOST, nn.y, NGHOST}, //
(int3){nn.x, nn.y, NGHOST}, //
};
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
@@ -1307,63 +1337,31 @@ acGridIntegrate(const Stream stream, const AcReal dt)
};

// Sides XY
const int3 sidexy_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, NGHOST, nn.z}, //
};
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};

// Sides XZ
const int3 sidexz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){NGHOST, nn.y, NGHOST}, //
};
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};

// Sides YZ
const int3 sideyz_a0s[] = {
(int3){NGHOST, NGHOST, NGHOST}, //
(int3){nn.x, NGHOST, NGHOST}, //
};
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};

for (int isubstep = 0; isubstep < 3; ++isubstep) {
acPackCommData(device, corner_a0s, &corner_data);
acPackCommData(device, edgex_a0s, &edgex_data);
acPackCommData(device, edgey_a0s, &edgey_data);
acPackCommData(device, edgez_a0s, &edgez_data);
acPackCommData(device, sidexy_a0s, &sidexy_data);
acPackCommData(device, sidexz_a0s, &sidexz_data);
acPackCommData(device, sideyz_a0s, &sideyz_data);

MPI_Barrier(MPI_COMM_WORLD);

#if MPI_GPUDIRECT_DISABLED
acTransferCommDataToHost(device, &corner_data);
acTransferCommDataToHost(device, &edgex_data);
acTransferCommDataToHost(device, &edgey_data);
acTransferCommDataToHost(device, &edgez_data);
acTransferCommDataToHost(device, &sidexy_data);
acTransferCommDataToHost(device, &sidexz_data);
acTransferCommDataToHost(device, &sideyz_data);
#endif

acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
// acPackCommData(device, corner_b0s, &corner_data);
acPackCommData(device, edgex_b0s, &edgex_data);
acPackCommData(device, edgey_b0s, &edgey_data);
acPackCommData(device, edgez_b0s, &edgez_data);
acPackCommData(device, sidexy_b0s, &sidexy_data);
acPackCommData(device, sidexz_b0s, &sidexz_data);
acPackCommData(device, sideyz_b0s, &sideyz_data);

//////////// INNER INTEGRATION //////////////
{
@@ -1373,7 +1371,27 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    }
    ////////////////////////////////////////////

    acTransferCommDataWait(corner_data);
    MPI_Barrier(MPI_COMM_WORLD);

#if MPI_GPUDIRECT_DISABLED
    // acTransferCommDataToHost(device, &corner_data);
    acTransferCommDataToHost(device, &edgex_data);
    acTransferCommDataToHost(device, &edgey_data);
    acTransferCommDataToHost(device, &edgez_data);
    acTransferCommDataToHost(device, &sidexy_data);
    acTransferCommDataToHost(device, &sidexz_data);
    acTransferCommDataToHost(device, &sideyz_data);
#endif

    // acTransferCommData(device, corner_b0s, &corner_data);
    acTransferCommData(device, edgex_b0s, &edgex_data);
    acTransferCommData(device, edgey_b0s, &edgey_data);
    acTransferCommData(device, edgez_b0s, &edgez_data);
    acTransferCommData(device, sidexy_b0s, &sidexy_data);
    acTransferCommData(device, sidexz_b0s, &sidexz_data);
    acTransferCommData(device, sideyz_b0s, &sideyz_data);

    // acTransferCommDataWait(corner_data);
    acTransferCommDataWait(edgex_data);
    acTransferCommDataWait(edgey_data);
    acTransferCommDataWait(edgez_data);
@@ -1382,7 +1400,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    acTransferCommDataWait(sideyz_data);

#if MPI_GPUDIRECT_DISABLED
    acTransferCommDataToDevice(device, &corner_data);
    // acTransferCommDataToDevice(device, &corner_data);
    acTransferCommDataToDevice(device, &edgex_data);
    acTransferCommDataToDevice(device, &edgey_data);
    acTransferCommDataToDevice(device, &edgez_data);
@@ -1391,7 +1409,15 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    acTransferCommDataToDevice(device, &sideyz_data);
#endif

    acUnpackCommData(device, corner_b0s, &corner_data);
    // acUnpinCommData(device, &corner_data);
    acUnpinCommData(device, &edgex_data);
    acUnpinCommData(device, &edgey_data);
    acUnpinCommData(device, &edgez_data);
    acUnpinCommData(device, &sidexy_data);
    acUnpinCommData(device, &sidexz_data);
    acUnpinCommData(device, &sideyz_data);

    // acUnpackCommData(device, corner_b0s, &corner_data);
    acUnpackCommData(device, edgex_b0s, &edgex_data);
    acUnpackCommData(device, edgey_b0s, &edgey_data);
    acUnpackCommData(device, edgez_b0s, &edgez_data);
@@ -1401,7 +1427,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    //////////// OUTER INTEGRATION //////////////

    // Wait for unpacking
    acSyncCommData(corner_data);
    // acSyncCommData(corner_data);
    acSyncCommData(edgex_data);
    acSyncCommData(edgey_data);
    acSyncCommData(edgez_data);
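To make the ordering above easier to follow, here is a condensed sketch of the per-segment halo-exchange pipeline that the calls in acGridIntegrate pass through. It is illustrative only: exchange_segment is not a function in this commit, and the Device, CommData, and int3 types are assumed to come from Astaroth's internal headers; the call signatures simply mirror the ones shown above.

// Illustrative sketch (not part of the commit): the stages every CommData
// segment (corners, edges, sides) goes through in the code above, written
// out for a single segment instead of being interleaved across all of them.
static void
exchange_segment(const Device device, const int3* a0s, const int3* b0s, CommData* data)
{
    acPackCommData(device, a0s, data);          // gather halo cells into send buffers
#if MPI_GPUDIRECT_DISABLED
    acTransferCommDataToHost(device, data);     // stage through pinned host memory
#endif
    acTransferCommData(device, a0s, b0s, data); // post the MPI sends/receives
    acTransferCommDataWait(*data);              // wait for the MPI requests
#if MPI_GPUDIRECT_DISABLED
    acTransferCommDataToDevice(device, data);   // copy received halos back to the GPU
#endif
    acUnpinCommData(device, data);              // release the pinned staging buffers
    acUnpackCommData(device, b0s, data);        // scatter received halos into the ghost zone
    acSyncCommData(*data);                      // wait for the unpacking kernels
}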
@@ -1463,37 +1489,19 @@ acGridPeriodicBoundconds(const Stream stream)
    CommData sideyz_data = grid.sideyz_data;

    // Corners
    const int3 corner_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //
        (int3){NGHOST, nn.y, NGHOST},   //
        (int3){nn.x, nn.y, NGHOST},     //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){nn.x, NGHOST, nn.z},   //
        (int3){NGHOST, nn.y, nn.z},   //
        (int3){nn.x, nn.y, nn.z},
    };
    const int3 corner_b0s[] = {
        (int3){0, 0, 0},
        (int3){NGHOST + nn.x, 0, 0},
        (int3){0, NGHOST + nn.y, 0},
        (int3){NGHOST + nn.x, NGHOST + nn.y, 0},

        (int3){0, 0, NGHOST + nn.z},
        (int3){NGHOST + nn.x, 0, NGHOST + nn.z},
        (int3){0, NGHOST + nn.y, NGHOST + nn.z},
        (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
    };

    // Edges X
    const int3 edgex_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, nn.y, NGHOST},   //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){NGHOST, nn.y, nn.z},   //
    };
    const int3 edgex_b0s[] = {
        (int3){NGHOST, 0, 0},
        (int3){NGHOST, NGHOST + nn.y, 0},
@@ -1503,13 +1511,6 @@ acGridPeriodicBoundconds(const Stream stream)
    };

    // Edges Y
    const int3 edgey_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //

        (int3){NGHOST, NGHOST, nn.z}, //
        (int3){nn.x, NGHOST, nn.z},   //
    };
    const int3 edgey_b0s[] = {
        (int3){0, NGHOST, 0},
        (int3){NGHOST + nn.x, NGHOST, 0},
@@ -1519,13 +1520,6 @@ acGridPeriodicBoundconds(const Stream stream)
    };

    // Edges Z
    const int3 edgez_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //

        (int3){NGHOST, nn.y, NGHOST}, //
        (int3){nn.x, nn.y, NGHOST},   //
    };
    const int3 edgez_b0s[] = {
        (int3){0, 0, NGHOST},
        (int3){NGHOST + nn.x, 0, NGHOST},
@@ -1535,42 +1529,32 @@ acGridPeriodicBoundconds(const Stream stream)
    };

    // Sides XY
    const int3 sidexy_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, NGHOST, nn.z},   //
    };
    const int3 sidexy_b0s[] = {
        (int3){NGHOST, NGHOST, 0},             //
        (int3){NGHOST, NGHOST, NGHOST + nn.z}, //
    };

    // Sides XZ
    const int3 sidexz_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){NGHOST, nn.y, NGHOST},   //
    };
    const int3 sidexz_b0s[] = {
        (int3){NGHOST, 0, NGHOST},             //
        (int3){NGHOST, NGHOST + nn.y, NGHOST}, //
    };

    // Sides YZ
    const int3 sideyz_a0s[] = {
        (int3){NGHOST, NGHOST, NGHOST}, //
        (int3){nn.x, NGHOST, NGHOST},   //
    };
    const int3 sideyz_b0s[] = {
        (int3){0, NGHOST, NGHOST},             //
        (int3){NGHOST + nn.x, NGHOST, NGHOST}, //
    };
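    // Note (inferred from the index arrays above, not text from the commit):
    // each *_a0s[i] appears to be the origin of a halo segment inside the local
    // computational domain, while the matching *_b0s[i] is the origin of the
    // ghost-zone slot that segment is unpacked into on the receiving side. For
    // example, the corner packed from (NGHOST, NGHOST, NGHOST) lands in the
    // opposite ghost corner at (0, 0, 0), and the XY side packed from
    // (NGHOST, NGHOST, nn.z) lands at (NGHOST, NGHOST, NGHOST + nn.z).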
    acPackCommData(device, corner_a0s, &corner_data);
    acPackCommData(device, edgex_a0s, &edgex_data);
    acPackCommData(device, edgey_a0s, &edgey_data);
    acPackCommData(device, edgez_a0s, &edgez_data);
    acPackCommData(device, sidexy_a0s, &sidexy_data);
    acPackCommData(device, sidexz_a0s, &sidexz_data);
    acPackCommData(device, sideyz_a0s, &sideyz_data);
    acPackCommData(device, corner_b0s, &corner_data);
    acPackCommData(device, edgex_b0s, &edgex_data);
    acPackCommData(device, edgey_b0s, &edgey_data);
    acPackCommData(device, edgez_b0s, &edgez_data);
    acPackCommData(device, sidexy_b0s, &sidexy_data);
    acPackCommData(device, sidexz_b0s, &sidexz_data);
    acPackCommData(device, sideyz_b0s, &sideyz_data);

    MPI_Barrier(MPI_COMM_WORLD);

#if MPI_GPUDIRECT_DISABLED
    acTransferCommDataToHost(device, &corner_data);
@@ -1582,13 +1566,13 @@ acGridPeriodicBoundconds(const Stream stream)
    acTransferCommDataToHost(device, &sideyz_data);
#endif

    acTransferCommData(device, corner_a0s, corner_b0s, &corner_data);
    acTransferCommData(device, edgex_a0s, edgex_b0s, &edgex_data);
    acTransferCommData(device, edgey_a0s, edgey_b0s, &edgey_data);
    acTransferCommData(device, edgez_a0s, edgez_b0s, &edgez_data);
    acTransferCommData(device, sidexy_a0s, sidexy_b0s, &sidexy_data);
    acTransferCommData(device, sidexz_a0s, sidexz_b0s, &sidexz_data);
    acTransferCommData(device, sideyz_a0s, sideyz_b0s, &sideyz_data);
    acTransferCommData(device, corner_b0s, &corner_data);
    acTransferCommData(device, edgex_b0s, &edgex_data);
    acTransferCommData(device, edgey_b0s, &edgey_data);
    acTransferCommData(device, edgez_b0s, &edgez_data);
    acTransferCommData(device, sidexy_b0s, &sidexy_data);
    acTransferCommData(device, sidexz_b0s, &sidexz_data);
    acTransferCommData(device, sideyz_b0s, &sideyz_data);

    acTransferCommDataWait(corner_data);
    acTransferCommDataWait(edgex_data);
@@ -1608,6 +1592,14 @@ acGridPeriodicBoundconds(const Stream stream)
    acTransferCommDataToDevice(device, &sideyz_data);
#endif

    acUnpinCommData(device, &corner_data);
    acUnpinCommData(device, &edgex_data);
    acUnpinCommData(device, &edgey_data);
    acUnpinCommData(device, &edgez_data);
    acUnpinCommData(device, &sidexy_data);
    acUnpinCommData(device, &sidexz_data);
    acUnpinCommData(device, &sideyz_data);

    acUnpackCommData(device, corner_b0s, &corner_data);
    acUnpackCommData(device, edgex_b0s, &edgex_data);
    acUnpackCommData(device, edgey_b0s, &edgey_data);
@@ -1,9 +1,19 @@
#pragma once
#include "astaroth.h"

#if AC_MPI_ENABLED
#include <mpi.h>
#include <stdbool.h>

#define MPI_GPUDIRECT_DISABLED (0)
#endif // AC_MPI_ENABLED

typedef struct {
    int3 dims;
    AcReal* data;

    AcReal* data_pinned;
    bool pinned = false; // Set if the data was received into pinned memory
} PackedData;

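A minimal sketch of how a PackedData staging pair could be set up, assuming CUDA for the allocations. createPackedData and num_vtxbufs are illustrative names only, not part of this commit or of the Astaroth API, and error handling is omitted.

#include <cuda_runtime.h>

// Hypothetical helper, for illustration only: allocates the device-side packing
// buffer and the pinned host staging buffer that PackedData refers to.
static PackedData
createPackedData(const int3 dims, const size_t num_vtxbufs)
{
    PackedData packed = {};
    packed.dims = dims;

    const size_t bytes = (size_t)dims.x * dims.y * dims.z * num_vtxbufs * sizeof(AcReal);
    cudaMalloc((void**)&packed.data, bytes);            // packing buffer on the GPU
    cudaMallocHost((void**)&packed.data_pinned, bytes); // pinned host staging buffer
    packed.pinned = false; // set later if a message is received into the pinned buffer
    return packed;
}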
typedef struct {