// astaroth/src/core/device.cc
#include "astaroth.h"
#include <string.h>
#include "astaroth_utils.h"
#include "errchk.h"
#include "math_utils.h"
#include "timer_hires.h"
#include "kernels/kernels.h"
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#define MPI_GPUDIRECT_DISABLED (0)
AcResult
acDevicePrintInfo(const Device device)
{
const int device_id = device->id;
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device_id);
printf("--------------------------------------------------\n");
printf("Device Number: %d\n", device_id);
const size_t bus_id_max_len = 128;
char bus_id[bus_id_max_len];
cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
printf(" PCI bus ID: %s\n", bus_id);
printf(" Device name: %s\n", props.name);
printf(" Compute capability: %d.%d\n", props.major, props.minor);
// Compute
printf(" Compute\n");
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
printf(" Stream processors: %d\n", props.multiProcessorCount);
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
// See https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
printf(" Compute mode: %d\n", (int)props.computeMode);
// Memory
printf(" Global memory\n");
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
printf(" Peak Memory Bandwidth (GiB/s): %f\n",
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
printf(" ECC enabled: %d\n", props.ECCEnabled);
// Memory usage
size_t free_bytes, total_bytes;
cudaMemGetInfo(&free_bytes, &total_bytes);
const size_t used_bytes = total_bytes - free_bytes;
printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
printf(" Caches\n");
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
// MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
// MV: while compiling on the TIARA gp cluster. Therefore commented out.
//!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
//!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
printf(" Other\n");
printf(" Warp size: %d\n", props.warpSize);
// printf(" Single to double perf. ratio: %dx\n",
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
// versions
printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
printf("--------------------------------------------------\n");
return AC_SUCCESS;
}
AcResult
acDeviceAutoOptimize(const Device device)
{
cudaSetDevice(device->id);
const int3 start = (int3){
device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min],
};
const int3 end = (int3){
device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max],
};
return acKernelAutoOptimizeIntegration(start, end, device->vba);
}
AcResult
acDeviceSynchronizeStream(const Device device, const Stream stream)
{
cudaSetDevice(device->id);
if (stream == STREAM_ALL) {
cudaDeviceSynchronize();
}
else {
cudaStreamSynchronize(device->streams[stream]);
}
return AC_SUCCESS;
}
AcResult
acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
{
cudaSetDevice(id);
// cudaDeviceReset(); // Would be good for safety, but messes stuff up if we want to emulate
// multiple devices with a single GPU
// Create Device
struct device_s* device = (struct device_s*)malloc(sizeof(*device));
ERRCHK_ALWAYS(device);
device->id = id;
device->local_config = device_config;
acDevicePrintInfo(device);
// Check that the code was compiled for the proper GPU architecture
printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
"device supports the CUDA architecture you are compiling for.\n"
"Running dummy kernel... ");
fflush(stdout);
acKernelDummy();
printf("Success!\n");
// Concurrency
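// Note: stream priorities outside the range reported by
// cudaDeviceGetStreamPriorityRange are clamped by CUDA, so using the stream
// index i as the priority is safe, but several streams may end up with the
// same effective priority.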
for (int i = 0; i < NUM_STREAMS; ++i) {
cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, i);
}
// Memory
// VBA in/out
const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.out[i], vba_size_bytes));
}
// VBA Profiles
const size_t profile_size_bytes = sizeof(AcReal) * max(device_config.int_params[AC_mx],
max(device_config.int_params[AC_my],
device_config.int_params[AC_mz]));
for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->vba.profiles[i], profile_size_bytes));
}
// Reductions
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_scratchpad,
acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
// Device constants
acDeviceLoadDefaultUniforms(device);
acDeviceLoadMeshInfo(device, device_config);
printf("Created device %d (%p)\n", device->id, device);
*device_handle = device;
// Auto-optimize
if (id == 0) {
acDeviceAutoOptimize(device);
}
return AC_SUCCESS;
}
AcResult
acDeviceDestroy(Device device)
{
cudaSetDevice(device->id);
printf("Destroying device %d (%p)\n", device->id, device);
acDeviceSynchronizeStream(device, STREAM_ALL);
// Memory
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
for (int i = 0; i < NUM_SCALARARRAY_HANDLES; ++i) {
cudaFree(device->vba.profiles[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
// Concurrency
for (int i = 0; i < NUM_STREAMS; ++i) {
cudaStreamDestroy(device->streams[i]);
}
// Destroy Device
free(device);
return AC_SUCCESS;
}
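/*
Minimal single-device lifecycle (an illustrative sketch only; `info`, `mesh`
and the integration loop are assumed to be provided by the caller):

    Device device;
    acDeviceCreate(0, info, &device);
    acDeviceLoadMesh(device, STREAM_0, mesh);
    // ... integrate substeps / apply boundary conditions / reduce ...
    acDeviceStoreMesh(device, STREAM_0, &mesh);
    acDeviceSynchronizeStream(device, STREAM_ALL);
    acDeviceDestroy(device);
*/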
AcResult
acDeviceSwapBuffers(const Device device)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp;
}
return AC_SUCCESS;
}
AcResult
acDeviceLoadScalarArray(const Device device, const Stream stream, const ScalarArrayHandle handle,
const size_t start, const AcReal* data, const size_t num)
{
cudaSetDevice(device->id);
if (handle >= NUM_SCALARARRAY_HANDLES || !NUM_SCALARARRAY_HANDLES) // Invalid handle, or no scalar arrays defined at all
return AC_FAILURE;
ERRCHK((int)(start + num) <= max(device->local_config.int_params[AC_mx],
max(device->local_config.int_params[AC_my],
device->local_config.int_params[AC_mz])));
ERRCHK_ALWAYS(handle < NUM_SCALARARRAY_HANDLES);
ERRCHK_CUDA(cudaMemcpyAsync(&device->vba.profiles[handle][start], data, sizeof(data[0]) * num,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices)
{
cudaSetDevice(device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);
const AcReal* src_ptr = &host_mesh.vertex_buffer[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &device->vba.in[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA( //
cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyHostToDevice, device->streams[stream]) //
);
return AC_SUCCESS;
}
AcResult
acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
const int3 src, const int3 dst, const int num_vertices)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src,
dst, num_vertices);
}
return AC_SUCCESS;
}
AcResult
acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle)
{
const int3 src = (int3){0, 0, 0};
const int3 dst = src;
const size_t num_vertices = acVertexBufferSize(device->local_config);
acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst,
num_vertices);
return AC_SUCCESS;
}
AcResult
acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i);
}
return AC_SUCCESS;
}
AcResult
acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
cudaSetDevice(device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);
const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA( //
cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) //
);
return AC_SUCCESS;
}
AcResult
acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst,
num_vertices, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acDeviceStoreVertexBuffer(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
{
int3 src = (int3){0, 0, 0};
int3 dst = src;
const size_t num_vertices = acVertexBufferSize(device->local_config);
acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices,
host_mesh);
return AC_SUCCESS;
}
AcResult
acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices, Device dst_device)
{
cudaSetDevice(src_device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);
const AcReal* src_ptr = &src_device->vba.in[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &dst_device->vba.in[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA(cudaMemcpyPeerAsync(dst_ptr, dst_device->id, src_ptr, src_device->id, bytes,
src_device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, Device dst_device)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceTransferVertexBufferWithOffset(src_device, stream, (VertexBufferHandle)i, src, dst,
num_vertices, dst_device);
}
return AC_SUCCESS;
}
AcResult
acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, Device dst_device)
{
int3 src = (int3){0, 0, 0};
int3 dst = src;
const size_t num_vertices = acVertexBufferSize(src_device->local_config);
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
return AC_SUCCESS;
}
AcResult
acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceTransferVertexBuffer(src_device, stream, (VertexBufferHandle)i, dst_device);
}
return AC_SUCCESS;
}
AcResult
acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt)
{
cudaSetDevice(device->id);
acDeviceLoadScalarUniform(device, stream, AC_dt, dt);
return acKernelIntegrateSubstep(device->streams[stream], step_number, start, end, device->vba);
}
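/*
Illustrative single-device substep loop (a sketch; `bc_start`, `bc_end`, `start`
and `end` are hypothetical bounds set up by the caller, and the three-stage
scheme mirrors the isubstep loop in acGridIntegrate below):

    for (int step = 0; step < 3; ++step) {
        acDevicePeriodicBoundconds(device, STREAM_0, bc_start, bc_end);
        acDeviceIntegrateSubstep(device, STREAM_0, step, start, end, dt);
        acDeviceSwapBuffers(device);
        acDeviceSynchronizeStream(device, STREAM_ALL);
    }
*/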
AcResult
acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end)
{
cudaSetDevice(device->id);
return acKernelPeriodicBoundconds(device->streams[stream], start, end,
device->vba.in[vtxbuf_handle]);
}
AcResult
acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDevicePeriodicBoundcondStep(device, stream, (VertexBufferHandle)i, start, end);
}
return AC_SUCCESS;
}
AcResult
acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = acKernelReduceScal(device->streams[stream], rtype, start, end,
device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
device->reduce_result);
return AC_SUCCESS;
}
AcResult
acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
#if AC_MPI_ENABLED
/**
Quick overview of the MPI implementation:

The halo is partitioned into segments. The first coordinate of a segment is b0.
The array containing multiple b0s is called... "b0s".
Each b0 maps to an index a0 in the computational domain of some neighboring
process, with a0 = mod(b0 - nghost, nn) + nghost.
Intuitively, we
1) Transform b0 into a coordinate system where (0, 0, 0) is the first index in
   the comp domain.
2) Wrap the transformed b0 around nn (the comp domain).
3) Transform b0 back into a coordinate system where (0, 0, 0) is the first
   index in the ghost zone.

struct PackedData is used for packing and unpacking; it holds the actual data
of one halo partition.
struct CommData holds multiple PackedDatas for sending and receiving halo
partitions.
struct Grid contains information about the local GPU device, the decomposition,
the total mesh dimensions, and the CommDatas.

Basic steps:
1) Distribute the mesh among the ranks.
2) Integrate & communicate:
   - start the inner integration and, at the same time, pack halo data and send
     it to the neighbors
   - once all halo data has been received, unpack and do the outer integration
   - sync and start again
3) Gather the mesh to rank 0 for postprocessing.
*/
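/*
Worked example of the b0 -> a0 mapping (illustrative; assumes NGHOST == 3 and a
local nn of (32, 32, 32)): the low-x side block has b0 = (0, NGHOST, NGHOST) =
(0, 3, 3), so
    a0 = mod(b0 - nghost, nn) + nghost
       = (mod(-3, 32) + 3, 3, 3)
       = (32, 3, 3).
Each rank thus packs its last NGHOST interior yz-planes (x in [32, 35)) and, in
acTransferCommData, sends them to its +x neighbor, which unpacks them into its
low-x ghost zone starting at b0.
*/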
#include <mpi.h>
#include <stdint.h>
typedef struct {
uint64_t x, y, z;
} uint3_64;
static uint3_64
operator+(const uint3_64& a, const uint3_64& b)
{
return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
}
static int3
make_int3(const uint3_64 a)
{
return (int3){(int)a.x, (int)a.y, (int)a.z};
}
static uint64_t
mod(const int a, const int b)
{
const int r = a % b;
return r < 0 ? r + b : r;
}
static uint3_64
morton3D(const uint64_t pid)
{
uint64_t i, j, k;
i = j = k = 0;
for (int bit = 0; bit <= 21; ++bit) {
const uint64_t mask = UINT64_C(1) << 3 * bit; // Unsigned literal: shifting a signed 0x1l to bit 63 would overflow
i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
}
return (uint3_64){i, j, k};
}
static uint64_t
morton1D(const uint3_64 pid)
{
uint64_t i = 0;
for (int bit = 0; bit <= 21; ++bit) {
const uint64_t mask = UINT64_C(1) << bit;
i |= ((pid.x & mask) << 0) << 2 * bit;
i |= ((pid.y & mask) << 1) << 2 * bit;
i |= ((pid.z & mask) << 2) << 2 * bit;
}
return i;
}
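// Example: morton3D(5) == (1, 0, 1), since bits 3b, 3b+1 and 3b+2 of the 1D
// index hold bit b of x, y and z respectively (5 == 0b101). Conversely,
// morton1D((uint3_64){1, 0, 1}) == 5 recovers the original index.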
static uint3_64
decompose(const uint64_t target)
{
// This is just so beautifully elegant. Complex and efficient decomposition
// in just one line of code.
uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
ERRCHK_ALWAYS(p.x * p.y * p.z == target);
return p;
}
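// Note: this yields a valid decomposition only for process counts that are
// powers of two, e.g. 1 -> (1, 1, 1), 2 -> (2, 1, 1), 4 -> (2, 2, 1),
// 8 -> (2, 2, 2), 16 -> (4, 2, 2). Other counts (e.g. 6, which would give
// (2, 1, 2) with product 4) trip the ERRCHK_ALWAYS above.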
static uint3_64
wrap(const int3 i, const uint3_64 n)
{
return (uint3_64){
mod(i.x, n.x),
mod(i.y, n.y),
mod(i.z, n.z),
};
}
static int
getPid(const int3 pid_raw, const uint3_64 decomp)
{
const uint3_64 pid = wrap(pid_raw, decomp);
return (int)morton1D(pid);
}
static int3
getPid3D(const uint64_t pid, const uint3_64 decomp)
{
const uint3_64 pid3D = morton3D(pid);
ERRCHK_ALWAYS(getPid(make_int3(pid3D), decomp) == (int)pid);
return (int3){(int)pid3D.x, (int)pid3D.y, (int)pid3D.z};
}
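// Example: with decomp = (2, 2, 2), the -x neighbor of rank 0 wraps around
// periodically: getPid((int3){-1, 0, 0}, decomp) == morton1D((1, 0, 0)) == 1.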
/** Assumes that contiguous pids are on the same node and there is one process per GPU. */
static inline bool
onTheSameNode(const uint64_t pid_a, const uint64_t pid_b)
{
int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);
const uint64_t node_a = pid_a / devices_per_node;
const uint64_t node_b = pid_b / devices_per_node;
return node_a == node_b;
}
static PackedData
acCreatePackedData(const int3 dims)
{
PackedData data = {};
data.dims = dims;
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
// slower than pinned (38 ms vs. 125 ms)
return data;
}
static AcResult
acDestroyPackedData(PackedData* data)
{
data->dims = (int3){-1, -1, -1};
cudaFree(data->data);
data->data = NULL;
// data_pinned was allocated with cudaMallocHost, so release it with cudaFreeHost
cudaFreeHost(data->data_pinned);
data->data_pinned = NULL;
return AC_SUCCESS;
}
#if MPI_GPUDIRECT_DISABLED
static PackedData
acCreatePackedDataHost(const int3 dims)
{
PackedData data = {};
data.dims = dims;
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
data.data = (AcReal*)malloc(bytes);
ERRCHK_ALWAYS(data.data);
return data;
}
static AcResult
acDestroyPackedDataHost(PackedData* data)
{
data->dims = (int3){-1, -1, -1};
free(data->data);
data->data = NULL;
return AC_SUCCESS;
}
static void
acTransferPackedDataToHost(const Device device, const cudaStream_t stream, const PackedData ddata,
PackedData* hdata)
{
cudaSetDevice(device->id);
const size_t bytes = ddata.dims.x * ddata.dims.y * ddata.dims.z * sizeof(ddata.data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(hdata->data, ddata.data, bytes, cudaMemcpyDeviceToHost, stream));
}
static void
acTransferPackedDataToDevice(const Device device, const cudaStream_t stream, const PackedData hdata,
PackedData* ddata)
{
cudaSetDevice(device->id);
const size_t bytes = hdata.dims.x * hdata.dims.y * hdata.dims.z * sizeof(hdata.data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, hdata.data, bytes, cudaMemcpyHostToDevice, stream));
}
#endif // MPI_GPUDIRECT_DISABLED
static void
acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
{
cudaSetDevice(device->id);
// TODO sync stream
ddata->pinned = true;
const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data_pinned, ddata->data, bytes, cudaMemcpyDefault, stream));
}
static void
acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
{
if (!ddata->pinned) // Unpin iff the data was pinned previously
return;
cudaSetDevice(device->id);
// TODO sync stream
ddata->pinned = false;
const size_t bytes = ddata->dims.x * ddata->dims.y * ddata->dims.z * sizeof(ddata->data[0]) *
NUM_VTXBUF_HANDLES;
ERRCHK_CUDA(cudaMemcpyAsync(ddata->data, ddata->data_pinned, bytes, cudaMemcpyDefault, stream));
}
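// The pinned staging buffer (data_pinned, allocated with cudaMallocHost) is
// used by acTransferCommData below: halo blocks exchanged within a node are
// sent directly from device memory, while blocks crossing node boundaries are
// first staged in page-locked host memory for the MPI transfer.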
// TODO: do with packed data
static AcResult
acDeviceDistributeMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Distributing mesh...\n");
fflush(stdout);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
datatype = MPI_DOUBLE;
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
ERRCHK_ALWAYS(dst);
// Submesh nn
const int3 nn = (int3){
dst->info.int_params[AC_nx],
dst->info.int_params[AC_ny],
dst->info.int_params[AC_nz],
};
// Send to self
if (pid == 0) {
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
const int i = NGHOST;
const int count = nn.x;
const int src_idx = acVertexBufferIdx(i, j, k, src.info);
const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
&src.vertex_buffer[vtxbuf][src_idx], //
count * sizeof(src.vertex_buffer[vtxbuf][0]));
}
}
}
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = NGHOST; k < NGHOST + nn.z; ++k) {
for (int j = NGHOST; j < NGHOST + nn.y; ++j) {
const int i = NGHOST;
const int count = nn.x;
if (pid != 0) {
const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
// Recv
MPI_Status status;
MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, 0, 0,
MPI_COMM_WORLD, &status);
}
else {
for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
const int src_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
j + tgt_pid3d.y * nn.y, //
k + tgt_pid3d.z * nn.z, //
src.info);
// Send
MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, tgt_pid, 0,
MPI_COMM_WORLD);
}
}
}
}
}
return AC_SUCCESS;
}
// TODO: do with packed data
static AcResult
acDeviceGatherMeshMPI(const AcMesh src, const uint3_64 decomposition, AcMesh* dst)
{
MPI_Barrier(MPI_COMM_WORLD);
printf("Gathering mesh...\n");
fflush(stdout);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
datatype = MPI_DOUBLE;
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
if (pid == 0)
ERRCHK_ALWAYS(dst);
// Submesh nn
const int3 nn = (int3){
src.info.int_params[AC_nx],
src.info.int_params[AC_ny],
src.info.int_params[AC_nz],
};
// Submesh mm
const int3 mm = (int3){
src.info.int_params[AC_mx],
src.info.int_params[AC_my],
src.info.int_params[AC_mz],
};
// Send to self
if (pid == 0) {
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = 0; k < mm.z; ++k) {
for (int j = 0; j < mm.y; ++j) {
const int i = 0;
const int count = mm.x;
const int src_idx = acVertexBufferIdx(i, j, k, src.info);
const int dst_idx = acVertexBufferIdx(i, j, k, dst->info);
memcpy(&dst->vertex_buffer[vtxbuf][dst_idx], //
&src.vertex_buffer[vtxbuf][src_idx], //
count * sizeof(src.vertex_buffer[vtxbuf][0]));
}
}
}
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
// For pencils
for (int k = 0; k < mm.z; ++k) {
for (int j = 0; j < mm.y; ++j) {
const int i = 0;
const int count = mm.x;
if (pid != 0) {
// Send
const int src_idx = acVertexBufferIdx(i, j, k, src.info);
MPI_Send(&src.vertex_buffer[vtxbuf][src_idx], count, datatype, 0, 0,
MPI_COMM_WORLD);
}
else {
for (int tgt_pid = 1; tgt_pid < nprocs; ++tgt_pid) {
const int3 tgt_pid3d = getPid3D(tgt_pid, decomposition);
const int dst_idx = acVertexBufferIdx(i + tgt_pid3d.x * nn.x, //
j + tgt_pid3d.y * nn.y, //
k + tgt_pid3d.z * nn.z, //
dst->info);
// Recv
MPI_Status status;
MPI_Recv(&dst->vertex_buffer[vtxbuf][dst_idx], count, datatype, tgt_pid, 0,
MPI_COMM_WORLD, &status);
}
}
}
}
}
return AC_SUCCESS;
}
typedef struct {
PackedData* srcs;
PackedData* dsts;
#if MPI_GPUDIRECT_DISABLED
PackedData* srcs_host;
PackedData* dsts_host;
#endif
int3 dims;
size_t count;
cudaStream_t* streams;
MPI_Request* send_reqs;
MPI_Request* recv_reqs;
} CommData;
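// Example (illustrative): in acGridInit below, the yz-side CommData is created
// with dims = (NGHOST, nn.y, nn.z) and count = 2 (one block per yz face); each
// of its PackedDatas then holds dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES
// reals.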
static CommData
acCreateCommData(const Device device, const int3 dims, const size_t count)
{
cudaSetDevice(device->id);
CommData data = {};
data.srcs = (PackedData*)malloc(count * sizeof(PackedData));
data.dsts = (PackedData*)malloc(count * sizeof(PackedData));
data.dims = dims;
data.count = count;
data.streams = (cudaStream_t*)malloc(count * sizeof(cudaStream_t));
data.send_reqs = (MPI_Request*)malloc(count * sizeof(MPI_Request));
data.recv_reqs = (MPI_Request*)malloc(count * sizeof(MPI_Request));
ERRCHK_ALWAYS(data.srcs);
ERRCHK_ALWAYS(data.dsts);
ERRCHK_ALWAYS(data.streams);
ERRCHK_ALWAYS(data.send_reqs);
ERRCHK_ALWAYS(data.recv_reqs);
#if MPI_GPUDIRECT_DISABLED
data.srcs_host = (PackedData*)malloc(count * sizeof(PackedData));
data.dsts_host = (PackedData*)malloc(count * sizeof(PackedData));
ERRCHK_ALWAYS(data.srcs_host);
ERRCHK_ALWAYS(data.dsts_host);
#endif
for (size_t i = 0; i < count; ++i) {
data.srcs[i] = acCreatePackedData(dims);
data.dsts[i] = acCreatePackedData(dims);
#if MPI_GPUDIRECT_DISABLED
data.srcs_host[i] = acCreatePackedDataHost(dims);
data.dsts_host[i] = acCreatePackedDataHost(dims);
#endif
int low_prio, high_prio;
cudaDeviceGetStreamPriorityRange(&low_prio, &high_prio);
cudaStreamCreateWithPriority(&data.streams[i], cudaStreamNonBlocking, high_prio);
}
return data;
}
static void
acDestroyCommData(const Device device, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i) {
acDestroyPackedData(&data->srcs[i]);
acDestroyPackedData(&data->dsts[i]);
#if MPI_GPUDIRECT_DISABLED
acDestroyPackedDataHost(&data->srcs_host[i]);
acDestroyPackedDataHost(&data->dsts_host[i]);
#endif
cudaStreamDestroy(data->streams[i]);
}
free(data->srcs);
free(data->dsts);
#if MPI_GPUDIRECT_DISABLED
free(data->srcs_host);
free(data->dsts_host);
#endif
free(data->streams);
free(data->send_reqs);
free(data->recv_reqs);
data->count = 0; // count is size_t; -1 would wrap to SIZE_MAX
data->dims = (int3){-1, -1, -1};
}
static void
acSyncCommData(const CommData data)
{
for (size_t i = 0; i < data.count; ++i)
cudaStreamSynchronize(data.streams[i]);
}
static int3
mod(const int3 a, const int3 n)
{
return (int3){(int)mod(a.x, n.x), (int)mod(a.y, n.y), (int)mod(a.z, n.z)};
}
static void
acPackCommData(const Device device, const int3* b0s, CommData* data)
{
cudaSetDevice(device->id);
const int3 nn = (int3){
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
};
const int3 nghost = (int3){NGHOST, NGHOST, NGHOST};
for (size_t i = 0; i < data->count; ++i) {
const int3 a0 = mod(b0s[i] - nghost, nn) + nghost;
acKernelPackData(data->streams[i], device->vba, a0, data->srcs[i]);
}
}
static void
acUnpackCommData(const Device device, const int3* b0s, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i)
acKernelUnpackData(data->streams[i], data->dsts[i], b0s[i], device->vba);
}
#if MPI_GPUDIRECT_DISABLED
static void
acTransferCommDataToHost(const Device device, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i)
acTransferPackedDataToHost(device, data->streams[i], data->srcs[i], &data->srcs_host[i]);
}
static void
acTransferCommDataToDevice(const Device device, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i)
acTransferPackedDataToDevice(device, data->streams[i], data->dsts_host[i], &data->dsts[i]);
}
#endif
static inline void
acPinCommData(const Device device, CommData* data)
{
cudaSetDevice(device->id);
for (size_t i = 0; i < data->count; ++i)
acPinPackedData(device, data->streams[i], &data->srcs[i]);
}
static void
acUnpinCommData(const Device device, CommData* data)
{
cudaSetDevice(device->id);
// Clear pin flags from src
for (size_t i = 0; i < data->count; ++i)
data->srcs[i].pinned = false;
// Transfer from pinned to gmem
for (size_t i = 0; i < data->count; ++i)
acUnpinPackedData(device, data->streams[i], &data->dsts[i]);
}
static AcResult
acTransferCommData(const Device device, //
const int3* b0s, // Halo partition coordinates
CommData* data)
{
cudaSetDevice(device->id);
MPI_Datatype datatype = MPI_FLOAT;
if (sizeof(AcReal) == 8)
datatype = MPI_DOUBLE;
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
const uint3_64 decomp = decompose(nprocs);
const int3 nn = (int3){
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
};
const int3 pid3d = getPid3D(pid, decomp);
const int3 dims = data->dims;
const size_t blockcount = data->count;
const size_t count = dims.x * dims.y * dims.z * NUM_VTXBUF_HANDLES;
for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
const int3 b0 = b0s[b0_idx];
const int3 neighbor = (int3){
b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0,
b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0,
b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0,
};
const int npid = getPid(pid3d + neighbor, decomp);
PackedData* dst = &data->dsts[b0_idx];
if (onTheSameNode(pid, npid)) {
MPI_Irecv(dst->data, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
dst->pinned = false;
}
else {
MPI_Irecv(dst->data_pinned, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
dst->pinned = true;
}
}
for (size_t b0_idx = 0; b0_idx < blockcount; ++b0_idx) {
const int3 b0 = b0s[b0_idx];
const int3 neighbor = (int3){
b0.x < NGHOST ? -1 : b0.x >= NGHOST + nn.x ? 1 : 0,
b0.y < NGHOST ? -1 : b0.y >= NGHOST + nn.y ? 1 : 0,
b0.z < NGHOST ? -1 : b0.z >= NGHOST + nn.z ? 1 : 0,
};
const int npid = getPid(pid3d - neighbor, decomp);
PackedData* src = &data->srcs[b0_idx];
if (onTheSameNode(pid, npid)) {
cudaStreamSynchronize(data->streams[b0_idx]);
MPI_Isend(src->data, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
}
else {
acPinPackedData(device, data->streams[b0_idx], src);
cudaStreamSynchronize(data->streams[b0_idx]);
MPI_Isend(src->data_pinned, count, datatype, npid, b0_idx, //
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
}
}
return AC_SUCCESS;
}
static void
acTransferCommDataWait(const CommData data)
{
for (size_t i = 0; i < data.count; ++i) {
MPI_Wait(&data.send_reqs[i], MPI_STATUS_IGNORE);
MPI_Wait(&data.recv_reqs[i], MPI_STATUS_IGNORE);
}
}
typedef struct {
Device device;
AcMesh submesh;
uint3_64 decomposition;
bool initialized;
int3 nn;
CommData corner_data;
CommData edgex_data;
CommData edgey_data;
CommData edgez_data;
CommData sidexy_data;
CommData sidexz_data;
CommData sideyz_data;
} Grid;
static Grid grid = {};
AcResult
acGridSynchronizeStream(const Stream stream)
{
ERRCHK(grid.initialized);
acDeviceSynchronizeStream(grid.device, stream);
MPI_Barrier(MPI_COMM_WORLD);
return AC_SUCCESS;
}
AcResult
acGridInit(const AcMeshInfo info)
{
ERRCHK(!grid.initialized);
// Check that MPI is initialized
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);
printf("Processor %s. Process %d of %d.\n", processor_name, pid, nprocs);
// Decompose
AcMeshInfo submesh_info = info;
const uint3_64 decomposition = decompose(nprocs);
const int3 pid3d = getPid3D(pid, decomposition);
printf("Decomposition: %lu, %lu, %lu\n", decomposition.x, decomposition.y, decomposition.z);
printf("Process %d: (%d, %d, %d)\n", pid, pid3d.x, pid3d.y, pid3d.z);
ERRCHK_ALWAYS(info.int_params[AC_nx] % decomposition.x == 0);
ERRCHK_ALWAYS(info.int_params[AC_ny] % decomposition.y == 0);
ERRCHK_ALWAYS(info.int_params[AC_nz] % decomposition.z == 0);
const int submesh_nx = info.int_params[AC_nx] / decomposition.x;
const int submesh_ny = info.int_params[AC_ny] / decomposition.y;
const int submesh_nz = info.int_params[AC_nz] / decomposition.z;
submesh_info.int_params[AC_nx] = submesh_nx;
submesh_info.int_params[AC_ny] = submesh_ny;
submesh_info.int_params[AC_nz] = submesh_nz;
submesh_info.int3_params[AC_global_grid_n] = (int3){
info.int_params[AC_nx],
info.int_params[AC_ny],
info.int_params[AC_nz],
};
submesh_info.int3_params[AC_multigpu_offset] = pid3d *
(int3){submesh_nx, submesh_ny, submesh_nz};
acUpdateBuiltinParams(&submesh_info);
// GPU alloc
int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);
Device device;
acDeviceCreate(pid % devices_per_node, submesh_info, &device);
// CPU alloc
AcMesh submesh;
acMeshCreate(submesh_info, &submesh);
// Setup the global grid structure
grid.device = device;
grid.submesh = submesh;
grid.decomposition = decomposition;
grid.initialized = true;
// Configure
const int3 nn = (int3){
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
};
// Create CommData
// We have 8 corners, 12 edges, and 6 sides
//
// For simplicity's sake all data blocks inside a single CommData struct
// have the same dimensions.
grid.nn = nn;
grid.corner_data = acCreateCommData(device, (int3){NGHOST, NGHOST, NGHOST}, 8);
grid.edgex_data = acCreateCommData(device, (int3){nn.x, NGHOST, NGHOST}, 4);
grid.edgey_data = acCreateCommData(device, (int3){NGHOST, nn.y, NGHOST}, 4);
grid.edgez_data = acCreateCommData(device, (int3){NGHOST, NGHOST, nn.z}, 4);
grid.sidexy_data = acCreateCommData(device, (int3){nn.x, nn.y, NGHOST}, 2);
grid.sidexz_data = acCreateCommData(device, (int3){nn.x, NGHOST, nn.z}, 2);
grid.sideyz_data = acCreateCommData(device, (int3){NGHOST, nn.y, nn.z}, 2);
acGridSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
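/*
Typical multi-GPU driver (an illustrative sketch; error handling is omitted,
`info`, `mesh`, `nsteps` and `dt` are assumed to be set up by the caller, and
`mesh` only needs to hold valid data on rank 0):

    MPI_Init(&argc, &argv);
    acGridInit(info);
    acGridLoadMesh(mesh, STREAM_0);
    acGridPeriodicBoundconds(STREAM_0);
    for (int i = 0; i < nsteps; ++i)
        acGridIntegrate(STREAM_0, dt);
    acGridStoreMesh(STREAM_0, &mesh);
    acGridQuit();
    MPI_Finalize();
*/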
AcResult
acGridQuit(void)
{
ERRCHK(grid.initialized);
acGridSynchronizeStream(STREAM_ALL);
acDestroyCommData(grid.device, &grid.corner_data);
acDestroyCommData(grid.device, &grid.edgex_data);
acDestroyCommData(grid.device, &grid.edgey_data);
acDestroyCommData(grid.device, &grid.edgez_data);
acDestroyCommData(grid.device, &grid.sidexy_data);
acDestroyCommData(grid.device, &grid.sidexz_data);
acDestroyCommData(grid.device, &grid.sideyz_data);
grid.initialized = false;
grid.decomposition = (uint3_64){0, 0, 0};
acMeshDestroy(&grid.submesh);
acDeviceDestroy(grid.device);
acGridSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acGridLoadMesh(const AcMesh host_mesh, const Stream stream)
{
ERRCHK(grid.initialized);
acGridSynchronizeStream(stream);
acDeviceDistributeMeshMPI(host_mesh, grid.decomposition, &grid.submesh);
acDeviceLoadMesh(grid.device, stream, grid.submesh);
return AC_SUCCESS;
}
AcResult
acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
{
ERRCHK(grid.initialized);
acGridSynchronizeStream(stream);
acDeviceStoreMesh(grid.device, stream, &grid.submesh);
acGridSynchronizeStream(stream);
acDeviceGatherMeshMPI(grid.submesh, grid.decomposition, host_mesh);
return AC_SUCCESS;
}
AcResult
acGridIntegrate(const Stream stream, const AcReal dt)
{
ERRCHK(grid.initialized);
// acGridSynchronizeStream(stream);
const Device device = grid.device;
const int3 nn = grid.nn;
//CommData corner_data = grid.corner_data; // Do not rm: required for corners
CommData edgex_data = grid.edgex_data;
CommData edgey_data = grid.edgey_data;
CommData edgez_data = grid.edgez_data;
CommData sidexy_data = grid.sidexy_data;
CommData sidexz_data = grid.sidexz_data;
CommData sideyz_data = grid.sideyz_data;
acDeviceSynchronizeStream(device, stream);
// Corners
/*
// Do not rm: required for corners
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){0, 0, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};
*/
// Edges X
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
(int3){NGHOST, 0, NGHOST + nn.z},
(int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
};
// Edges Y
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
(int3){0, NGHOST, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
};
// Edges Z
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
(int3){0, NGHOST + nn.y, NGHOST},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
};
// Sides XY
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};
// Sides XZ
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};
// Sides YZ
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};
for (int isubstep = 0; isubstep < 3; ++isubstep) {
// acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
acPackCommData(device, edgex_b0s, &edgex_data);
acPackCommData(device, edgey_b0s, &edgey_data);
acPackCommData(device, edgez_b0s, &edgez_data);
acPackCommData(device, sidexy_b0s, &sidexy_data);
acPackCommData(device, sidexz_b0s, &sidexz_data);
acPackCommData(device, sideyz_b0s, &sideyz_data);
//////////// INNER INTEGRATION //////////////
{
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = nn;
acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
}
////////////////////////////////////////////
MPI_Barrier(MPI_COMM_WORLD);
#if MPI_GPUDIRECT_DISABLED
// acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
acTransferCommDataToHost(device, &edgex_data);
acTransferCommDataToHost(device, &edgey_data);
acTransferCommDataToHost(device, &edgez_data);
acTransferCommDataToHost(device, &sidexy_data);
acTransferCommDataToHost(device, &sidexz_data);
acTransferCommDataToHost(device, &sideyz_data);
#endif
// acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
acTransferCommData(device, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_b0s, &sideyz_data);
// acTransferCommDataWait(corner_data); // Do not rm: required for corners
acTransferCommDataWait(edgex_data);
acTransferCommDataWait(edgey_data);
acTransferCommDataWait(edgez_data);
acTransferCommDataWait(sidexy_data);
acTransferCommDataWait(sidexz_data);
acTransferCommDataWait(sideyz_data);
#if MPI_GPUDIRECT_DISABLED
// acTransferCommDataToDevice(device, &corner_data); // Do not rm: required for corners
acTransferCommDataToDevice(device, &edgex_data);
acTransferCommDataToDevice(device, &edgey_data);
acTransferCommDataToDevice(device, &edgez_data);
acTransferCommDataToDevice(device, &sidexy_data);
acTransferCommDataToDevice(device, &sidexz_data);
acTransferCommDataToDevice(device, &sideyz_data);
#endif
// acUnpinCommData(device, &corner_data); // Do not rm: required for corners
acUnpinCommData(device, &edgex_data);
acUnpinCommData(device, &edgey_data);
acUnpinCommData(device, &edgez_data);
acUnpinCommData(device, &sidexy_data);
acUnpinCommData(device, &sidexz_data);
acUnpinCommData(device, &sideyz_data);
// acUnpackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
acUnpackCommData(device, edgex_b0s, &edgex_data);
acUnpackCommData(device, edgey_b0s, &edgey_data);
acUnpackCommData(device, edgez_b0s, &edgez_data);
acUnpackCommData(device, sidexy_b0s, &sidexy_data);
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
//////////// OUTER INTEGRATION //////////////
// Wait for unpacking
// acSyncCommData(corner_data); // Do not rm: required for corners
acSyncCommData(edgex_data);
acSyncCommData(edgey_data);
acSyncCommData(edgez_data);
acSyncCommData(sidexy_data);
acSyncCommData(sidexz_data);
acSyncCommData(sideyz_data);
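// The outer shell (the comp domain minus the inner region integrated above) is
// covered by six non-overlapping slabs of thickness NGHOST: front/back span the
// full xy extent at the z extremes, bottom/top take the remaining y extremes,
// and left/right the remaining x extremes.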
{ // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt);
}
{ // Back
const int3 m1 = (int3){NGHOST, NGHOST, nn.z};
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt);
}
{ // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt);
}
{ // Top
const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt);
}
{ // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt);
}
{ // Right
const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
}
acDeviceSwapBuffers(device);
acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
////////////////////////////////////////////
}
return AC_SUCCESS;
}
AcResult
acGridPeriodicBoundconds(const Stream stream)
{
ERRCHK(grid.initialized);
acGridSynchronizeStream(stream);
const Device device = grid.device;
const int3 nn = grid.nn;
CommData corner_data = grid.corner_data;
CommData edgex_data = grid.edgex_data;
CommData edgey_data = grid.edgey_data;
CommData edgez_data = grid.edgez_data;
CommData sidexy_data = grid.sidexy_data;
CommData sidexz_data = grid.sidexz_data;
CommData sideyz_data = grid.sideyz_data;
// Corners
const int3 corner_b0s[] = {
(int3){0, 0, 0},
(int3){NGHOST + nn.x, 0, 0},
(int3){0, NGHOST + nn.y, 0},
(int3){0, 0, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, 0},
(int3){NGHOST + nn.x, 0, NGHOST + nn.z},
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
};
// Edges X
const int3 edgex_b0s[] = {
(int3){NGHOST, 0, 0},
(int3){NGHOST, NGHOST + nn.y, 0},
(int3){NGHOST, 0, NGHOST + nn.z},
(int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z},
};
// Edges Y
const int3 edgey_b0s[] = {
(int3){0, NGHOST, 0},
(int3){NGHOST + nn.x, NGHOST, 0},
(int3){0, NGHOST, NGHOST + nn.z},
(int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z},
};
// Edges Z
const int3 edgez_b0s[] = {
(int3){0, 0, NGHOST},
(int3){NGHOST + nn.x, 0, NGHOST},
(int3){0, NGHOST + nn.y, NGHOST},
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST},
};
// Sides XY
const int3 sidexy_b0s[] = {
(int3){NGHOST, NGHOST, 0}, //
(int3){NGHOST, NGHOST, NGHOST + nn.z}, //
};
// Sides XZ
const int3 sidexz_b0s[] = {
(int3){NGHOST, 0, NGHOST}, //
(int3){NGHOST, NGHOST + nn.y, NGHOST}, //
};
// Sides YZ
const int3 sideyz_b0s[] = {
(int3){0, NGHOST, NGHOST}, //
(int3){NGHOST + nn.x, NGHOST, NGHOST}, //
};
acPackCommData(device, corner_b0s, &corner_data);
acPackCommData(device, edgex_b0s, &edgex_data);
acPackCommData(device, edgey_b0s, &edgey_data);
acPackCommData(device, edgez_b0s, &edgez_data);
acPackCommData(device, sidexy_b0s, &sidexy_data);
acPackCommData(device, sidexz_b0s, &sidexz_data);
acPackCommData(device, sideyz_b0s, &sideyz_data);
MPI_Barrier(MPI_COMM_WORLD);
#if MPI_GPUDIRECT_DISABLED
acTransferCommDataToHost(device, &corner_data);
acTransferCommDataToHost(device, &edgex_data);
acTransferCommDataToHost(device, &edgey_data);
acTransferCommDataToHost(device, &edgez_data);
acTransferCommDataToHost(device, &sidexy_data);
acTransferCommDataToHost(device, &sidexz_data);
acTransferCommDataToHost(device, &sideyz_data);
#endif
acTransferCommData(device, corner_b0s, &corner_data);
acTransferCommData(device, edgex_b0s, &edgex_data);
acTransferCommData(device, edgey_b0s, &edgey_data);
acTransferCommData(device, edgez_b0s, &edgez_data);
acTransferCommData(device, sidexy_b0s, &sidexy_data);
acTransferCommData(device, sidexz_b0s, &sidexz_data);
acTransferCommData(device, sideyz_b0s, &sideyz_data);
acTransferCommDataWait(corner_data);
acTransferCommDataWait(edgex_data);
acTransferCommDataWait(edgey_data);
acTransferCommDataWait(edgez_data);
acTransferCommDataWait(sidexy_data);
acTransferCommDataWait(sidexz_data);
acTransferCommDataWait(sideyz_data);
#if MPI_GPUDIRECT_DISABLED
acTransferCommDataToDevice(device, &corner_data);
acTransferCommDataToDevice(device, &edgex_data);
acTransferCommDataToDevice(device, &edgey_data);
acTransferCommDataToDevice(device, &edgez_data);
acTransferCommDataToDevice(device, &sidexy_data);
acTransferCommDataToDevice(device, &sidexz_data);
acTransferCommDataToDevice(device, &sideyz_data);
#endif
acUnpinCommData(device, &corner_data);
acUnpinCommData(device, &edgex_data);
acUnpinCommData(device, &edgey_data);
acUnpinCommData(device, &edgez_data);
acUnpinCommData(device, &sidexy_data);
acUnpinCommData(device, &sidexz_data);
acUnpinCommData(device, &sideyz_data);
acUnpackCommData(device, corner_b0s, &corner_data);
acUnpackCommData(device, edgex_b0s, &edgex_data);
acUnpackCommData(device, edgey_b0s, &edgey_data);
acUnpackCommData(device, edgez_b0s, &edgez_data);
acUnpackCommData(device, sidexy_b0s, &sidexy_data);
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
// Wait for unpacking
acSyncCommData(corner_data);
acSyncCommData(edgex_data);
acSyncCommData(edgey_data);
acSyncCommData(edgez_data);
acSyncCommData(sidexy_data);
acSyncCommData(sidexz_data);
acSyncCommData(sideyz_data);
return AC_SUCCESS;
}
static AcResult
acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* result)
{
MPI_Op op;
if (rtype == RTYPE_MAX) {
op = MPI_MAX;
} else if (rtype == RTYPE_MIN) {
op = MPI_MIN;
} else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
op = MPI_SUM;
} else {
ERROR("Unrecognised rtype");
}
#if AC_DOUBLE_PRECISION == 1
MPI_Datatype datatype = MPI_DOUBLE;
#else
MPI_Datatype datatype = MPI_FLOAT;
#endif
/*
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
*/
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
AcReal mpi_res;
MPI_Allreduce(&local_result, &mpi_res, 1, datatype, op, MPI_COMM_WORLD);
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.nn.x * grid.decomposition.x * grid.nn.y *
grid.decomposition.y * grid.nn.z * grid.decomposition.z);
mpi_res = sqrt(inv_n * mpi_res);
}
*result = mpi_res;
return AC_SUCCESS;
}
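// Note: for RTYPE_RMS and RTYPE_RMS_EXP the local kernel result is assumed to
// be a sum of squares over the local computational domain; the per-rank sums
// are combined with MPI_SUM and the final value is sqrt(sum / N), where
// N = nn * decomposition is the global number of computational grid points.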
AcResult
acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
ERRCHK(grid.initialized);
// acGridSynchronizeStream(stream);
const Device device = grid.device;
//const int3 nn = grid.nn;
acGridSynchronizeStream(STREAM_ALL);
MPI_Barrier(MPI_COMM_WORLD);
AcReal local_result;
acDeviceReduceScal(device, stream, rtype, vtxbuf_handle, &local_result);
return acMPIReduceScal(local_result, rtype, result);
}
AcResult
acGridReduceVec(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result)
{
ERRCHK(grid.initialized);
// acGridSynchronizeStream(stream);
const Device device = grid.device;
//const int3 nn = grid.nn;
acGridSynchronizeStream(STREAM_ALL);
MPI_Barrier(MPI_COMM_WORLD);
AcReal local_result;
acDeviceReduceVec(device, stream, rtype, vtxbuf0, vtxbuf1, vtxbuf2, &local_result);
return acMPIReduceScal(local_result, rtype, result);
}
#endif // AC_MPI_ENABLED