MPI transfer for the first corner with 3D blocking now complete. Disabled/enabled some error checking for development
This commit is contained in:
@@ -47,8 +47,17 @@ struct device_s {
|
|||||||
|
|
||||||
#if AC_MPI_ENABLED
|
#if AC_MPI_ENABLED
|
||||||
// Declare memory for buffers needed for packed data transfers here
|
// Declare memory for buffers needed for packed data transfers here
|
||||||
|
#define CORNER_000 (0)
|
||||||
|
#define CORNER_100 (1)
|
||||||
|
#define CORNER_010 (2)
|
||||||
|
#define CORNER_110 (3)
|
||||||
|
#define CORNER_001 (4)
|
||||||
|
#define CORNER_101 (5)
|
||||||
|
#define CORNER_011 (6)
|
||||||
|
#define CORNER_111 (7)
|
||||||
|
#define NUM_CORNERS (8)
|
||||||
int3 corner_dims;
|
int3 corner_dims;
|
||||||
AcReal* packed_corners[NUM_VTXBUF_HANDLES][8];
|
AcReal* packed_corners[NUM_VTXBUF_HANDLES][NUM_CORNERS];
|
||||||
|
|
||||||
#define EDGE_X (0) // left to right
|
#define EDGE_X (0) // left to right
|
||||||
#define EDGE_Y (1) // bottom to top
|
#define EDGE_Y (1) // bottom to top
|
||||||
@@ -68,8 +77,9 @@ struct device_s {
|
|||||||
#define EDGE_100A (9)
|
#define EDGE_100A (9)
|
||||||
#define EDGE_010A (10)
|
#define EDGE_010A (10)
|
||||||
#define EDGE_110A (11)
|
#define EDGE_110A (11)
|
||||||
|
#define NUM_EDGES (12)
|
||||||
int3 edge_dims[3];
|
int3 edge_dims[3];
|
||||||
AcReal* packed_edges[NUM_VTXBUF_HANDLES][12];
|
AcReal* packed_edges[NUM_VTXBUF_HANDLES][NUM_EDGES];
|
||||||
|
|
||||||
#define SIDE_XY (0) // Front/back
|
#define SIDE_XY (0) // Front/back
|
||||||
#define SIDE_XZ (1) // Top/bottom
|
#define SIDE_XZ (1) // Top/bottom
|
||||||
@@ -83,8 +93,9 @@ struct device_s {
|
|||||||
|
|
||||||
#define SIDE_LEFT (4)
|
#define SIDE_LEFT (4)
|
||||||
#define SIDE_RIGHT (5)
|
#define SIDE_RIGHT (5)
|
||||||
|
#define NUM_SIDES (6)
|
||||||
int3 side_dims[3];
|
int3 side_dims[3];
|
||||||
AcReal* packed_sides[NUM_VTXBUF_HANDLES][6];
|
AcReal* packed_sides[NUM_VTXBUF_HANDLES][NUM_SIDES];
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -150,7 +161,9 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
|
|||||||
device->corner_dims = (int3){NGHOST, NGHOST, NGHOST};
|
device->corner_dims = (int3){NGHOST, NGHOST, NGHOST};
|
||||||
const size_t corner_bytes = device->corner_dims.x * device->corner_dims.y *
|
const size_t corner_bytes = device->corner_dims.x * device->corner_dims.y *
|
||||||
device->corner_dims.z * sizeof(AcReal);
|
device->corner_dims.z * sizeof(AcReal);
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_corners, corner_bytes));
|
|
||||||
|
for (int j = 0; j < NUM_CORNERS; ++j)
|
||||||
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_corners[i][j], corner_bytes));
|
||||||
}
|
}
|
||||||
// Edges
|
// Edges
|
||||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||||
@@ -167,19 +180,19 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
|
|||||||
device->edge_dims[EDGE_Y] = (int3){NGHOST, device_config.int_params[AC_ny], NGHOST};
|
device->edge_dims[EDGE_Y] = (int3){NGHOST, device_config.int_params[AC_ny], NGHOST};
|
||||||
const size_t edge_bytes = device->edge_dims[EDGE_Y].x * device->edge_dims[EDGE_Y].y *
|
const size_t edge_bytes = device->edge_dims[EDGE_Y].x * device->edge_dims[EDGE_Y].y *
|
||||||
device->edge_dims[EDGE_Y].z * sizeof(AcReal);
|
device->edge_dims[EDGE_Y].z * sizeof(AcReal);
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][4], edge_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_000U], edge_bytes));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][5], edge_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_100U], edge_bytes));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][6], edge_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_001U], edge_bytes));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][7], edge_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_101U], edge_bytes));
|
||||||
}
|
}
|
||||||
{ // front-back
|
{ // front-back
|
||||||
device->edge_dims[EDGE_Z] = (int3){NGHOST, NGHOST, device_config.int_params[AC_nz]};
|
device->edge_dims[EDGE_Z] = (int3){NGHOST, NGHOST, device_config.int_params[AC_nz]};
|
||||||
const size_t edge_bytes = device->edge_dims[EDGE_Z].x * device->edge_dims[EDGE_Z].y *
|
const size_t edge_bytes = device->edge_dims[EDGE_Z].x * device->edge_dims[EDGE_Z].y *
|
||||||
device->edge_dims[EDGE_Z].z * sizeof(AcReal);
|
device->edge_dims[EDGE_Z].z * sizeof(AcReal);
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][8], edge_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_000A], edge_bytes));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][9], edge_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_100A], edge_bytes));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][10], edge_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_010A], edge_bytes));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][11], edge_bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->packed_edges[i][EDGE_110A], edge_bytes));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -251,13 +264,13 @@ acDeviceDestroy(Device device)
|
|||||||
#if AC_MPI_ENABLED
|
#if AC_MPI_ENABLED
|
||||||
// Free data required for packed tranfers here (cudaFree)
|
// Free data required for packed tranfers here (cudaFree)
|
||||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||||
for (int j = 0; j < 8; ++j)
|
for (int j = 0; j < NUM_CORNERS; ++j)
|
||||||
cudaFree(device->packed_corners[i][j]);
|
cudaFree(device->packed_corners[i][j]);
|
||||||
|
|
||||||
for (int j = 0; j < 12; ++j)
|
for (int j = 0; j < NUM_EDGES; ++j)
|
||||||
cudaFree(device->packed_edges[i][j]);
|
cudaFree(device->packed_edges[i][j]);
|
||||||
|
|
||||||
for (int j = 0; j < 6; ++j)
|
for (int j = 0; j < NUM_SIDES; ++j)
|
||||||
cudaFree(device->packed_sides[i][j]);
|
cudaFree(device->packed_sides[i][j]);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@@ -1214,11 +1227,92 @@ acDeviceGatherMeshMPI(const AcMesh src, const int3 decomposition, AcMesh* dst)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include "kernels/packing.cuh"
|
||||||
|
|
||||||
static AcResult
|
static AcResult
|
||||||
acDeviceCommunicateHalos(const Device device)
|
acDeviceCommunicateHalos(const Device device)
|
||||||
{
|
{
|
||||||
acDeviceSynchronizeStream(device, STREAM_ALL);
|
acDeviceSynchronizeStream(device, STREAM_ALL);
|
||||||
|
|
||||||
|
MPI_Datatype datatype = MPI_FLOAT;
|
||||||
|
if (sizeof(AcReal) == 8)
|
||||||
|
datatype = MPI_DOUBLE;
|
||||||
|
|
||||||
|
MPI_Request send_corner_req[NUM_VTXBUF_HANDLES][NUM_CORNERS];
|
||||||
|
MPI_Request recv_corner_req[NUM_VTXBUF_HANDLES][NUM_CORNERS];
|
||||||
|
|
||||||
|
int pid, nprocs;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
|
||||||
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
|
const int3 decomposition = decompose(nprocs);
|
||||||
|
int3 pid3d = getPid3D(pid, decomposition);
|
||||||
|
|
||||||
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||||
|
// Recv corners
|
||||||
|
{
|
||||||
|
/// From 000
|
||||||
|
const int recv_pid = getPid1D(pid3d + (int3){1, 1, 1}, decomposition);
|
||||||
|
const size_t count = device->corner_dims.x * //
|
||||||
|
device->corner_dims.y * //
|
||||||
|
device->corner_dims.z;
|
||||||
|
MPI_Irecv(&device->packed_corners[i][CORNER_111], count, datatype, recv_pid, CORNER_000,
|
||||||
|
MPI_COMM_WORLD, &(recv_corner_req[i][CORNER_111]));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
/// 100
|
||||||
|
} {
|
||||||
|
/// 010
|
||||||
|
} { /// 110
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||||
|
// Pack data and send corners
|
||||||
|
{ /// 000
|
||||||
|
const int send_pid = getPid1D(pid3d + (int3){-1, -1, -1}, decomposition);
|
||||||
|
const size_t count = device->corner_dims.x * //
|
||||||
|
device->corner_dims.y * //
|
||||||
|
device->corner_dims.z;
|
||||||
|
acKernelPackData(device->streams[STREAM_DEFAULT], device->vba.in[i],
|
||||||
|
(int3){NGHOST, NGHOST, NGHOST}, device->corner_dims,
|
||||||
|
device->packed_corners[i][CORNER_000]);
|
||||||
|
MPI_Isend(&device->packed_corners[i][CORNER_000], count, datatype, send_pid, CORNER_000,
|
||||||
|
MPI_COMM_WORLD, &(send_corner_req[i][CORNER_000]));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
/// 100
|
||||||
|
} {
|
||||||
|
/// 010
|
||||||
|
} { /// 110
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||||
|
// Unpack data
|
||||||
|
{
|
||||||
|
/// 000
|
||||||
|
const int nx = device->local_config.int_params[AC_nx];
|
||||||
|
const int ny = device->local_config.int_params[AC_ny];
|
||||||
|
const int nz = device->local_config.int_params[AC_nz];
|
||||||
|
MPI_Wait(&(recv_corner_req[i][CORNER_111]), MPI_STATUS_IGNORE);
|
||||||
|
acKernelUnpackData(device->streams[STREAM_DEFAULT],
|
||||||
|
device->packed_corners[i][CORNER_111], device->corner_dims,
|
||||||
|
(int3){nx, ny, nz}, device->vba.in[i]);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
/// 100
|
||||||
|
} {
|
||||||
|
/// 010
|
||||||
|
} { /// 110
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||||
|
// MPI_Waitall(NUM_CORNERS, send_corner_req[i], MPI_STATUSES_IGNORE);
|
||||||
|
|
||||||
|
// for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
|
||||||
|
// MPI_Wait(&(send_corner_req[i][CORNER_000]), MPI_STATUS_IGNORE);
|
||||||
|
|
||||||
// TODO
|
// TODO
|
||||||
WARNING("acDeviceCommunicateHalos not yet implemented. Tests will fail (bounds must be "
|
WARNING("acDeviceCommunicateHalos not yet implemented. Tests will fail (bounds must be "
|
||||||
"up-to-date before calling acDeviceGatherMeshMPI)");
|
"up-to-date before calling acDeviceGatherMeshMPI)");
|
||||||
@@ -1334,7 +1428,7 @@ acDeviceRunMPITest(void)
|
|||||||
// Attempt to enable peer access with all neighbors in the node
|
// Attempt to enable peer access with all neighbors in the node
|
||||||
for (int i = 0; i < devices_per_node; ++i) {
|
for (int i = 0; i < devices_per_node; ++i) {
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(i, 0));
|
// WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(i, 0)); // TODO RE-ENABLE
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
// Attempt to enable peer access to the most expensive neighbors (sides)
|
// Attempt to enable peer access to the most expensive neighbors (sides)
|
||||||
|
@@ -27,6 +27,7 @@
|
|||||||
#include "packing.cuh"
|
#include "packing.cuh"
|
||||||
|
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
|
#include "src/core/errchk.h"
|
||||||
|
|
||||||
__global__ void
|
__global__ void
|
||||||
kernel_pack_data(const AcReal* unpacked, const int3 unpacked_start, const int3 packed_dimensions,
|
kernel_pack_data(const AcReal* unpacked, const int3 unpacked_start, const int3 packed_dimensions,
|
||||||
@@ -49,7 +50,8 @@ kernel_pack_data(const AcReal* unpacked, const int3 unpacked_start, const int3 p
|
|||||||
const int k_unpacked = k_packed + unpacked_start.z;
|
const int k_unpacked = k_packed + unpacked_start.z;
|
||||||
|
|
||||||
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
|
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
|
||||||
const int packed_idx = i_packed + j_packed * packed_dimensions.x +
|
const int packed_idx = i_packed + //
|
||||||
|
j_packed * packed_dimensions.x + //
|
||||||
k_packed * packed_dimensions.x * packed_dimensions.y;
|
k_packed * packed_dimensions.x * packed_dimensions.y;
|
||||||
|
|
||||||
packed[packed_idx] = unpacked[unpacked_idx];
|
packed[packed_idx] = unpacked[unpacked_idx];
|
||||||
@@ -76,13 +78,14 @@ kernel_unpack_data(const AcReal* packed, const int3 packed_dimensions, const int
|
|||||||
const int k_unpacked = k_packed + unpacked_start.z;
|
const int k_unpacked = k_packed + unpacked_start.z;
|
||||||
|
|
||||||
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
|
const int unpacked_idx = DEVICE_VTXBUF_IDX(i_unpacked, j_unpacked, k_unpacked);
|
||||||
const int packed_idx = i_packed + j_packed * packed_dimensions.x +
|
const int packed_idx = i_packed + //
|
||||||
|
j_packed * packed_dimensions.x + //
|
||||||
k_packed * packed_dimensions.x * packed_dimensions.y;
|
k_packed * packed_dimensions.x * packed_dimensions.y;
|
||||||
|
|
||||||
unpacked[unpacked_idx] = packed[packed_idx];
|
unpacked[unpacked_idx] = packed[packed_idx];
|
||||||
}
|
}
|
||||||
|
|
||||||
static AcResult
|
AcResult
|
||||||
acKernelPackData(const cudaStream_t stream, const AcReal* unpacked, const int3 unpacked_start,
|
acKernelPackData(const cudaStream_t stream, const AcReal* unpacked, const int3 unpacked_start,
|
||||||
const int3 packed_dimensions, AcReal* packed)
|
const int3 packed_dimensions, AcReal* packed)
|
||||||
{
|
{
|
||||||
@@ -92,11 +95,12 @@ acKernelPackData(const cudaStream_t stream, const AcReal* unpacked, const int3 u
|
|||||||
(unsigned int)ceil(packed_dimensions.z / (float)tpb.z));
|
(unsigned int)ceil(packed_dimensions.z / (float)tpb.z));
|
||||||
|
|
||||||
kernel_pack_data<<<bpg, tpb, 0, stream>>>(unpacked, unpacked_start, packed_dimensions, packed);
|
kernel_pack_data<<<bpg, tpb, 0, stream>>>(unpacked, unpacked_start, packed_dimensions, packed);
|
||||||
|
ERRCHK_CUDA_KERNEL_ALWAYS(); // TODO SET W/ DEBUG ONLY
|
||||||
|
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AcResult
|
AcResult
|
||||||
acKernelUnpackData(const cudaStream_t stream, const AcReal* packed, const int3 packed_dimensions,
|
acKernelUnpackData(const cudaStream_t stream, const AcReal* packed, const int3 packed_dimensions,
|
||||||
const int3 unpacked_start, AcReal* unpacked)
|
const int3 unpacked_start, AcReal* unpacked)
|
||||||
{
|
{
|
||||||
@@ -107,7 +111,7 @@ acKernelUnpackData(const cudaStream_t stream, const AcReal* packed, const int3 p
|
|||||||
|
|
||||||
kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, packed_dimensions, unpacked_start,
|
kernel_unpack_data<<<bpg, tpb, 0, stream>>>(packed, packed_dimensions, unpacked_start,
|
||||||
unpacked);
|
unpacked);
|
||||||
|
ERRCHK_CUDA_KERNEL_ALWAYS(); // TODO SET W/ DEBUG ONLY
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -27,6 +27,13 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include "astaroth.h"
|
#include "astaroth.h"
|
||||||
|
|
||||||
|
AcResult acKernelPackData(const cudaStream_t stream, const AcReal* unpacked,
|
||||||
|
const int3 unpacked_start, const int3 packed_dimensions, AcReal* packed);
|
||||||
|
|
||||||
|
AcResult acKernelUnpackData(const cudaStream_t stream, const AcReal* packed,
|
||||||
|
const int3 packed_dimensions, const int3 unpacked_start,
|
||||||
|
AcReal* unpacked);
|
||||||
|
|
||||||
AcResult acKernelPackCorner(void);
|
AcResult acKernelPackCorner(void);
|
||||||
AcResult acKernelUnpackCorner(void);
|
AcResult acKernelUnpackCorner(void);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user