Fixed ambiguous logic in acNodeStoreVertexBufferWithOffset: halos of arbitrary GPUs no longer overwrite valid data from the computational domain of a neighboring GPU. Also temporarily disabled peer-to-peer transfers until there is a clean way to avoid cudaErrorPeerAccessAlreadyEnabled errors.

This commit is contained in:
jpekkila
2019-12-02 12:58:09 +02:00
parent 0178d4788c
commit 8bffb2a1d0

View File

@@ -275,6 +275,7 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
acDevicePrintInfo(node->devices[i]); acDevicePrintInfo(node->devices[i]);
} }
/*
// Enable peer access // Enable peer access
// #pragma omp parallel for // #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { for (int i = 0; i < node->num_devices; ++i) {
@@ -292,12 +293,13 @@ acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
cudaSetDevice(i); cudaSetDevice(i);
if (can_access_front) { if (can_access_front) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
} }
if (can_access_back) { if (can_access_back) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
} }
} }
*/
acNodeSynchronizeStream(node, STREAM_ALL); acNodeSynchronizeStream(node, STREAM_ALL);
*node_handle = node; *node_handle = node;
@@ -308,7 +310,7 @@ AcResult
acNodeDestroy(Node node) acNodeDestroy(Node node)
{ {
acNodeSynchronizeStream(node, STREAM_ALL); acNodeSynchronizeStream(node, STREAM_ALL);
/*
// Disable peer access // Disable peer access
for (int i = 0; i < node->num_devices; ++i) { for (int i = 0; i < node->num_devices; ++i) {
const int front = (i + 1) % node->num_devices; const int front = (i + 1) % node->num_devices;
@@ -325,12 +327,13 @@ acNodeDestroy(Node node)
cudaSetDevice(i); cudaSetDevice(i);
if (can_access_front) { if (can_access_front) {
ERRCHK_CUDA_ALWAYS(cudaDeviceDisablePeerAccess(front)); WARNCHK_CUDA_ALWAYS(cudaDeviceDisablePeerAccess(front));
} }
if (can_access_back) { if (can_access_back) {
ERRCHK_CUDA_ALWAYS(cudaDeviceDisablePeerAccess(back)); WARNCHK_CUDA_ALWAYS(cudaDeviceDisablePeerAccess(back));
} }
} }
*/
// #pragma omp parallel for // #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { for (int i = 0; i < node->num_devices; ++i) {
@@ -542,8 +545,19 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
acNodeSynchronizeStream(node, stream); acNodeSynchronizeStream(node, stream);
// #pragma omp parallel for // #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE // OLD: ambiguous behaviour, transferred also halos between devices and assumed
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; // that halos are in sync
//const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
//const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
// New: Transfer ghost zones, but do not transfer overlapping halos.
// DECOMPOSITION OFFSET HERE (d0 & d1)
int3 d0 = (int3){0, 0, NGHOST + i * node->subgrid.n.z};
int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, NGHOST + (i + 1) * node->subgrid.n.z};
if (i == 0)
d0.z = 0;
if (i == node->num_devices - 1)
d1.z = NGHOST + (i + 1) * node->subgrid.n.z + NGHOST;
const int3 s0 = src; // TODO fix const int3 s0 = src; // TODO fix
(void)dst; // TODO fix (void)dst; // TODO fix