/* Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. This file is part of Astaroth. Astaroth is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Astaroth is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ #include "astaroth_node.h" #include "astaroth_device.h" #include "errchk.h" #include "math_utils.h" // sum for reductions static const int MAX_NUM_DEVICES = 32; typedef struct { int3 m; int3 n; } Grid; struct node_s { int id; int num_devices; Device devices[MAX_NUM_DEVICES]; Grid grid; Grid subgrid; AcMeshInfo config; }; static int gridIdx(const Grid grid, const int3 idx) { return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y; } static int3 gridIdx3d(const Grid grid, const int idx) { return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x, idx / (grid.m.x * grid.m.y)}; } static void printInt3(const int3 vec) { printf("(%d, %d, %d)", vec.x, vec.y, vec.z); } static inline void print(const AcMeshInfo config) { for (int i = 0; i < NUM_INT_PARAMS; ++i) printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); for (int i = 0; i < NUM_REAL_PARAMS; ++i) printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i])); } static void update_builtin_params(AcMeshInfo* config) { config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER; ///////////// PAD TEST // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; ///////////// PAD TEST config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; // Bounds for the computational domain, i.e. nx_min <= i < nx_max config->int_params[AC_nx_min] = NGHOST; config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx]; config->int_params[AC_ny_min] = NGHOST; config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST; config->int_params[AC_nz_min] = NGHOST; config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST; /* Additional helper params */ // Int helpers config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny]; config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; } static Grid createGrid(const AcMeshInfo config) { Grid grid; grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]}; grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]}; return grid; } AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle) { struct node_s* node = (struct node_s*)malloc(sizeof(*node)); node->id = id; node->config = node_config; // Get node->num_devices ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&node->num_devices)); if (node->num_devices < 1) { ERROR("No CUDA devices found!"); return AC_FAILURE; } if (node->num_devices > MAX_NUM_DEVICES) { WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES"); node->num_devices = MAX_NUM_DEVICES; } if (!AC_MULTIGPU_ENABLED) { WARNING("MULTIGPU_ENABLED was false. Using only one device"); node->num_devices = 1; // Use only one device if multi-GPU is not enabled } // Check that node->num_devices is divisible with AC_nz. This makes decomposing the // problem domain to multiple GPUs much easier since we do not have to worry // about remainders ERRCHK_ALWAYS(node->config.int_params[AC_nz] % node->num_devices == 0); // Decompose the problem domain // The main grid node->grid = createGrid(node->config); // Subgrids AcMeshInfo subgrid_config = node->config; subgrid_config.int_params[AC_nz] /= node->num_devices; update_builtin_params(&subgrid_config); #if VERBOSE_PRINTING // Defined in astaroth.h printf("###############################################################\n"); printf("Config dimensions recalculated:\n"); print(subgrid_config); printf("###############################################################\n"); #endif node->subgrid = createGrid(subgrid_config); // Periodic boundary conditions become weird if the system can "fold unto itself". ERRCHK_ALWAYS(node->subgrid.n.x >= STENCIL_ORDER); ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER); ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER); #if VERBOSE_PRINTING // clang-format off printf("Grid m "); printInt3(node->grid.m); printf("\n"); printf("Grid n "); printInt3(node->grid.n); printf("\n"); printf("Subrid m "); printInt3(node->subgrid.m); printf("\n"); printf("Subrid n "); printInt3(node->subgrid.n); printf("\n"); // clang-format on #endif // Initialize the devices for (int i = 0; i < node->num_devices; ++i) { const int3 multinode_offset = (int3){0, 0, 0}; // Placeholder const int3 multigpu_offset = (int3){0, 0, i * node->subgrid.n.z}; subgrid_config.int3_params[AC_global_grid_n] = node->grid.n; subgrid_config.int3_params[AC_multigpu_offset] = multinode_offset + multigpu_offset; acDeviceCreate(i, subgrid_config, &node->devices[i]); acDevicePrintInfo(node->devices[i]); } // Enable peer access for (int i = 0; i < node->num_devices; ++i) { const int front = (i + 1) % node->num_devices; const int back = (i - 1 + node->num_devices) % node->num_devices; int can_access_front, can_access_back; cudaDeviceCanAccessPeer(&can_access_front, i, front); cudaDeviceCanAccessPeer(&can_access_back, i, back); #if VERBOSE_PRINTING printf( "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n", i, front, can_access_front, back, can_access_back); #endif cudaSetDevice(i); if (can_access_front) { ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); } if (can_access_back) { ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); } } acNodeSynchronizeStream(node, STREAM_ALL); *node_handle = node; return AC_SUCCESS; } AcResult acNodeDestroy(Node node) { acNodeSynchronizeStream(node, STREAM_ALL); for (int i = 0; i < node->num_devices; ++i) { acDeviceDestroy(node->devices[i]); } free(node); return AC_SUCCESS; } AcResult acNodePrintInfo(const Node node) { (void)node; WARNING("Not implemented"); return AC_FAILURE; } AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config) { (void)node; (void)config; WARNING("Not implemented"); return AC_FAILURE; } AcResult acNodeAutoOptimize(const Node node) { (void)node; WARNING("Not implemented"); return AC_FAILURE; } AcResult acNodeSynchronizeStream(const Node node, const Stream stream) { for (int i = 0; i < node->num_devices; ++i) { acDeviceSynchronizeStream(node->devices[i], stream); } return AC_SUCCESS; } AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) { acNodeSynchronizeStream(node, stream); // Exchanges the halos of subgrids // After this step, the data within the main grid ranging from // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z // has been synchronized and transferred to appropriate subgrids // We loop only to node->num_devices - 1 since the front and back plate of the grid is not // transferred because their contents depend on the boundary conditions. // IMPORTANT NOTE: the boundary conditions must be applied before // callingacNodeSynchronizeStream(node, this function! I.e. the halos of subgrids must contain // up-to-date data! const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST; for (int i = 0; i < node->num_devices - 1; ++i) { // ...|ooooxxx|... -> xxx|ooooooo|... const int3 src = (int3){0, 0, node->subgrid.n.z}; const int3 dst = (int3){0, 0, 0}; const Device src_device = node->devices[i]; Device dst_device = node->devices[i + 1]; acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); } for (int i = 1; i < node->num_devices; ++i) { // ...|ooooooo|xxx <- ...|xxxoooo|... const int3 src = (int3){0, 0, NGHOST}; const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z}; const Device src_device = node->devices[i]; Device dst_device = node->devices[i - 1]; acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); } return AC_SUCCESS; } AcResult acNodeSynchronizeMesh(const Node node, const Stream stream) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acNodeSynchronizeVertexBuffer(node, stream, (VertexBufferHandle)i); } return AC_SUCCESS; } AcResult acNodeSwapBuffers(const Node node) { for (int i = 0; i < node->num_devices; ++i) { acDeviceSwapBuffers(node->devices[i]); } return AC_SUCCESS; } AcResult acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param, const AcReal value) { acNodeSynchronizeStream(node, stream); for (int i = 0; i < node->num_devices; ++i) { acDeviceLoadConstant(node->devices[i], stream, param, value); } return AC_SUCCESS; } AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices) { acNodeSynchronizeStream(node, stream); // See the beginning of the file for an explanation of the index mapping // #pragma omp parallel for for (int i = 0; i < node->num_devices; ++i) { const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; const int3 s0 = src; // dst; // TODO fix (void)dst; // TODO fix const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices); const int3 da = max(s0, d0); const int3 db = min(s1, d1); /* printf("Device %d\n", i); printf("\ts0: "); printInt3(s0); printf("\n"); printf("\td0: "); printInt3(d0); printf("\n"); printf("\tda: "); printInt3(da); printf("\n"); printf("\tdb: "); printInt3(db); printf("\n"); printf("\td1: "); printInt3(d1); printf("\n"); printf("\ts1: "); printInt3(s1); printf("\n"); printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i); */ if (db.z >= da.z) { const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da); // DECOMPOSITION OFFSET HERE const int3 da_global = da; // src + da - dst; // TODO fix const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices}; // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); // printf("\n"); acDeviceLoadVertexBufferWithOffset(node->devices[i], stream, host_mesh, vtxbuf_handle, da_global, da_local, copy_cells); } // printf("\n"); } return AC_SUCCESS; } AcResult acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh, const int3 src, const int3 dst, const int num_vertices) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, (VertexBufferHandle)i, src, dst, num_vertices); } return AC_SUCCESS; } AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh, const VertexBufferHandle vtxbuf_handle) { const int3 src = (int3){0, 0, 0}; const int3 dst = src; const size_t num_vertices = acVertexBufferSize(host_mesh.info); acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, vtxbuf_handle, src, dst, num_vertices); return AC_SUCCESS; } AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acNodeLoadVertexBuffer(node, stream, host_mesh, (VertexBufferHandle)i); } return AC_SUCCESS; } AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle, const int3 src, const int3 dst, const int num_vertices, AcMesh* host_mesh) { acNodeSynchronizeStream(node, stream); for (int i = 0; i < node->num_devices; ++i) { const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; const int3 s0 = src; // TODO fix (void)dst; // TODO fix const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices); const int3 da = max(s0, d0); const int3 db = min(s1, d1); if (db.z >= da.z) { const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da); // DECOMPOSITION OFFSET HERE const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices}; const int3 da_global = da; // dst + da - src; // TODO fix acDeviceStoreVertexBufferWithOffset(node->devices[i], stream, vtxbuf_handle, da_local, da_global, copy_cells, host_mesh); } } return AC_SUCCESS; } AcResult acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst, const int num_vertices, AcMesh* host_mesh) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acNodeStoreVertexBufferWithOffset(node, stream, (VertexBufferHandle)i, src, dst, num_vertices, host_mesh); } return AC_SUCCESS; } AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh) { const int3 src = (int3){0, 0, 0}; const int3 dst = src; const size_t num_vertices = acVertexBufferSize(host_mesh->info); acNodeStoreVertexBufferWithOffset(node, stream, vtxbuf_handle, src, dst, num_vertices, host_mesh); return AC_SUCCESS; } AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acNodeStoreVertexBuffer(node, stream, (VertexBufferHandle)i, host_mesh); } return AC_SUCCESS; } AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, const int3 start, const int3 end, const AcReal dt) { acNodeSynchronizeStream(node, stream); for (int i = 0; i < node->num_devices; ++i) { // DECOMPOSITION OFFSET HERE const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z}; const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z}; const int3 da = max(start, d0); const int3 db = min(end, d1); if (db.z >= da.z) { const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z}; const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z}; acDeviceIntegrateSubstep(node->devices[i], stream, isubstep, da_local, db_local, dt); } } return AC_SUCCESS; } static AcResult local_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf) { acNodeSynchronizeStream(node, stream); if (node->num_devices > 1) { // Local boundary conditions for (int i = 0; i < node->num_devices; ++i) { const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z}; acDevicePeriodicBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1); } } else { acDevicePeriodicBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0}, node->subgrid.m); } return AC_SUCCESS; } static AcResult global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) { acNodeSynchronizeStream(node, stream); if (node->num_devices > 1) { const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST; { // ...|ooooxxx|... -> xxx|ooooooo|... const int3 src = (int3){0, 0, node->subgrid.n.z}; const int3 dst = (int3){0, 0, 0}; const Device src_device = node->devices[node->num_devices - 1]; Device dst_device = node->devices[0]; acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); } { // ...|ooooooo|xxx <- ...|xxxoooo|... const int3 src = (int3){0, 0, NGHOST}; const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z}; const Device src_device = node->devices[0]; Device dst_device = node->devices[node->num_devices - 1]; acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst, num_vertices, dst_device); } } return AC_SUCCESS; } AcResult acNodeIntegrate(const Node node, const AcReal dt) { acNodeSynchronizeStream(node, STREAM_ALL); // xxx|OOO OOOOOOOOO OOO|xxx // ^ ^ ^ ^ // n0 n1 n2 n3 const int3 n0 = (int3){NGHOST, NGHOST, NGHOST}; const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; const int3 n2 = node->grid.n; const int3 n3 = n0 + node->grid.n; for (int isubstep = 0; isubstep < 3; ++isubstep) { acNodeSynchronizeStream(node, STREAM_ALL); for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { local_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf); } acNodeSynchronizeStream(node, STREAM_ALL); // Inner inner for (int i = 0; i < node->num_devices; ++i) { const int3 m1 = n1 + (int3){0, 0, i * node->subgrid.n.z}; const int3 m2 = m1 + node->subgrid.n - (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; acNodeIntegrateSubstep(node, STREAM_16, isubstep, m1, m2, dt); } for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) { const int num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST; for (int device_id = 0; device_id < node->num_devices; ++device_id) { // ...|ooooxxx|... -> xxx|ooooooo|... { const int3 src = (int3){0, 0, node->subgrid.n.z}; const int3 dst = (int3){0, 0, 0}; acDeviceTransferVertexBufferWithOffset( node->devices[device_id], (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, src, dst, num_vertices, node->devices[(device_id + 1) % node->num_devices]); } // ...|ooooooo|xxx <- ...|xxxoooo|... { const int3 src = (int3){0, 0, NGHOST}; const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z}; acDeviceTransferVertexBufferWithOffset( node->devices[device_id], (Stream)vtxbuf, (VertexBufferHandle)vtxbuf, src, dst, num_vertices, node->devices[(device_id - 1 + node->num_devices) % node->num_devices]); } } } for (int vtxbuf = 0; vtxbuf < 2 * NUM_VTXBUF_HANDLES; ++vtxbuf) { acNodeSynchronizeStream(node, (Stream)vtxbuf); } // Inner outer for (int i = 0; i < node->num_devices - 1; ++i) { const int3 m1 = n1 + (int3){0, 0, (i + 1) * node->subgrid.n.z - 2 * NGHOST}; const int3 m2 = m1 + (int3){node->subgrid.n.x - 2 * NGHOST, node->subgrid.n.y - 2 * NGHOST, 2 * NGHOST}; acNodeIntegrateSubstep(node, STREAM_0, isubstep, m1, m2, dt); } // Outer // Front { const int3 m1 = (int3){n0.x, n0.y, n0.z}; const int3 m2 = (int3){n3.x, n3.y, n1.z}; acNodeIntegrateSubstep(node, STREAM_1, isubstep, m1, m2, dt); } // Back { const int3 m1 = (int3){n0.x, n0.y, n2.z}; const int3 m2 = (int3){n3.x, n3.y, n3.z}; acNodeIntegrateSubstep(node, STREAM_2, isubstep, m1, m2, dt); } // Top { const int3 m1 = (int3){n0.x, n0.y, n1.z}; const int3 m2 = (int3){n3.x, n1.y, n2.z}; acNodeIntegrateSubstep(node, STREAM_3, isubstep, m1, m2, dt); } // Bottom { const int3 m1 = (int3){n0.x, n2.y, n1.z}; const int3 m2 = (int3){n3.x, n3.y, n2.z}; acNodeIntegrateSubstep(node, STREAM_4, isubstep, m1, m2, dt); } // Left { const int3 m1 = (int3){n0.x, n1.y, n1.z}; const int3 m2 = (int3){n1.x, n2.y, n2.z}; acNodeIntegrateSubstep(node, STREAM_5, isubstep, m1, m2, dt); } // Right { const int3 m1 = (int3){n2.x, n1.y, n1.z}; const int3 m2 = (int3){n3.x, n2.y, n2.z}; acNodeIntegrateSubstep(node, STREAM_6, isubstep, m1, m2, dt); } acNodeSwapBuffers(node); } acNodeSynchronizeStream(node, STREAM_ALL); return AC_SUCCESS; } AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle) { local_boundcondstep(node, stream, vtxbuf_handle); acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle); // TODO NOTE GLOBAL BOUNDCONDS NOT DONE HERE IF MORE THAN 1 NODE global_boundcondstep(node, stream, vtxbuf_handle); // WARNING("Global boundconds should not be done here with multinode"); return AC_SUCCESS; } AcResult acNodePeriodicBoundconds(const Node node, const Stream stream) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i); } return AC_SUCCESS; } static AcReal simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results, const int& n) { AcReal res = results[0]; for (int i = 1; i < n; ++i) { if (rtype == RTYPE_MAX) { res = max(res, results[i]); } else if (rtype == RTYPE_MIN) { res = min(res, results[i]); } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) { res = sum(res, results[i]); } else { ERROR("Invalid rtype"); } } if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { const AcReal inv_n = AcReal(1.) / (node->grid.n.x * node->grid.n.y * node->grid.n.z); res = sqrt(inv_n * res); } return res; } AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result) { acNodeSynchronizeStream(node, STREAM_ALL); AcReal results[node->num_devices]; for (int i = 0; i < node->num_devices; ++i) { acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]); } *result = simple_final_reduce_scal(node, rtype, results, node->num_devices); return AC_SUCCESS; } AcResult acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c, AcReal* result) { acNodeSynchronizeStream(node, STREAM_ALL); AcReal results[node->num_devices]; for (int i = 0; i < node->num_devices; ++i) { acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]); } *result = simple_final_reduce_scal(node, rtype, results, node->num_devices); return AC_SUCCESS; }