diff --git a/src/core/astaroth.cc b/src/core/astaroth.cc
index 3a38240..1c8d0f7 100644
--- a/src/core/astaroth.cc
+++ b/src/core/astaroth.cc
@@ -126,6 +126,15 @@ acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexB
     return result;
 }
 
+AcReal
+acReduceVecScal(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b,
+                const VertexBufferHandle c, const VertexBufferHandle d)
+{
+    AcReal result;
+    acNodeReduceVecScal(nodes[0], STREAM_DEFAULT, rtype, a, b, c, d, &result);
+    return result;
+}
+
 AcResult
 acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh)
 {
diff --git a/src/core/device.cc b/src/core/device.cc
index 3e5515a..a6ec793 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -474,6 +474,28 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
     return AC_SUCCESS;
 }
 
+AcResult
+acDeviceReduceVecScal(const Device device, const Stream stream, const ReductionType rtype,
+                      const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                      const VertexBufferHandle vtxbuf2, const VertexBufferHandle vtxbuf3, AcReal* result)
+{
+    cudaSetDevice(device->id);
+
+    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
+                              device->local_config.int_params[AC_ny_min],
+                              device->local_config.int_params[AC_nz_min]};
+
+    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
+                            device->local_config.int_params[AC_ny_max],
+                            device->local_config.int_params[AC_nz_max]};
+
+    *result = acKernelReduceVecScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
+                                    device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], device->vba.in[vtxbuf3],
+                                    device->reduce_scratchpad, device->reduce_result);
+    return AC_SUCCESS;
+}
+
+
 #if AC_MPI_ENABLED
 /**
  Quick overview of the MPI implementation:
@@ -2047,4 +2069,7 @@ acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBuff
     return acMPIReduceScal(local_result, rtype, result);
 }
 
+
+// TODO(MV): implement acGridReduceVecScal() for the MPI path (needed e.g. for Alfven speeds).
+
 #endif // AC_MPI_ENABLED
diff --git a/src/core/node.cc b/src/core/node.cc
index 7a0ca03..70c3115 100644
--- a/src/core/node.cc
+++ b/src/core/node.cc
@@ -797,13 +797,13 @@ simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcRe
 {
     AcReal res = results[0];
     for (int i = 1; i < n; ++i) {
-        if (rtype == RTYPE_MAX) {
+        if (rtype == RTYPE_MAX || rtype == RTYPE_ALFVEN_MAX) {
             res = max(res, results[i]);
         }
-        else if (rtype == RTYPE_MIN) {
+        else if (rtype == RTYPE_MIN || rtype == RTYPE_ALFVEN_MIN) {
             res = min(res, results[i]);
         }
-        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
+        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM || rtype == RTYPE_ALFVEN_RMS) {
             res = sum(res, results[i]);
         }
         else {
@@ -811,7 +811,7 @@
         }
     }
 
-    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_ALFVEN_RMS) {
         const AcReal inv_n = AcReal(1.) / (node->grid.n.x * node->grid.n.y * node->grid.n.z);
         res = sqrt(inv_n * res);
     }
@@ -850,3 +850,21 @@ acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype,
     *result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
     return AC_SUCCESS;
 }
+
+AcResult
+acNodeReduceVecScal(const Node node, const Stream stream, const ReductionType rtype,
+                    const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c,
+                    const VertexBufferHandle d, AcReal* result)
+{
+    acNodeSynchronizeStream(node, STREAM_ALL);
+
+    AcReal results[node->num_devices];
+    // #pragma omp parallel for
+    for (int i = 0; i < node->num_devices; ++i) {
+        acDeviceReduceVecScal(node->devices[i], stream, rtype, a, b, c, d, &results[i]);
+    }
+
+    *result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
+    return AC_SUCCESS;
+}
+