From 5a6a3110df31e8bbed2582e52e99d59774a99e6f Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 3 Dec 2019 15:14:26 +0200 Subject: [PATCH] Reformatted --- include/astaroth_defines.h | 2 -- src/core/device.cc | 20 +++++++++++--------- src/core/kernels/boundconds.cu | 3 ++- src/core/kernels/boundconds.cuh | 4 ++-- src/core/kernels/common.cuh | 13 ------------- src/core/kernels/integration.cu | 9 ++++----- src/core/kernels/integration.cuh | 10 ++++------ src/core/kernels/reductions.cu | 8 ++++---- src/core/kernels/reductions.cuh | 13 ++++++------- src/core/node.cc | 7 ++++--- 10 files changed, 37 insertions(+), 52 deletions(-) diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h index 0771d07..f48e079 100644 --- a/include/astaroth_defines.h +++ b/include/astaroth_defines.h @@ -26,7 +26,6 @@ extern "C" { #include // size_t //#include // CUDA vector types (float4, etc) -//#ifndef __CUDACC__ #if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__) #include #else @@ -46,7 +45,6 @@ typedef struct { double x, y, z; } double3; #endif -//#endif // __CUDACC__ // Library flags #define STENCIL_ORDER (6) diff --git a/src/core/device.cc b/src/core/device.cc index c33bde2..beb1687 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -26,8 +26,8 @@ */ #include "astaroth_device.h" -#include "math_utils.h" #include "errchk.h" +#include "math_utils.h" #include "kernels/common.cuh" @@ -105,8 +105,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand } // Reductions - ERRCHK_CUDA_ALWAYS( - cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config))); + ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_scratchpad, + acVertexBufferCompdomainSizeBytes(device_config))); ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal))); #if AC_MPI_ENABLED @@ -242,7 +242,7 @@ acDeviceAutoOptimize(const Device device) { cudaSetDevice(device->id); const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = (int3){device->local_config.int_params[AC_mx], // + const int3 end = (int3){device->local_config.int_params[AC_mx], // device->local_config.int_params[AC_my], // device->local_config.int_params[AC_mz]}; return acKernelAutoOptimizeIntegration(start, end, device->vba); @@ -528,7 +528,8 @@ acDevicePeriodicBoundcondStep(const Device device, const Stream stream, const int3 end) { cudaSetDevice(device->id); - return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]); + return acKernelPeriodicBoundconds(device->streams[stream], start, end, + device->vba.in[vtxbuf_handle]); } AcResult @@ -555,8 +556,9 @@ acDeviceReduceScal(const Device device, const Stream stream, const ReductionType device->local_config.int_params[AC_ny_max], device->local_config.int_params[AC_nz_max]}; - *result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle], - device->reduce_scratchpad, device->reduce_result); + *result = acKernelReduceScal(device->streams[stream], rtype, start, end, + device->vba.in[vtxbuf_handle], device->reduce_scratchpad, + device->reduce_result); return AC_SUCCESS; } @@ -576,8 +578,8 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType device->local_config.int_params[AC_nz_max]}; *result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0], - device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], - device->reduce_scratchpad, device->reduce_result); + device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], + device->reduce_scratchpad, device->reduce_result); return AC_SUCCESS; } diff --git a/src/core/kernels/boundconds.cu b/src/core/kernels/boundconds.cu index a001c6d..23c24a2 100644 --- a/src/core/kernels/boundconds.cu +++ b/src/core/kernels/boundconds.cu @@ -76,7 +76,8 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf) } AcResult -acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf) +acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, + AcReal* vtxbuf) { const dim3 tpb(8, 2, 8); const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh index 2516000..6a43b2c 100644 --- a/src/core/kernels/boundconds.cuh +++ b/src/core/kernels/boundconds.cuh @@ -28,5 +28,5 @@ #include "astaroth.h" -AcResult -acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf); +AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, + AcReal* vtxbuf); diff --git a/src/core/kernels/common.cuh b/src/core/kernels/common.cuh index 382e9b2..c17e39b 100644 --- a/src/core/kernels/common.cuh +++ b/src/core/kernels/common.cuh @@ -72,9 +72,6 @@ DCONST(const VertexBufferHandle handle) #define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset]) //#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder - - - static __device__ constexpr int IDX(const int i) { @@ -93,19 +90,9 @@ IDX(const int3 idx) return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z); } - - - - - - - //#include // using namespace thrust; - - - #include #if AC_DOUBLE_PRECISION == 1 typedef cuDoubleComplex acComplex; diff --git a/src/core/kernels/integration.cu b/src/core/kernels/integration.cu index 2f029b5..dec502c 100644 --- a/src/core/kernels/integration.cu +++ b/src/core/kernels/integration.cu @@ -128,11 +128,11 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle) if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \ return; \ \ - assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \ - vertexIdx.z < DCONST(AC_nz_max)); \ + assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \ + vertexIdx.z < DCONST(AC_nz_max)); \ \ - assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \ - vertexIdx.z >= DCONST(AC_nz_min)); \ + assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \ + vertexIdx.z >= DCONST(AC_nz_min)); \ \ const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z); @@ -293,4 +293,3 @@ acKernelDummy(void) ERRCHK_CUDA_KERNEL_ALWAYS(); return AC_SUCCESS; } - diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh index b103b46..1889145 100644 --- a/src/core/kernels/integration.cuh +++ b/src/core/kernels/integration.cuh @@ -26,11 +26,9 @@ */ #pragma once -AcResult -acKernelDummy(void); +AcResult acKernelDummy(void); -AcResult -acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba); +AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba); -AcResult -acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba); +AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, + const int3 start, const int3 end, VertexBufferArray vba); diff --git a/src/core/kernels/reductions.cu b/src/core/kernels/reductions.cu index 6d497f5..71c2fac 100644 --- a/src/core/kernels/reductions.cu +++ b/src/core/kernels/reductions.cu @@ -174,7 +174,7 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks, AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start, - const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result) + const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result) { const unsigned nx = end.x - start.x; const unsigned ny = end.y - start.y; @@ -227,9 +227,9 @@ acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const i } AcReal -acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end, - const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad, - AcReal* reduce_result) +acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, + const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1, + const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result) { const unsigned nx = end.x - start.x; const unsigned ny = end.y - start.y; diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh index 410fcdf..e095dcf 100644 --- a/src/core/kernels/reductions.cuh +++ b/src/core/kernels/reductions.cuh @@ -27,11 +27,10 @@ #pragma once #include -AcReal -acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start, - const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result); +AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start, + const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, + AcReal* reduce_result); -AcReal -acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end, - const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad, - AcReal* reduce_result); +AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, + const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1, + const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result); diff --git a/src/core/node.cc b/src/core/node.cc index 51778ca..8c0b19f 100644 --- a/src/core/node.cc +++ b/src/core/node.cc @@ -547,13 +547,14 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream, for (int i = 0; i < node->num_devices; ++i) { // OLD: ambiguous behaviour, transferred also halos between devices and assumed // that halos are in sync - //const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE - //const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; + // const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE + // const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; // New: Transfer ghost zones, but do not transfer overlapping halos. // DECOMPOSITION OFFSET HERE (d0 & d1) int3 d0 = (int3){0, 0, NGHOST + i * node->subgrid.n.z}; - int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, NGHOST + (i + 1) * node->subgrid.n.z}; + int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, + NGHOST + (i + 1) * node->subgrid.n.z}; if (i == 0) d0.z = 0; if (i == node->num_devices - 1)