From 5a6a3110df31e8bbed2582e52e99d59774a99e6f Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Tue, 3 Dec 2019 15:14:26 +0200
Subject: [PATCH] Reformatted

---
 include/astaroth_defines.h       |  2 --
 src/core/device.cc               | 20 +++++++++++---------
 src/core/kernels/boundconds.cu   |  3 ++-
 src/core/kernels/boundconds.cuh  |  4 ++--
 src/core/kernels/common.cuh      | 13 -------------
 src/core/kernels/integration.cu  |  9 ++++-----
 src/core/kernels/integration.cuh | 10 ++++------
 src/core/kernels/reductions.cu   |  8 ++++----
 src/core/kernels/reductions.cuh  | 13 ++++++-------
 src/core/node.cc                 |  7 ++++---
 10 files changed, 37 insertions(+), 52 deletions(-)

diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h
index 0771d07..f48e079 100644
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -26,7 +26,6 @@ extern "C" {
 #include <stdlib.h> // size_t
 //#include <vector_types.h> // CUDA vector types (float4, etc)
 
-//#ifndef __CUDACC__
 #if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
 #include <cuda_runtime_api.h>
 #else
@@ -46,7 +45,6 @@ typedef struct {
     double x, y, z;
 } double3;
 #endif
-//#endif // __CUDACC__
 
 // Library flags
 #define STENCIL_ORDER (6)
diff --git a/src/core/device.cc b/src/core/device.cc
index c33bde2..beb1687 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -26,8 +26,8 @@
  */
 #include "astaroth_device.h"
 
-#include "math_utils.h"
 #include "errchk.h"
+#include "math_utils.h"
 
 #include "kernels/common.cuh"
 
@@ -105,8 +105,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
     }
 
     // Reductions
-    ERRCHK_CUDA_ALWAYS(
-        cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
+    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_scratchpad,
+                                  acVertexBufferCompdomainSizeBytes(device_config)));
     ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
 
 #if AC_MPI_ENABLED
@@ -242,7 +242,7 @@ acDeviceAutoOptimize(const Device device)
 {
     cudaSetDevice(device->id);
     const int3 start = (int3){NGHOST, NGHOST, NGHOST};
-    const int3 end = (int3){device->local_config.int_params[AC_mx], //
+    const int3 end   = (int3){device->local_config.int_params[AC_mx], //
                             device->local_config.int_params[AC_my], //
                             device->local_config.int_params[AC_mz]};
     return acKernelAutoOptimizeIntegration(start, end, device->vba);
@@ -528,7 +528,8 @@ acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
                               const int3 end)
 {
     cudaSetDevice(device->id);
-    return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]);
+    return acKernelPeriodicBoundconds(device->streams[stream], start, end,
+                                      device->vba.in[vtxbuf_handle]);
 }
 
 AcResult
@@ -555,8 +556,9 @@ acDeviceReduceScal(const Device device, const Stream stream, const ReductionType
                             device->local_config.int_params[AC_ny_max],
                             device->local_config.int_params[AC_nz_max]};
 
-    *result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
-                          device->reduce_scratchpad, device->reduce_result);
+    *result = acKernelReduceScal(device->streams[stream], rtype, start, end,
+                                 device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
+                                 device->reduce_result);
     return AC_SUCCESS;
 }
 
@@ -576,8 +578,8 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
                             device->local_config.int_params[AC_nz_max]};
 
     *result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
-                         device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
-                         device->reduce_scratchpad, device->reduce_result);
+                                device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
+                                device->reduce_scratchpad, device->reduce_result);
     return AC_SUCCESS;
 }
 
diff --git a/src/core/kernels/boundconds.cu b/src/core/kernels/boundconds.cu
index a001c6d..23c24a2 100644
--- a/src/core/kernels/boundconds.cu
+++ b/src/core/kernels/boundconds.cu
@@ -76,7 +76,8 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
 }
 
 AcResult
-acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
+acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
+                           AcReal* vtxbuf)
 {
     const dim3 tpb(8, 2, 8);
     const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
diff --git a/src/core/kernels/boundconds.cuh b/src/core/kernels/boundconds.cuh
index 2516000..6a43b2c 100644
--- a/src/core/kernels/boundconds.cuh
+++ b/src/core/kernels/boundconds.cuh
@@ -28,5 +28,5 @@
 
 #include "astaroth.h"
 
-AcResult
-acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf);
+AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
+                                    AcReal* vtxbuf);
diff --git a/src/core/kernels/common.cuh b/src/core/kernels/common.cuh
index 382e9b2..c17e39b 100644
--- a/src/core/kernels/common.cuh
+++ b/src/core/kernels/common.cuh
@@ -72,9 +72,6 @@ DCONST(const VertexBufferHandle handle)
 #define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
 //#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
 
-
-
-
 static __device__ constexpr int
 IDX(const int i)
 {
@@ -93,19 +90,9 @@ IDX(const int3 idx)
     return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
 }
 
-
-
-
-
-
-
-
 //#include <thrust/complex.h>
 // using namespace thrust;
 
-
-
-
 #include <cuComplex.h>
 #if AC_DOUBLE_PRECISION == 1
 typedef cuDoubleComplex acComplex;
diff --git a/src/core/kernels/integration.cu b/src/core/kernels/integration.cu
index 2f029b5..dec502c 100644
--- a/src/core/kernels/integration.cu
+++ b/src/core/kernels/integration.cu
@@ -128,11 +128,11 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
     if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)                      \
         return;                                                                                    \
                                                                                                    \
-    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&           \
-           vertexIdx.z < DCONST(AC_nz_max));                                                   \
+    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&                   \
+           vertexIdx.z < DCONST(AC_nz_max));                                                       \
                                                                                                    \
-    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&         \
-           vertexIdx.z >= DCONST(AC_nz_min));                                                  \
+    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&                 \
+           vertexIdx.z >= DCONST(AC_nz_min));                                                      \
                                                                                                    \
     const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
 
@@ -293,4 +293,3 @@ acKernelDummy(void)
     ERRCHK_CUDA_KERNEL_ALWAYS();
     return AC_SUCCESS;
 }
-
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index b103b46..1889145 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -26,11 +26,9 @@
  */
 #pragma once
 
-AcResult
-acKernelDummy(void);
+AcResult acKernelDummy(void);
 
-AcResult
-acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
+AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
 
-AcResult
-acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba);
+AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
+                                  const int3 start, const int3 end, VertexBufferArray vba);
diff --git a/src/core/kernels/reductions.cu b/src/core/kernels/reductions.cu
index 6d497f5..71c2fac 100644
--- a/src/core/kernels/reductions.cu
+++ b/src/core/kernels/reductions.cu
@@ -174,7 +174,7 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
 
 AcReal
 acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
-            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
+                   const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
 {
     const unsigned nx        = end.x - start.x;
     const unsigned ny        = end.y - start.y;
@@ -227,9 +227,9 @@ acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const i
 }
 
 AcReal
-acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
-           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
-           AcReal* reduce_result)
+acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+                  const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
+                  const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
 {
     const unsigned nx        = end.x - start.x;
     const unsigned ny        = end.y - start.y;
diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh
index 410fcdf..e095dcf 100644
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -27,11 +27,10 @@
 #pragma once
 #include <astaroth.h>
 
-AcReal
-acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
-            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result);
+AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+                          const int3& end, const AcReal* vtxbuf, AcReal* scratchpad,
+                          AcReal* reduce_result);
 
-AcReal
-acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
-           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
-           AcReal* reduce_result);
+AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+                         const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
+                         const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
diff --git a/src/core/node.cc b/src/core/node.cc
index 51778ca..8c0b19f 100644
--- a/src/core/node.cc
+++ b/src/core/node.cc
@@ -547,13 +547,14 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
     for (int i = 0; i < node->num_devices; ++i) {
         // OLD: ambiguous behaviour, transferred also halos between devices and assumed
         // that halos are in sync
-        //const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
-        //const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
+        // const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
+        // const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
 
         // New: Transfer ghost zones, but do not transfer overlapping halos.
         // DECOMPOSITION OFFSET HERE (d0 & d1)
         int3 d0 = (int3){0, 0, NGHOST + i * node->subgrid.n.z};
-        int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, NGHOST + (i + 1) * node->subgrid.n.z};
+        int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y,
+                         NGHOST + (i + 1) * node->subgrid.n.z};
         if (i == 0)
             d0.z = 0;
         if (i == node->num_devices - 1)