From 0d1c5b3911f502c3fe8a7d91b344a81921a2d822 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 24 Jun 2020 15:56:30 +0300
Subject: [PATCH] Autoformatted

---
 samples/bwtest/main.c           | 47 ++++++++++++--------
 src/core/device.cc              | 21 ++++-----
 src/core/kernels/kernels.cu     | 25 ++++++-----
 src/core/kernels/reductions.cuh |  5 ++-
 src/utils/modelsolver.c         | 78 ++++++++++++++++++++++-----------
 5 files changed, 108 insertions(+), 68 deletions(-)

diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c
index 9dd25d9..35d98d5 100644
--- a/samples/bwtest/main.c
+++ b/samples/bwtest/main.c
@@ -6,15 +6,21 @@
 
 #include <mpi.h>
 
-#include <cuda_runtime_api.h>
 #include <cuda.h> // CUDA driver API
+#include <cuda_runtime_api.h>
 
 #include "timer_hires.h" // From src/common
 
 //#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
 #define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
 
-#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }}
+#define errchk(x)                                                                                  \
+    {                                                                                              \
+        if (!(x)) {                                                                                \
+            fprintf(stderr, "errchk(%s) failed", #x);                                              \
+            assert(x);                                                                             \
+        }                                                                                          \
+    }
 
 /*
   Findings:
@@ -57,17 +63,18 @@ allocDevice(const size_t bytes)
 static uint8_t*
 allocDevicePinned(const size_t bytes)
 {
-    #define USE_CUDA_DRIVER_PINNING (1)
-    #if USE_CUDA_DRIVER_PINNING
+#define USE_CUDA_DRIVER_PINNING (1)
+#if USE_CUDA_DRIVER_PINNING
     uint8_t* arr = allocDevice(bytes);
 
     unsigned int flag = 1;
-    CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)arr);
+    CUresult retval   = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                            (CUdeviceptr)arr);
 
     errchk(retval == CUDA_SUCCESS);
     return arr;
 
-    #else
+#else
     uint8_t* arr;
     // Standard (20 GiB/s internode, 85 GiB/s intranode)
     // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
@@ -77,7 +84,7 @@ allocDevicePinned(const size_t bytes)
     const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
     errchk(retval == cudaSuccess);
     return arr;
-    #endif
+#endif
 }
 
 /*
@@ -267,7 +274,6 @@ send_h2d(uint8_t* src, uint8_t* dst)
     cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
 }
 
-
 static void
 sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
 {
@@ -327,10 +333,10 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
     MPI_Barrier(MPI_COMM_WORLD);
 }
 
-
 static void
-measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
-                                                                                                            uint8_t* hsrc, uint8_t* ddst)
+measurebw2(const char* msg, const size_t bytes,
+           void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
+           uint8_t* hsrc, uint8_t* ddst)
 {
     const size_t num_samples = 100;
 
@@ -414,8 +420,8 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Host)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Host)", //
-                  2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
-        //measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        // measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
         //          2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
 
         freeHost(src);
@@ -434,11 +440,12 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Device)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
-        //measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+        // measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
         //          2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
+                  dst);
 
         freeDevice(src);
         freeDevice(dst);
@@ -456,7 +463,7 @@ main(void)
         measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
                   2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
         measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
-                  2 *  (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
+                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
 
         freeDevice(src);
         freeDevice(dst);
@@ -472,7 +479,8 @@ main(void)
         measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
         measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
 
-        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
+        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
+                   ddst);
 
         freeDevice(dsrc);
         freeDevice(ddst);
@@ -490,7 +498,8 @@ main(void)
         measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
         measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
 
-        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
+        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
+                   hsrc, ddst);
 
         freeDevice(dsrc);
         freeDevice(ddst);
diff --git a/src/core/device.cc b/src/core/device.cc
index 181e802..8cda677 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -16,7 +16,7 @@
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
 #define MPI_INCL_CORNERS (0)
-#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory
+#define MPI_USE_PINNED (1)              // Do inter-node comm with pinned memory
 #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
 
 #include <cuda.h> // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set)
@@ -656,17 +656,18 @@ acCreatePackedData(const int3 dims)
     const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
     ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
 
-    #if MPI_USE_CUDA_DRIVER_PINNING
-      ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
+#if MPI_USE_CUDA_DRIVER_PINNING
+    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
 
-      unsigned int flag = 1;
-      CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned);
-      ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
-    #else
+    unsigned int flag = 1;
+    CUresult retval   = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                            (CUdeviceptr)data.data_pinned);
+    ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
+#else
     ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
-    // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
-    // slower than pinned (38 ms vs. 125 ms)
-    #endif // USE_CUDA_DRIVER_PINNING
+// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
+// slower than pinned (38 ms vs. 125 ms)
+#endif // USE_CUDA_DRIVER_PINNING
 
     return data;
 }
diff --git a/src/core/kernels/kernels.cu b/src/core/kernels/kernels.cu
index 3c59486..2878f38 100644
--- a/src/core/kernels/kernels.cu
+++ b/src/core/kernels/kernels.cu
@@ -75,17 +75,20 @@ exp(const acComplex& val)
 {
     return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
 }
-static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
+static __device__ inline acComplex
+operator*(const AcReal& a, const acComplex& b)
 {
     return (acComplex){a * b.x, a * b.y};
 }
 
-static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
+static __device__ inline acComplex
+operator*(const acComplex& b, const AcReal& a)
 {
     return (acComplex){a * b.x, a * b.y};
 }
 
-static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
+static __device__ inline acComplex
+operator*(const acComplex& a, const acComplex& b)
 {
     return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
 }
@@ -116,7 +119,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal
 
     const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
     ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
-                                               cudaMemcpyHostToDevice, device->streams[stream]));
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
@@ -141,7 +144,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal
 
     const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
     ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
-                                               cudaMemcpyHostToDevice, device->streams[stream]));
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
@@ -165,7 +168,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara
 
     const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
     ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
-                                               cudaMemcpyHostToDevice, device->streams[stream]));
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
     return AC_SUCCESS;
 }
 
@@ -179,10 +182,10 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
     }
 
     if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) {
-        fprintf(
-            stderr,
-            "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. Skipping.\n",
-            value.x, value.y, value.z, int3param_names[param]);
+        fprintf(stderr,
+                "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. "
+                "Skipping.\n",
+                value.x, value.y, value.z, int3param_names[param]);
         return AC_FAILURE;
     }
 
@@ -229,7 +232,7 @@ acDeviceLoadDefaultUniforms(const Device device)
 {
     cudaSetDevice(device->id);
 
-    // clang-format off
+// clang-format off
     // Scalar
     #define LOAD_DEFAULT_UNIFORM(X) acDeviceLoadScalarUniform(device, STREAM_DEFAULT, X, X##_DEFAULT_VALUE);
     AC_FOR_USER_REAL_PARAM_TYPES(LOAD_DEFAULT_UNIFORM)
diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh
index 8877e7e..1f40df3 100644
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -92,8 +92,9 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr
     assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
     assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
 
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
-        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)],
+                                                                   src1[IDX(src_idx)],
+                                                                   src2[IDX(src_idx)]);
 }
 
 template <ReduceFunc reduce>
diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c
index d9446eb..4c2edcf 100644
--- a/src/utils/modelsolver.c
+++ b/src/utils/modelsolver.c
@@ -103,11 +103,16 @@ first_derivative(const Scalar* pencil, const Scalar inv_ds)
 #elif STENCIL_ORDER == 4
     const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)};
 #elif STENCIL_ORDER == 6
-    const Scalar coefficients[] = {0, (Scalar)(3.0 / 4.0), (Scalar)(-3.0 / 20.0),
-                                   (Scalar)(1.0 / 60.0)};
+    const Scalar coefficients[] = {
+        0,
+        (Scalar)(3.0 / 4.0),
+        (Scalar)(-3.0 / 20.0),
+        (Scalar)(1.0 / 60.0),
+    };
 #elif STENCIL_ORDER == 8
-    const Scalar coefficients[] = {0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0),
-                                   (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0)};
+    const Scalar coefficients[] = {
+        0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0),
+    };
 #endif
 
 #define MID (STENCIL_ORDER / 2)
@@ -126,15 +131,23 @@ second_derivative(const Scalar* pencil, const Scalar inv_ds)
 #if STENCIL_ORDER == 2
     const Scalar coefficients[] = {-2, 1};
 #elif STENCIL_ORDER == 4
-    const Scalar coefficients[] = {(Scalar)(-5.0 / 2.0), (Scalar)(4.0 / 3.0),
-                                   (Scalar)(-1.0 / 12.0)};
+    const Scalar coefficients[] = {
+        (Scalar)(-5.0 / 2.0),
+        (Scalar)(4.0 / 3.0),
+        (Scalar)(-1.0 / 12.0),
+    };
 #elif STENCIL_ORDER == 6
-    const Scalar coefficients[] = {(Scalar)(-49.0 / 18.0), (Scalar)(3.0 / 2.0),
-                                   (Scalar)(-3.0 / 20.0), (Scalar)(1.0 / 90.0)};
+    const Scalar coefficients[] = {
+        (Scalar)(-49.0 / 18.0),
+        (Scalar)(3.0 / 2.0),
+        (Scalar)(-3.0 / 20.0),
+        (Scalar)(1.0 / 90.0),
+    };
 #elif STENCIL_ORDER == 8
-    const Scalar coefficients[] = {(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0),
-                                   (Scalar)(-1.0 / 5.0), (Scalar)(8.0 / 315.0),
-                                   (Scalar)(-1.0 / 560.0)};
+    const Scalar coefficients[] = {
+        (Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0),    (Scalar)(-1.0 / 5.0),
+        (Scalar)(8.0 / 315.0),   (Scalar)(-1.0 / 560.0),
+    };
 #endif
 
 #define MID (STENCIL_ORDER / 2)
@@ -156,16 +169,27 @@ cross_derivative(const Scalar* pencil_a, const Scalar* pencil_b, const Scalar in
     const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)};
 #elif STENCIL_ORDER == 4
     const Scalar coefficients[] = {
-        0, (Scalar)(1.0 / 32.0),
-        (Scalar)(1.0 / 64.0)}; // TODO correct coefficients, these are just placeholders
+        0,
+        (Scalar)(1.0 / 32.0),
+        (Scalar)(1.0 / 64.0),
+    }; // TODO correct coefficients, these are just placeholders
 #elif STENCIL_ORDER == 6
     const Scalar fac            = ((Scalar)(1. / 720.));
-    const Scalar coefficients[] = {0 * fac, (Scalar)(270.0) * fac, (Scalar)(-27.0) * fac,
-                                   (Scalar)(2.0) * fac};
+    const Scalar coefficients[] = {
+        0 * fac,
+        (Scalar)(270.0) * fac,
+        (Scalar)(-27.0) * fac,
+        (Scalar)(2.0) * fac,
+    };
 #elif STENCIL_ORDER == 8
     const Scalar fac            = ((Scalar)(1. / 20160.));
-    const Scalar coefficients[] = {0 * fac, (Scalar)(8064.) * fac, (Scalar)(-1008.) * fac,
-                                   (Scalar)(128.) * fac, (Scalar)(-9.) * fac};
+    const Scalar coefficients[] = {
+        0 * fac,
+        (Scalar)(8064.) * fac,
+        (Scalar)(-1008.) * fac,
+        (Scalar)(128.) * fac,
+        (Scalar)(-9.) * fac,
+    };
 #endif
 
 #define MID (STENCIL_ORDER / 2)
@@ -207,14 +231,14 @@ derxy(const int i, const int j, const int k, const Scalar* arr)
     Scalar pencil_a[STENCIL_ORDER + 1];
     //#pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + offset - STENCIL_ORDER / 2,
-                                   k)];
+        pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
+                                   j + offset - STENCIL_ORDER / 2, k)];
 
     Scalar pencil_b[STENCIL_ORDER + 1];
     //#pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + STENCIL_ORDER / 2 - offset,
-                                   k)];
+        pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
+                                   j + STENCIL_ORDER / 2 - offset, k)];
 
     return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy));
 }
@@ -539,7 +563,8 @@ gradient_of_divergence(const VectorData vec)
     return (Vector){
         hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2],
         hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2],
-        hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2]};
+        hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2],
+    };
 }
 
 // Takes uu gradients and returns S
@@ -805,10 +830,11 @@ forcing(int3 globalVertexIdx, Scalar dt)
                                        getInt(AC_ny) * getReal(AC_dsy),
                                        getInt(AC_nz) * getReal(AC_dsz)}; // source (origin)
     (void)a;                                                             // WARNING: not used
-    Vector xx        = (Vector){(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
-                         (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
-                         (globalVertexIdx.z - getInt(AC_nz_min)) *
-                             getReal(AC_dsz)}; // sink (current index)
+    Vector xx = (Vector){
+        (globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
+        (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
+        (globalVertexIdx.z - getInt(AC_nz_min)) * getReal(AC_dsz),
+    }; // sink (current index)
     const Scalar cs2 = getReal(AC_cs2_sound);
     const Scalar cs  = sqrt(cs2);