Autoformatted all CUDA/C/C++ code

2019-06-18 16:42:56 +03:00
parent 6fdc4cddb2
commit 8864266042
12 changed files with 1053 additions and 1111 deletions
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -316,9 +316,13 @@ traverse(const ASTNode* node)
            if (symbol_table[i].type_qualifier == IN) {
                printf("const %sData %s = READ(%s%s);\n", translate(symbol_table[i].type_specifier),
                       symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
-            } else if (symbol_table[i].type_qualifier == OUT) {
-                printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
-                //printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
+            }
+            else if (symbol_table[i].type_qualifier == OUT) {
+                printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier),
+                       symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
+                // printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n",
+                // translate(symbol_table[i].type_specifier), symbol_table[i].identifier,
+                // inout_name_prefix, symbol_table[i].identifier);
            }
        }
    }
@@ -326,8 +330,7 @@ traverse(const ASTNode* node)
    // Preprocessed parameter boilerplate
    if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED)
        inside_preprocessed = true;
-    static const char
-        preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
+    static const char preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
    if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
        printf("%s ", preprocessed_parameter_boilerplate);
    // BOILERPLATE END////////////////////////////////////////////////////////
@@ -343,7 +346,6 @@ traverse(const ASTNode* node)
    if (node->type == NODE_FUNCTION_DECLARATION)
        inside_function_declaration = false;

-
    // If the node is a subscript expression and the expression list inside it is not empty
    if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
        printf("IDX(");
@@ -400,8 +402,10 @@ traverse(const ASTNode* node)
    if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) {
        for (int i = 0; i < num_symbols; ++i) {
            if (symbol_table[i].type_qualifier == OUT) {
-                printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
-                //printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
+                printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier,
+                       symbol_table[i].identifier);
+                // printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n",
+                // inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
            }
        }
    }
@@ -486,8 +490,8 @@ generate_preprocessed_structures(void)

    for (int i = 0; i < num_symbols; ++i) {
        if (symbol_table[i].type_qualifier == PREPROCESSED)
-            printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n", symbol_table[i].identifier,
-                   symbol_table[i].identifier);
+            printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n",
+                   symbol_table[i].identifier, symbol_table[i].identifier);
    }
    printf("return data;\n");
    printf("}\n");
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -41,7 +41,6 @@ extern "C" {
 #include <stdlib.h>       // size_t
 #include <vector_types.h> // CUDA vector types (float4, etc)

-
 /*
 * =============================================================================
 * Flags for auto-optimization
@@ -59,7 +58,6 @@ extern "C" {
 #define NUM_ITERATIONS (10)
 #define WARP_SIZE (32)

-
 /*
 * =============================================================================
 * Compile-time constants used during simulation (user definable)
@@ -75,7 +73,8 @@ extern "C" {

 // L-prefix inherited from the old Astaroth, no idea what it means
 // MV: L means a Logical switch variale, something having true of false value.
-#define LFORCING (0) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL)
+// Note: forcing is disabled currently in the files generated by acc (compiler of our DSL)
+#define LFORCING (0)
 #define LINDUCTION (1)
 #define LENTROPY (1)
 #define LTEMPERATURE (0)
@@ -258,28 +257,16 @@ typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
 * Reduction types
 * =============================================================================
 */
-typedef enum {
-    RTYPE_MAX,
-    RTYPE_MIN,
-    RTYPE_RMS,
-    RTYPE_RMS_EXP,
-    NUM_REDUCTION_TYPES
-} ReductionType;
+typedef enum { RTYPE_MAX, RTYPE_MIN, RTYPE_RMS, RTYPE_RMS_EXP, NUM_REDUCTION_TYPES } ReductionType;

 /*
 * =============================================================================
 * Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH)
 * =============================================================================
 */
-typedef enum {
-    AC_FOR_INT_PARAM_TYPES(AC_GEN_ID),
-    NUM_INT_PARAM_TYPES
-} AcIntParam;
+typedef enum { AC_FOR_INT_PARAM_TYPES(AC_GEN_ID), NUM_INT_PARAM_TYPES } AcIntParam;

-typedef enum {
-    AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID),
-    NUM_REAL_PARAM_TYPES
-} AcRealParam;
+typedef enum { AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID), NUM_REAL_PARAM_TYPES } AcRealParam;

 extern const char* intparam_names[];  // Defined in astaroth.cu
 extern const char* realparam_names[]; // Defined in astaroth.cu
@@ -294,9 +281,7 @@ typedef struct {
 * Definitions for the enums and structs for AcMesh (DO NOT TOUCH)
 * =============================================================================
 */
-typedef enum {
-    AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES
-} VertexBufferHandle;
+typedef enum { AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES } VertexBufferHandle;

 extern const char* vtxbuf_names[]; // Defined in astaroth.cu

@@ -320,12 +305,10 @@ typedef struct {
    ((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] *                          \
              mesh_info.int_params[AC_mz]))

-#define AC_VTXBUF_SIZE_BYTES(mesh_info)                                        \
-    (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
+#define AC_VTXBUF_SIZE_BYTES(mesh_info) (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))

 #define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info)                                                       \
-    (mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] *               \
-     mesh_info.int_params[AC_nz])
+    (mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * mesh_info.int_params[AC_nz])

 #define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info)                                                 \
    (sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info))
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -35,7 +35,6 @@ const char* intparam_names[]      = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
 const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
 const char* vtxbuf_names[]    = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};

-
 static const int MAX_NUM_DEVICES       = 32;
 static int num_devices                 = 1;
 static Device devices[MAX_NUM_DEVICES] = {};
@@ -44,17 +43,9 @@ static Grid
 createGrid(const AcMeshInfo& config)
 {
    Grid grid;
-    grid.m = (int3) {
-        config.int_params[AC_mx],
-        config.int_params[AC_my],
-        config.int_params[AC_mz]
-    };

-    grid.n = (int3) {
-        config.int_params[AC_nx],
-        config.int_params[AC_ny],
-        config.int_params[AC_nz]
-    };
+    grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
+    grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};

    return grid;
 }
@@ -71,8 +62,7 @@ gridIdx(const Grid& grid, const int i, const int j, const int k)
 static int3
 gridIdx3d(const Grid& grid, const int idx)
 {
-    return (int3){idx % grid.m.x,
-                 (idx % (grid.m.x * grid.m.y)) / grid.m.x,
+    return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
                  idx / (grid.m.x * grid.m.y)};
 }

@@ -119,10 +109,12 @@ acInit(const AcMeshInfo& config)
    ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
    ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);

+    // clang-format off
    printf("Grid m ");   printInt3(grid.m);    printf("\n");
    printf("Grid n ");   printInt3(grid.n);    printf("\n");
    printf("Subrid m "); printInt3(subgrid.m); printf("\n");
    printf("Subrid n "); printInt3(subgrid.n); printf("\n");
+    // clang-format on

    // Initialize the devices
    for (int i = 0; i < num_devices; ++i) {
@@ -202,8 +194,10 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
        */
        if (db.z >= da.z) {
            const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
-            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
-            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n");
+            const int3 da_local  = (int3){
+                da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
+            // printf("\n");
            copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
        }
        printf("\n");
@@ -236,8 +230,10 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
        */
        if (db.z >= da.z) {
            const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
-            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
-            // printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n");
+            const int3 da_local  = (int3){
+                da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+            // printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local);
+            // printf("\n");
            copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
        }
        printf("\n");
@@ -263,8 +259,7 @@ AcResult
 acIntegrateStep(const int& isubstep, const AcReal& dt)
 {
    const int3 start = (int3){STENCIL_ORDER / 2, STENCIL_ORDER / 2, STENCIL_ORDER / 2};
-    const int3 end   = (int3){STENCIL_ORDER/2 + subgrid.n.x,
-                              STENCIL_ORDER/2 + subgrid.n.y,
+    const int3 end   = (int3){STENCIL_ORDER / 2 + subgrid.n.x, STENCIL_ORDER / 2 + subgrid.n.y,
                            STENCIL_ORDER / 2 + subgrid.n.z};
    for (int i = 0; i < num_devices; ++i) {
        rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
@@ -278,9 +273,10 @@ acBoundcondStep(void)
 {
    acSynchronize();
    if (num_devices == 1) {
-        boundcondStep(devices[0], STREAM_PRIMARY,
-                      (int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
-    } else {
+        boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0},
+                      (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
+    }
+    else {
        // Local boundary conditions
        for (int i = 0; i < num_devices; ++i) {
            const int3 d0 = (int3){0, 0, STENCIL_ORDER / 2}; // DECOMPOSITION OFFSET HERE
@@ -296,16 +292,16 @@ way we have to do things with multiple GPUs.
        The older approach relied on unified memory, which represented the whole
        memory area as one huge mesh instead of several smaller ones. However, unified memory
        in its current state is more meant for quick prototyping when performance is not an issue.
-Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
-when managing the memory explicitly.
+        Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
+        than when managing the memory explicitly.

        In this new approach, I have simplified the multi- and single-GPU layers significantly.
        Quick rundown:
                New struct: Grid. There are two global variables, "grid" and "subgrid", which
-	contain the extents of the whole simulation domain and the decomposed grids, respectively.
-	To simplify thing, we require that each GPU is assigned the same amount of work,
-	therefore each GPU in the node is assigned and "subgrid.m" -sized block of data
-	to work with.
+                contain the extents of the whole simulation domain and the decomposed grids,
+        respectively. To simplify thing, we require that each GPU is assigned the same amount of
+        work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to
+        work with.

                The whole simulation domain is decomposed with respect to the z dimension.
                For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
@@ -318,16 +314,16 @@ Terminology:
                - Single-GPU function: a function defined on the single-GPU layer (device.cu)

        Changes required to this commented code block:
-	- The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu
-	  instead. Same holds for any complex index calculations. Instead, the local coordinates
-  	  should be passed as an int3 type without having to consider how the data is actually
-	  laid out in device memory
-	- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
-	  of type "Device" which should be passed to single-GPU functions. In this file, all devices
-	  are stored in a global array "devices[num_devices]".
+                - The thread block dimensions (tpb) are no longer passed to the kernel here but in
+        device.cu instead. Same holds for any complex index calculations. Instead, the local
+        coordinates should be passed as an int3 type without having to consider how the data is
+        actually laid out in device memory
+                - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
+        handle of type "Device" which should be passed to single-GPU functions. In this file, all
+        devices are stored in a global array "devices[num_devices]".
                - Every single-GPU function is executed asynchronously by default such that we
-	  can optimize Astaroth by executing memory transactions concurrently with computation.
-	  Therefore a StreamType should be passed as a parameter to single-GPU functions.
+                  can optimize Astaroth by executing memory transactions concurrently with
+        computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
                  Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
                  as a parameter and commands executing in different streams can be processed
                  in parallel/concurrently.
@@ -337,12 +333,13 @@ Note on periodic boundaries (might be helpful when implementing other boundary c

                With multiple GPUs, periodic boundary conditions applied on indices ranging from

-		(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
+                        (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
+        STENCIL_ORDER/2)

-	on a single device are "local", in the sense that they can be computed without having
-	to exchange data with neighboring GPUs. Special care is needed only for transferring
-	the data to the fron and back plates outside this range. In the solution we use here,
-	we solve the local boundaries first, and then just exchange the front and back plates
+                on a single device are "local", in the sense that they can be computed without
+        having to exchange data with neighboring GPUs. Special care is needed only for transferring
+                the data to the fron and back plates outside this range. In the solution we use
+        here, we solve the local boundaries first, and then just exchange the front and back plates
                in a "ring", like so
                                        device_id
                            (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
@@ -351,12 +348,12 @@ Note on periodic boundaries (might be helpful when implementing other boundary c
        // ======MIIKKANOTE END==========================================

        <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
-                                                          moved into device.cu, function boundCondStep()
-                                                          In astaroth.cu, we use acBoundcondStep()
-                                                          just to distribute the work and manage
-                                                          communication between GPUs.
+                                                                  moved into device.cu, function
+        boundCondStep() In astaroth.cu, we use acBoundcondStep() just to distribute the work and
+        manage communication between GPUs.

-    printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS);
+            printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y,
+        best_dims.z, double(best_time) / NUM_ITERATIONS);

            exit(0);
            #else
@@ -386,13 +383,15 @@ Note on periodic boundaries (might be helpful when implementing other boundary c
            {
                const int3 src = (int3){0, 0, subgrid.n.z};
                const int3 dst = (int3){0, 0, 0};
-                copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices);
+                copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src,
+                                       devices[(i + 1) % num_devices], dst, num_vertices);
            }
            // ...|ooooooo|xxx <- ...|xxxoooo|...
            {
                const int3 src = (int3){0, 0, STENCIL_ORDER / 2};
                const int3 dst = (int3){0, 0, STENCIL_ORDER / 2 + subgrid.n.z};
-                copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices);
+                copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src,
+                                       devices[i], dst, num_vertices);
            }
        }
    }
@@ -427,11 +426,14 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
    for (int i = 1; i < n; ++i) {
        if (rtype == RTYPE_MAX) {
            res = max(res, results[i]);
-        } else if (rtype == RTYPE_MIN) {
+        }
+        else if (rtype == RTYPE_MIN) {
            res = min(res, results[i]);
-        } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        }
+        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
            res = sum(res, results[i]);
-        } else {
+        }
+        else {
            ERROR("Invalid rtype");
        }
    }
@@ -445,8 +447,7 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
 }

 AcReal
-acReduceScal(const ReductionType& rtype,
-             const VertexBufferHandle& vtxbuffer_handle)
+acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
 {
    AcReal results[num_devices];
    for (int i = 0; i < num_devices; ++i) {
@@ -457,8 +458,8 @@ acReduceScal(const ReductionType& rtype,
 }

 AcReal
-acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
-            const VertexBufferHandle& b, const VertexBufferHandle& c)
+acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
+            const VertexBufferHandle& c)
 {
    AcReal results[num_devices];
    for (int i = 0; i < num_devices; ++i) {
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -76,46 +76,46 @@ printDeviceInfo(const Device device)
    printf("    Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
    printf("    Stream processors: %d\n", props.multiProcessorCount);
    printf("    SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
-    printf("    Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
+    printf(
+        "    Compute mode: %d\n",
+        (int)props
+            .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
    // Memory
    printf("  Global memory\n");
    printf("    Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
    printf("    Memory Bus Width (bits): %d\n", props.memoryBusWidth);
    printf("    Peak Memory Bandwidth (GiB/s): %f\n",
-           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
-               (8. * 1024. * 1024. * 1024.));
+           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
    printf("    ECC enabled: %d\n", props.ECCEnabled);
    // Memory usage
    size_t free_bytes, total_bytes;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    const size_t used_bytes = total_bytes - free_bytes;
-    printf("    Total global mem: %.2f GiB\n",
-           props.totalGlobalMem / (1024.0 * 1024 * 1024));
+    printf("    Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
    printf("    Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
-    printf("    Gmem memory free (GiB): %.2f\n",
-           free_bytes / (1024.0 * 1024 * 1024));
-    printf("    Gmem memory total (GiB): %.2f\n",
-           total_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
    printf("  Caches\n");
    printf("    Local L1 cache supported: %d\n", props.localL1CacheSupported);
    printf("    Global L1 cache supported: %d\n", props.globalL1CacheSupported);
    printf("    L2 size: %d KiB\n", props.l2CacheSize / (1024));
    printf("    Total const mem: %ld KiB\n", props.totalConstMem / (1024));
-    printf("    Shared mem per block: %ld KiB\n",
-           props.sharedMemPerBlock / (1024));
+    printf("    Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
    printf("  Other\n");
    printf("    Warp size: %d\n", props.warpSize);
    // printf("    Single to double perf. ratio: %dx\n",
    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
    // versions
-    printf("    Stream priorities supported: %d\n",
-           props.streamPrioritiesSupported);
+    printf("    Stream priorities supported: %d\n", props.streamPrioritiesSupported);
    printf("--------------------------------------------------\n");

    return AC_SUCCESS;
 }

-static __global__ void dummy_kernel(void) {}
+static __global__ void
+dummy_kernel(void)
+{
+}

 AcResult
 createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
@@ -150,15 +150,14 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
    }
-    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
-                                  AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMalloc(&device->reduce_scratchpad, AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));

    // Device constants
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
                                          cudaMemcpyHostToDevice));

-
    // Multi-GPU offset. This is used to compute globalVertexIdx.
    // Might be better to calculate this in astaroth.cu instead of here, s.t.
    // everything related to the decomposition is limited to the multi-GPU layer
@@ -166,7 +165,6 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
                                          sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));

-
    printf("Created device %d (%p)\n", device->id, device);
    *device_handle = device;
    return AC_SUCCESS;
@@ -213,51 +211,42 @@ reduceScal(const Device device, const StreamType stream_type, const ReductionTyp

    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
                              device->local_config.int_params[AC_ny_min],
-                               device->local_config.int_params[AC_nz_min]
-    };
+                              device->local_config.int_params[AC_nz_min]};

    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
                            device->local_config.int_params[AC_ny_max],
-                             device->local_config.int_params[AC_nz_max]
-    };
+                            device->local_config.int_params[AC_nz_max]};

-    *result = reduce_scal(device->streams[stream_type], rtype,
-                          start, end, device->vba.in[vtxbuf_handle],
-                          device->reduce_scratchpad, device->reduce_result);
+    *result = reduce_scal(device->streams[stream_type], rtype, start, end,
+                          device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
+                          device->reduce_result);
    return AC_SUCCESS;
 }

 AcResult
-reduceVec(const Device device, const StreamType stream_type,
-          const ReductionType rtype,
-          const VertexBufferHandle vtxbuf0,
-          const VertexBufferHandle vtxbuf1,
-          const VertexBufferHandle vtxbuf2,
-          AcReal* result)
+reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
+          const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+          const VertexBufferHandle vtxbuf2, AcReal* result)
 {
    cudaSetDevice(device->id);

    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
                              device->local_config.int_params[AC_ny_min],
-                               device->local_config.int_params[AC_nz_min]
-    };
+                              device->local_config.int_params[AC_nz_min]};

    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
                            device->local_config.int_params[AC_ny_max],
-                             device->local_config.int_params[AC_nz_max]
-    };
+                            device->local_config.int_params[AC_nz_max]};

-    *result = reduce_vec(device->streams[stream_type], rtype, start, end,
-                          device->vba.in[vtxbuf0],
-                          device->vba.in[vtxbuf1],
-                          device->vba.in[vtxbuf2],
+    *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
+                         device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
                         device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;
 }

 AcResult
-rkStep(const Device device, const StreamType stream_type, const int step_number,
-       const int3& start, const int3& end, const AcReal dt)
+rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
+       const int3& end, const AcReal dt)
 {
    cudaSetDevice(device->id);
    rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
@@ -270,65 +259,62 @@ synchronize(const Device device, const StreamType stream_type)
    cudaSetDevice(device->id);
    if (stream_type == STREAM_ALL) {
        cudaDeviceSynchronize();
-    } else {
+    }
+    else {
        cudaStreamSynchronize(device->streams[stream_type]);
    }
    return AC_SUCCESS;
 }

 static AcResult
-loadWithOffset(const Device device, const StreamType stream_type,
-               const AcReal* src, const size_t bytes, AcReal* dst)
+loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
+               const size_t bytes, AcReal* dst)
 {
    cudaSetDevice(device->id);
-    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice,
-                                device->streams[stream_type]));
+    ERRCHK_CUDA(
+        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
    return AC_SUCCESS;
 }

 static AcResult
-storeWithOffset(const Device device, const StreamType stream_type,
-                const AcReal* src, const size_t bytes, AcReal* dst)
+storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
+                const size_t bytes, AcReal* dst)
 {
    cudaSetDevice(device->id);
-    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost,
-                                device->streams[stream_type]));
+    ERRCHK_CUDA(
+        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
    return AC_SUCCESS;
 }

 AcResult
-copyMeshToDevice(const Device device, const StreamType stream_type,
-                 const AcMesh& host_mesh, const int3& src, const int3& dst,
-                 const int num_vertices)
+copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
+                 const int3& src, const int3& dst, const int num_vertices)
 {
    const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
    const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-        loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], num_vertices * sizeof(AcReal),
-                       &device->vba.in[i][dst_idx]);
+        loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
+                       num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
    }
    return AC_SUCCESS;
 }

 AcResult
-copyMeshToHost(const Device device, const StreamType stream_type,
-               const int3& src, const int3& dst, const int num_vertices,
-               AcMesh* host_mesh)
+copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
+               const int num_vertices, AcMesh* host_mesh)
 {
    const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
    const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
-                        num_vertices * sizeof(AcReal),
-                        &host_mesh->vertex_buffer[i][dst_idx]);
+                        num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
    }
    return AC_SUCCESS;
 }

 AcResult
-copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type,
-                       const int3& src, Device dst_device, const int3& dst,
-                       const int num_vertices)
+copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
+                       Device dst_device, const int3& dst, const int num_vertices)
 {
    cudaSetDevice(src_device->id);
    const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
@@ -364,8 +350,8 @@ loadDeviceConstant(const Device device, const AcIntParam param, const int value)
    // Therefore we have to obfuscate the code a bit and compute the offset address before
    // invoking cudaMemcpyToSymbol.
    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value),
-                                          offset, cudaMemcpyHostToDevice));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
    return AC_SUCCESS;
 }

@@ -374,8 +360,8 @@ loadDeviceConstant(const Device device, const AcRealParam param, const AcReal va
 {
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value),
-                                          offset, cudaMemcpyHostToDevice));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
    return AC_SUCCESS;
 }

@@ -383,7 +369,7 @@ AcResult
 loadGlobalGrid(const Device device, const Grid grid)
 {
    cudaSetDevice(device->id);
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid),
-                                          0, cudaMemcpyHostToDevice));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
    return AC_SUCCESS;
 }
--- a/src/core/device.cuh
+++ b/src/core/device.cuh
@@ -27,12 +27,14 @@
 #pragma once
 #include "astaroth.h"

+// clang-format off
 typedef enum {
    STREAM_PRIMARY,
    STREAM_SECONDARY,
    NUM_STREAM_TYPES,
    STREAM_ALL
 } StreamType;
+// clang-format on

 typedef struct {
    int3 m;
@@ -52,20 +54,17 @@ AcResult createDevice(const int id, const AcMeshInfo device_config, Device* devi
 AcResult destroyDevice(Device device);

 /** */
-AcResult boundcondStep(const Device device, const StreamType stream_type,
-                       const int3& start, const int3& end);
+AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start,
+                       const int3& end);

 /** */
 AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
                    const VertexBufferHandle vtxbuf_handle, AcReal* result);

 /** */
-AcResult reduceVec(const Device device, const StreamType stream_type,
-                   const ReductionType rtype,
-                   const VertexBufferHandle vec0,
-                   const VertexBufferHandle vec1,
-                   const VertexBufferHandle vec2,
-                   AcReal* result);
+AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
+                   const VertexBufferHandle vec0, const VertexBufferHandle vec1,
+                   const VertexBufferHandle vec2, AcReal* result);

 /** */
 AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
@@ -81,9 +80,8 @@ AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
                          const int num_vertices);

 /** */
-AcResult copyMeshToHost(const Device device, const StreamType stream_type,
-                        const int3& src, const int3& dst, const int num_vertices,
-                        AcMesh* host_mesh);
+AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src,
+                        const int3& dst, const int num_vertices, AcMesh* host_mesh);

 /** */
 AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -89,7 +89,6 @@ periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& en
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 #include <assert.h>

-
 static __device__ __forceinline__ int
 IDX(const int i)
 {
@@ -120,7 +119,6 @@ create_rotz(const AcReal radians)
    return mat;
 }

-
 #if AC_DOUBLE_PRECISION == 0
 #define sin __sinf
 #define cos __cosf
@@ -128,7 +126,6 @@ create_rotz(const AcReal radians)
 #define rsqrt rsqrtf // hardware reciprocal sqrt
 #endif               // AC_DOUBLE_PRECISION == 0

-
 /*
 typedef struct {
    int i, j, k;
@@ -155,8 +152,7 @@ first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
 #elif STENCIL_ORDER == 6
    const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
 #elif STENCIL_ORDER == 8
-    const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
-                                   -1.0 / 280.0};
+    const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, -1.0 / 280.0};
 #endif

 #define MID (STENCIL_ORDER / 2)
@@ -177,11 +173,9 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
 #elif STENCIL_ORDER == 4
    const AcReal coefficients[] = {-5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0};
 #elif STENCIL_ORDER == 6
-    const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
-                                   1.0 / 90.0};
+    const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, 1.0 / 90.0};
 #elif STENCIL_ORDER == 8
-    const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
-                                   8.0 / 315.0, -1.0 / 560.0};
+    const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, 8.0 / 315.0, -1.0 / 560.0};
 #endif

 #define MID (STENCIL_ORDER / 2)
@@ -196,22 +190,20 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)

 /** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
 static __device__ __forceinline__ AcReal
-cross_derivative(const AcReal* __restrict__ pencil_a,
-                 const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
-                 const AcReal inv_ds_b)
+cross_derivative(const AcReal* __restrict__ pencil_a, const AcReal* __restrict__ pencil_b,
+                 const AcReal inv_ds_a, const AcReal inv_ds_b)
 {
 #if STENCIL_ORDER == 2
    const AcReal coefficients[] = {0, 1.0 / 4.0};
 #elif STENCIL_ORDER == 4
-    const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
+    const AcReal coefficients[] = {
+        0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
 #elif STENCIL_ORDER == 6
    const AcReal fac            = (1. / 720.);
-    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
-                                   2.0 * fac};
+    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, 2.0 * fac};
 #elif STENCIL_ORDER == 8
    const AcReal fac            = (1. / 20160.);
-    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
-                                   128. * fac, -9. * fac};
+    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, 128. * fac, -9. * fac};
 #endif

 #define MID (STENCIL_ORDER / 2)
@@ -219,8 +211,8 @@ cross_derivative(const AcReal* __restrict__ pencil_a,

 #pragma unroll
    for (int i = 1; i <= MID; ++i) {
-        res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
-                                  pencil_b[MID + i] - pencil_b[MID - i]);
+        res += coefficients[i] *
+               (pencil_a[MID + i] + pencil_a[MID - i] - pencil_b[MID + i] - pencil_b[MID - i]);
    }
    return res * inv_ds_a * inv_ds_b;
 }
@@ -231,7 +223,8 @@ derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
    AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
+        pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
+                                 vertexIdx.z)];

    return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
 }
@@ -242,7 +235,8 @@ derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
    AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
+        pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
+                                 vertexIdx.z)];

    return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
 }
@@ -262,8 +256,7 @@ derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
        pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
                                   vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];

-    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
-                            DCONST_REAL(AC_inv_dsy));
+    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsy));
 }

 static __device__ __forceinline__ AcReal
@@ -281,8 +274,7 @@ derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
        pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
                                   vertexIdx.z + STENCIL_ORDER / 2 - offset)];

-    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
-                            DCONST_REAL(AC_inv_dsz));
+    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsz));
 }

 static __device__ __forceinline__ AcReal
@@ -291,7 +283,8 @@ dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
    AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
+        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
+                                 vertexIdx.z)];

    return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
 }
@@ -302,7 +295,8 @@ deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
    AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
+        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
+                                 vertexIdx.z)];

    return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
 }
@@ -322,8 +316,7 @@ deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
        pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
                                   vertexIdx.z + STENCIL_ORDER / 2 - offset)];

-    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
-                            DCONST_REAL(AC_inv_dsz));
+    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy), DCONST_REAL(AC_inv_dsz));
 }

 static __device__ __forceinline__ AcReal
@@ -332,7 +325,8 @@ derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
    AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
+        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
+                                 vertexIdx.z + offset - STENCIL_ORDER / 2)];

    return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
 }
@@ -343,7 +337,8 @@ derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
    AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
+        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
+                                 vertexIdx.z + offset - STENCIL_ORDER / 2)];

    return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
 }
@@ -401,8 +396,7 @@ operator-(const AcReal3& a)
    return (AcReal3){-a.x, -a.y, -a.z};
 }

-static __host__  __device__ __forceinline__ AcReal3
-operator*(const AcReal a, const AcReal3& b)
+static __host__ __device__ __forceinline__ AcReal3 operator*(const AcReal a, const AcReal3& b)
 {
    return (AcReal3){a * b.x, a * b.y, a * b.z};
 }
@@ -443,7 +437,6 @@ is_valid(const AcReal3& a)
    return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
 }

-
 /*
 * =============================================================================
 * Level 1 (Stencil Processing Stage)
@@ -476,8 +469,7 @@ laplace_vec(const AcReal3Data& vec)
 static __device__ __forceinline__ AcReal3
 curl(const AcReal3Data& vec)
 {
-    return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
-                     gradient(vec.x).z - gradient(vec.z).x,
+    return (AcReal3){gradient(vec.z).y - gradient(vec.y).z, gradient(vec.x).z - gradient(vec.z).x,
                     gradient(vec.y).x - gradient(vec.x).y};
 }

@@ -561,7 +553,8 @@ forcing(const int i, const int j, const int k)
 #define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
 #define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
 #define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
-    const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
+    const AcReal3 k_vec = (AcReal3){
+        (i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
        (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
        (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
    AcReal inv_len = reciprocal_len(k_vec);
@@ -576,26 +569,22 @@ forcing(const int i, const int j, const int k)
    return inv_len * inv_len * waves * forcing_vec;
 }

-
-// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values in the mesh, then we will inherently lose precision
+// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values
+// in the mesh, then we will inherently lose precision
 #define LNT0 (AcReal(0.0))
 #define LNRHO0 (AcReal(0.0))

 #define H_CONST (AcReal(0.0))
 #define C_CONST (AcReal(0.0))

-
-
 template <int step_number>
 static __device__ __forceinline__ AcReal
-rk3_integrate(const AcReal state_previous, const AcReal state_current,
-              const AcReal rate_of_change, const AcReal dt)
+rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
+              const AcReal dt)
 {
    // Williamson (1980)
    const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
-    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.),
-                           AcReal(8. / 15.)};
-
+    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};

    // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
    // access (when accessing beta[step_number-1] even when step_number >= 1)
@@ -605,8 +594,7 @@ rk3_integrate(const AcReal state_previous, const AcReal state_current,
    case 1: // Fallthrough
    case 2:
        return state_current +
-               beta[step_number + 1] *
-                   (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
+               beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
                                            (state_current - state_previous) +
                                        rate_of_change * dt);
    default:
@@ -646,7 +634,8 @@ static __device__ __forceinline__ AcReal3
 rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
              const AcReal3 rate_of_change, const AcReal dt)
 {
-    return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
+    return (AcReal3){
+        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
 }
@@ -708,8 +697,7 @@ read_out(const int idx, AcReal* __restrict__ field[], const int handle)
 static __device__ AcReal3
 read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
 {
-    return (AcReal3) { read_out(idx, field, handle.x),
-                                       read_out(idx, field, handle.y),
+    return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
                     read_out(idx, field, handle.z)};
 }

@@ -718,10 +706,10 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
 #define READ_OUT(handle) (read_out(idx, buffer.out, handle))

 // also write for clarity here also, not for the DSL
-//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function
+//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the
+// function

-#define GEN_KERNEL_PARAM_BOILERPLATE \
-        const int3 start, const int3 end, VertexBufferArray buffer
+#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer

 #define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE()                                                 \
    const int3 vertexIdx       = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,           \
@@ -732,7 +720,6 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
                                        d_multigpu_offset.z + vertexIdx.z};                        \
    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)                      \
        return;                                                                                    \
-\
                                                                                                   \
    assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&           \
           vertexIdx.z < DCONST_INT(AC_nz_max));                                                   \
@@ -757,8 +744,8 @@ randf(void)
 }

 AcResult
-rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
-               const AcReal dt, VertexBufferArray* buffer)
+rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start,
+               const int3& end, const AcReal dt, VertexBufferArray* buffer)
 {
    const dim3 tpb(32, 1, 4);
 /////////////////// Forcing
@@ -771,7 +758,8 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
    cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);

    const AcReal ff_phi = AcReal(M_PI); // AcReal(2 * M_PI) * randf();
-    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
+    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice,
+                            stream);
 #endif // LFORCING
    //////////////////////////

@@ -779,12 +767,9 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
    const int ny = end.y - start.y;
    const int nz = end.z - start.z;

-    const dim3 bpg(
-        (unsigned int)ceil(nx / AcReal(tpb.x)),
-        (unsigned int)ceil(ny / AcReal(tpb.y)),
+    const dim3 bpg((unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(ny / AcReal(tpb.y)),
                   (unsigned int)ceil(nz / AcReal(tpb.z)));

-
    if (step_number == 0)
        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
    else if (step_number == 1)
@@ -796,7 +781,6 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
    return AC_SUCCESS;
 }

-
 ////////////////REDUCE///////////////////////////
 #include "src/core/math_utils.h" // is_power_of_two

@@ -848,22 +832,19 @@ template <FilterFunc filter>
 __global__ void
 kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
 {
-    const int3 src_idx = (int3) {
-        start.x + threadIdx.x + blockIdx.x * blockDim.x,
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-        start.z + threadIdx.z + blockIdx.z * blockDim.z
-    };
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};

    const int nx       = end.x - start.x;
    const int ny       = end.y - start.y;
    const int nz       = end.z - start.z; // MV: Added this because it was undefined
-    const int3 dst_idx = (int3) {
-        threadIdx.x + blockIdx.x * blockDim.x,
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
                                threadIdx.y + blockIdx.y * blockDim.y,
-        threadIdx.z + blockIdx.z * blockDim.z
-    };
+                                threadIdx.z + blockIdx.z * blockDim.z};

-    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max));
+    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
+           src_idx.z < DCONST_INT(AC_nz_max));
    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);

@@ -872,31 +853,27 @@ kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end,

 template <FilterFuncVec filter>
 __global__ void
-kernel_filter_vec(const __restrict__ AcReal* src0,
-                  const __restrict__ AcReal* src1,
-                  const __restrict__ AcReal* src2,
-                  const int3 start, const int3 end, AcReal* dst)
+kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
+                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
 {
-    const int3 src_idx = (int3) {
-        start.x + threadIdx.x + blockIdx.x * blockDim.x,
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
-        start.z + threadIdx.z + blockIdx.z * blockDim.z
-    };
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};

    const int nx       = end.x - start.x;
    const int ny       = end.y - start.y;
    const int nz       = end.z - start.z; // MV: Added this because it was undefined
-    const int3 dst_idx = (int3) {
-        threadIdx.x + blockIdx.x * blockDim.x,
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
                                threadIdx.y + blockIdx.y * blockDim.y,
-        threadIdx.z + blockIdx.z * blockDim.z
-    };
+                                threadIdx.z + blockIdx.z * blockDim.z};

-    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max));
+    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
+           src_idx.z < DCONST_INT(AC_nz_max));
    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);

-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
+        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
 }

 template <ReduceFunc reduce>
@@ -908,7 +885,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
    extern __shared__ AcReal smem[];
    if (idx < num_elems) {
        smem[threadIdx.x] = scratchpad[idx];
-    } else {
+    }
+    else {
        smem[threadIdx.x] = NAN;
    }
    __syncthreads();
@@ -930,9 +908,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)

 template <ReduceFunc reduce>
 __global__ void
-kernel_reduce_block(const __restrict__ AcReal* scratchpad,
-                    const int num_blocks, const int block_size,
-                    AcReal* result)
+kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
+                    const int block_size, AcReal* result)
 {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx != 0) {
@@ -946,11 +923,9 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad,
    *result = res;
 }

-
 AcReal
-reduce_scal(const cudaStream_t stream, const ReductionType rtype,
-            const int3& start, const int3& end,
-            const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
+reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
 {
    const unsigned nx        = end.x - start.x;
    const unsigned ny        = end.y - start.y;
@@ -958,11 +933,9 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
    const unsigned num_elems = nx * ny * nz;

    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter(
-        (unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-        (unsigned int)ceil(nz / AcReal(tpb_filter.z))
-    );
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));

    const int tpb_reduce = 128;
    const int bpg_reduce = num_elems / tpb_reduce;
@@ -974,22 +947,38 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
    ERRCHK(nx * ny * nz % 2 == 0);

    if (rtype == RTYPE_MAX) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
+        kernel_filter<dvalue>
+            <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dmax>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_MIN) {
+        kernel_filter<dvalue>
+            <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dmin>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_RMS) {
+        kernel_filter<dsquared>
+            <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dsum>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter<dexp_squared>
+            <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dsum>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else {
        ERROR("Unrecognized rtype");
    }
    AcReal result;
@@ -998,10 +987,9 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
 }

 AcReal
-reduce_vec(const cudaStream_t stream, const ReductionType rtype,
-            const int3& start, const int3& end,
-            const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2,
-            AcReal* scratchpad, AcReal* reduce_result)
+reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
+           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
+           AcReal* reduce_result)
 {
    const unsigned nx        = end.x - start.x;
    const unsigned ny        = end.y - start.y;
@@ -1009,11 +997,9 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype,
    const unsigned num_elems = nx * ny * nz;

    const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter(
-        (unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-        (unsigned int)ceil(nz / AcReal(tpb_filter.z))
-    );
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));

    const int tpb_reduce = 128;
    const int bpg_reduce = num_elems / tpb_reduce;
@@ -1025,22 +1011,38 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype,
    ERRCHK(nx * ny * nz % 2 == 0);

    if (rtype == RTYPE_MAX) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
+            vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dmax>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_MIN) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
+            vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dmin>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_RMS) {
+        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
+            vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dsum>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
+            vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dsum>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else {
        ERROR("Unrecognized rtype");
    }
    AcReal result;
--- a/src/standalone/autotest.cc
+++ b/src/standalone/autotest.cc
@@ -60,7 +60,6 @@ typedef struct {
    AcReal x, y, z;
 } vec3r;

-
 typedef struct {
    ModelScalar model;
    AcReal candidate;
@@ -71,10 +70,13 @@ typedef struct {
 #define THOROUGH_TEST (1)
 #define TEST_TYPE QUICK_TEST

-static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE, INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW};
+static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE,
+                                      INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW};
 // #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))

-#if TEST_TYPE == QUICK_TEST // REGULAR TEST START HERE --------------------------------------------------------------------------------------------------------------
+#if TEST_TYPE ==                                                                                   \
+    QUICK_TEST // REGULAR TEST START HERE
+               // --------------------------------------------------------------------------------------------------------------
 static inline ModelScalar
 get_absolute_error(const ModelScalar& model, const AcReal& candidate)
 {
@@ -152,46 +154,37 @@ get_reduction_range(const ModelMesh& mesh)
 {
    ERRCHK(NUM_VTXBUF_HANDLES >= 3);

-	const ModelScalar max0     = model_reduce_scal(mesh, RTYPE_MAX,
-			VertexBufferHandle(0));
-	const ModelScalar max1     = model_reduce_scal(mesh, RTYPE_MAX,
-			VertexBufferHandle(1));
-	const ModelScalar max2     = model_reduce_scal(mesh, RTYPE_MAX,
-			VertexBufferHandle(2));
+    const ModelScalar max0     = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(0));
+    const ModelScalar max1     = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(1));
+    const ModelScalar max2     = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(2));
    const ModelScalar max_scal = max(max0, max(max1, max2));

-	const ModelScalar min0     = model_reduce_scal(mesh, RTYPE_MIN,
-			VertexBufferHandle(0));
-	const ModelScalar min1     = model_reduce_scal(mesh, RTYPE_MIN,
-			VertexBufferHandle(1));
-	const ModelScalar min2     = model_reduce_scal(mesh, RTYPE_MIN,
-			VertexBufferHandle(2));
+    const ModelScalar min0     = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(0));
+    const ModelScalar min1     = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(1));
+    const ModelScalar min2     = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(2));
    const ModelScalar min_scal = min(min0, min(min1, min2));

    return max_scal - min_scal;
 }

 static void
-print_debug_info(const ModelScalar& model, const AcReal& candidate,
-		const ModelScalar& range)
+print_debug_info(const ModelScalar& model, const AcReal& candidate, const ModelScalar& range)
 {
    printf("MeshPointInfo\n");
    printf("\tModel: %e\n", double(model));
    printf("\tCandidate: %e\n", double(candidate));
    printf("\tRange: %e\n", double(range));

-	printf("\tAbsolute error: %Le (max acceptable: %Le)\n",
-			get_absolute_error(model, candidate),
+    printf("\tAbsolute error: %Le (max acceptable: %Le)\n", get_absolute_error(model, candidate),
           get_acceptable_absolute_error(range));
-	printf("\tRelative error: %Le (max acceptable: %Le)\n",
-			get_relative_error(model, candidate),
+    printf("\tRelative error: %Le (max acceptable: %Le)\n", get_relative_error(model, candidate),
           get_acceptable_relative_error());
    printf("\tIs acceptable: %d\n", verify(model, candidate, range));
 }

 static void
-print_result(const ModelScalar& model, const AcReal& candidate,
-		const ModelScalar& range, const char* name = "???")
+print_result(const ModelScalar& model, const AcReal& candidate, const ModelScalar& range,
+             const char* name = "???")
 {
    const ModelScalar rel_err = get_relative_error(model, candidate);
    const ModelScalar abs_err = get_absolute_error(model, candidate);
@@ -247,8 +240,7 @@ check_reductions(const AcMeshInfo& config)
            // Scal
            ModelScalar model = model_reduce_scal(*modelmesh, ReductionType(rtype),
                                                  VertexBufferHandle(ftype));
-			AcReal candidate  = acReduceScal(ReductionType(rtype),
-					VertexBufferHandle(ftype));
+            AcReal candidate  = acReduceScal(ReductionType(rtype), VertexBufferHandle(ftype));
            print_result(model, candidate, range, "UUX scal");

            bool is_acceptable = verify(model, candidate, range);
@@ -261,10 +253,9 @@ check_reductions(const AcMeshInfo& config)
            }

            // Vec
-			model = model_reduce_vec(*modelmesh, ReductionType(rtype), VTXBUF_UUX,
-					VTXBUF_UUY, VTXBUF_UUZ);
-			candidate = acReduceVec(ReductionType(rtype), VTXBUF_UUX,
-					VTXBUF_UUY, VTXBUF_UUZ);
+            model     = model_reduce_vec(*modelmesh, ReductionType(rtype), VTXBUF_UUX, VTXBUF_UUY,
+                                     VTXBUF_UUZ);
+            candidate = acReduceVec(ReductionType(rtype), VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
            print_result(model, candidate, range, "UUXYZ vec");

            is_acceptable = verify(model, candidate, range);
@@ -277,7 +268,8 @@ check_reductions(const AcMeshInfo& config)
            }
        }

-		printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range));
+        printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n",
+               get_acceptable_relative_error(), get_acceptable_absolute_error(range));
    }
    acQuit();
    modelmesh_destroy(modelmesh);
@@ -296,8 +288,10 @@ get_data_range(const ModelMesh& model)
    ModelScalar vertex_buffer_max_all = -INFINITY;
    ModelScalar vertex_buffer_min_all = INFINITY;
    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
-		const ModelScalar vertex_buffer_max = model_reduce_scal(model, RTYPE_MAX, VertexBufferHandle(w));
-		const ModelScalar vertex_buffer_min = model_reduce_scal(model, RTYPE_MIN, VertexBufferHandle(w));
+        const ModelScalar vertex_buffer_max = model_reduce_scal(model, RTYPE_MAX,
+                                                                VertexBufferHandle(w));
+        const ModelScalar vertex_buffer_min = model_reduce_scal(model, RTYPE_MIN,
+                                                                VertexBufferHandle(w));

        if (vertex_buffer_max > vertex_buffer_max_all)
            vertex_buffer_max_all = vertex_buffer_max;
@@ -335,18 +329,16 @@ verify_meshes(const ModelMesh& model, const AcMesh& candidate)

            if (!verify(model_val, cand_val, range)) {
                const int i0 = i % model.info.int_params[AC_mx];
-				const int j0 = ((i % (model.info.int_params[AC_mx] *
-								model.info.int_params[AC_my])) /
+                const int j0 = ((i %
+                                 (model.info.int_params[AC_mx] * model.info.int_params[AC_my])) /
                                model.info.int_params[AC_mx]);
-				const int k0 = i / (model.info.int_params[AC_mx] *
-						model.info.int_params[AC_my]);
+                const int k0 = i / (model.info.int_params[AC_mx] * model.info.int_params[AC_my]);
                printf("Index (%d, %d, %d)\n", i0, j0, k0);
                print_debug_info(model_val, cand_val, range);
                retval = false;
            }

-			const ModelScalar abs_error = get_absolute_error(model_val,
-					cand_val);
+            const ModelScalar abs_error = get_absolute_error(model_val, cand_val);
            if (abs_error > max_abs_error.error) {
                max_abs_error.error     = abs_error;
                max_abs_error.model     = model_val;
@@ -368,8 +360,10 @@ verify_meshes(const ModelMesh& model, const AcMesh& candidate)
            }
 #endif
        }
-		//print_result(max_rel_error.model, max_rel_error.candidate, range, vtxbuf_names[VertexBufferHandle(w)]);
-		print_result(max_abs_error.model, max_abs_error.candidate, range, vtxbuf_names[VertexBufferHandle(w)]);
+        // print_result(max_rel_error.model, max_rel_error.candidate, range,
+        // vtxbuf_names[VertexBufferHandle(w)]);
+        print_result(max_abs_error.model, max_abs_error.candidate, range,
+                     vtxbuf_names[VertexBufferHandle(w)]);
    }

 #if GEN_TEST_RESULT == 1
@@ -378,7 +372,8 @@ verify_meshes(const ModelMesh& model, const AcMesh& candidate)
    fprintf(test_result, "%.3Lg & %.3Lg\n", abs_err, rel_err);
 #endif

-	printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range));
+    printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n",
+           get_acceptable_relative_error(), get_acceptable_absolute_error(range));

    return retval;
 }
@@ -387,8 +382,7 @@ verify_meshes(const ModelMesh& model, const AcMesh& candidate)
 check_rk3(const AcMeshInfo& mesh_info)
 {
    const int num_iterations = 1; // Note: should work up to at least 15 steps
-	printf("Testing RK3 (running %d steps before checking the result)\n",
-			num_iterations);
+    printf("Testing RK3 (running %d steps before checking the result)\n", num_iterations);
    int num_failures = 0;

    // Init CPU meshes
@@ -412,7 +406,8 @@ check_rk3(const AcMeshInfo& mesh_info)
        boundconds(model_mesh->info, model_mesh);

        for (int i = 0; i < num_iterations; ++i) {
-			//const AcReal umax = AcReal(acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
+            // const AcReal umax = AcReal(acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
+            // VTXBUF_UUZ));
            // const AcReal dt   = host_timestep(umax, mesh_info);
            const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities

@@ -443,7 +438,8 @@ run_autotest(void)
 {
 #if GEN_TEST_RESULT == 1
    char testresult_path[256];
-	sprintf(testresult_path, "%s_fullstep_testresult.out", AC_DOUBLE_PRECISION ? "double" : "float");
+    sprintf(testresult_path, "%s_fullstep_testresult.out",
+            AC_DOUBLE_PRECISION ? "double" : "float");

    test_result = fopen(testresult_path, "w");
    ERRCHK(test_result);
@@ -456,17 +452,13 @@ run_autotest(void)
    load_config(&config);

    if (STENCIL_ORDER > 6)
-		printf("WARNING!!! If the stencil order is larger than the computational domain some vertices may be done twice (f.ex. doing inner and outer domains separately and some of the front/back/left/right/etc slabs collide). The mesh must be large enough s.t. this doesn't happen.");
+        printf("WARNING!!! If the stencil order is larger than the computational domain some "
+               "vertices may be done twice (f.ex. doing inner and outer domains separately and "
+               "some of the front/back/left/right/etc slabs collide). The mesh must be large "
+               "enough s.t. this doesn't happen.");

-    const vec3i test_dims[] = {
-        {32, 32, 32},
-        {64, 32, 32},
-        {32, 64, 32},
-        {32, 32, 64},
-        {64, 64, 32},
-        {64, 32, 64},
-        {32, 64, 64}
-    };
+    const vec3i test_dims[] = {{32, 32, 32}, {64, 32, 32}, {32, 64, 32}, {32, 32, 64},
+                               {64, 64, 32}, {64, 32, 64}, {32, 64, 64}};

    int num_failures = 0;
    for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
@@ -509,7 +501,9 @@ run_autotest(void)
        return EXIT_SUCCESS;
 }

-#elif TEST_TYPE == THOROUGH_TEST // GEN TEST FILE START HERE --------------------------------------------------------------------------------------------------------------
+#elif TEST_TYPE ==                                                                                 \
+    THOROUGH_TEST // GEN TEST FILE START HERE
+                  // --------------------------------------------------------------------------------------------------------------
 typedef struct {
    ModelScalar model;
    AcReal candidate;
@@ -520,7 +514,8 @@ typedef struct {
    ModelScalar minimum_magnitude;
 } Error;

-Error get_error(ModelScalar model, AcReal candidate)
+Error
+get_error(ModelScalar model, AcReal candidate)
 {
    Error error;
    error.abs_error = 0;
@@ -532,11 +527,13 @@ Error get_error(ModelScalar model, AcReal candidate)
        error.abs_error = 0;
        error.rel_error = 0;
        error.ulp_error = 0;
-	} else if (!is_valid(error.model) || !is_valid(error.candidate)) {
+    }
+    else if (!is_valid(error.model) || !is_valid(error.candidate)) {
        error.abs_error = INFINITY;
        error.rel_error = INFINITY;
        error.ulp_error = INFINITY;
-	} else {
+    }
+    else {
        const int base = 2;
        const int p    = sizeof(AcReal) == 4 ? 24 : 53; // Bits in the significant

@@ -552,14 +549,16 @@ Error get_error(ModelScalar model, AcReal candidate)
    return error;
 }

-Error get_max_abs_error_mesh(const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
+Error
+get_max_abs_error_mesh(const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
 {
    Error error;
    error.abs_error = -1;

    for (size_t j = 0; j < NUM_VTXBUF_HANDLES; ++j) {
        for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) {
-			Error curr_error = get_error(model_mesh.vertex_buffer[j][i], candidate_mesh.vertex_buffer[j][i]);
+            Error curr_error = get_error(model_mesh.vertex_buffer[j][i],
+                                         candidate_mesh.vertex_buffer[j][i]);
            if (curr_error.abs_error > error.abs_error)
                error = curr_error;
        }
@@ -582,7 +581,6 @@ get_maximum_magnitude(const ModelScalar* field, const AcMeshInfo info)
    return maximum;
 }

-
 static ModelScalar
 get_minimum_magnitude(const ModelScalar* field, const AcMeshInfo info)
 {
@@ -594,7 +592,9 @@ get_minimum_magnitude(const ModelScalar* field, const AcMeshInfo info)
    return minimum;
 }

-Error get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
+Error
+get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const ModelMesh& model_mesh,
+                         const AcMesh& candidate_mesh)
 {
    ModelScalar* model_vtxbuf = model_mesh.vertex_buffer[vtxbuf_handle];
    AcReal* candidate_vtxbuf  = candidate_mesh.vertex_buffer[vtxbuf_handle];
@@ -610,7 +610,6 @@ Error get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const Mod
            error = curr_error;
    }

-
    error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
    error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);

@@ -621,14 +620,17 @@ void
 print_error_to_file(const char* path, const int n, const Error error)
 {
    FILE* file = fopen(path, "a");
-    fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.ulp_error, error.abs_error, error.rel_error, error.maximum_magnitude, error.minimum_magnitude);
-    //fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.maximum_magnitude, error.minimum_magnitude, error.abs_error, error.ulp_error, error.rel_error);
+    fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.ulp_error, error.abs_error,
+            error.rel_error, error.maximum_magnitude, error.minimum_magnitude);
+    // fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.maximum_magnitude,
+    // error.minimum_magnitude, error.abs_error, error.ulp_error, error.rel_error);
    fclose(file);
 }

 #define MAX_PATH_LEN (256)

-int run_autotest(void)
+int
+run_autotest(void)
 {

 #define N_MIN (32)
@@ -660,31 +662,41 @@ int run_autotest(void)
                acStore(candidate_mesh);
                Error boundcond_error = get_max_abs_error_mesh(*model_mesh, *candidate_mesh);
                char boundcond_path[MAX_PATH_LEN];
-                sprintf(boundcond_path, "%s_boundcond_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
+                sprintf(boundcond_path, "%s_boundcond_%s.testresult",
+                        AC_DOUBLE_PRECISION ? "double" : "float",
+                        init_type_names[(InitType)init_type]);
                print_error_to_file(boundcond_path, n, boundcond_error);
            }

            { // Check scalar max reduction
-                ModelScalar model = model_reduce_scal(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX);
+                ModelScalar model         = model_reduce_scal(*model_mesh, (ReductionType)RTYPE_MAX,
+                                                      VTXBUF_UUX);
                AcReal candidate          = acReduceScal((ReductionType)RTYPE_MAX, VTXBUF_UUX);
                Error scalar_reduce_error = get_error(model, candidate);
                char scalar_reduce_path[MAX_PATH_LEN];
-                sprintf(scalar_reduce_path, "%s_scalar_reduce_%s.testresult",  AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
+                sprintf(scalar_reduce_path, "%s_scalar_reduce_%s.testresult",
+                        AC_DOUBLE_PRECISION ? "double" : "float",
+                        init_type_names[(InitType)init_type]);
                print_error_to_file(scalar_reduce_path, n, scalar_reduce_error);
            }

            { // Check vector max reduction
-                ModelScalar model = model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
-                AcReal candidate = acReduceVec((ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+                ModelScalar model = model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX,
+                                                     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+                AcReal candidate  = acReduceVec((ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
+                                               VTXBUF_UUZ);
                Error vector_reduce_error = get_error(model, candidate);
                char vector_reduce_path[MAX_PATH_LEN];
-                sprintf(vector_reduce_path, "%s_vector_reduce_%s.testresult",  AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
+                sprintf(vector_reduce_path, "%s_vector_reduce_%s.testresult",
+                        AC_DOUBLE_PRECISION ? "double" : "float",
+                        init_type_names[(InitType)init_type]);
                print_error_to_file(vector_reduce_path, n, vector_reduce_error);
            }

            // Time advance
            {
-                const AcReal umax =  (AcReal)model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+                const AcReal umax = (AcReal)model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX,
+                                                             VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
                const AcReal dt   = host_timestep(umax, config);

                // Host integration step
@@ -699,12 +711,18 @@ int run_autotest(void)

                // Check fields
                for (int vtxbuf_handle = 0; vtxbuf_handle < NUM_VTXBUF_HANDLES; ++vtxbuf_handle) {
-                    Error field_error = get_max_abs_error_vtxbuf((VertexBufferHandle)vtxbuf_handle, *model_mesh, *candidate_mesh);
+                    Error field_error = get_max_abs_error_vtxbuf((VertexBufferHandle)vtxbuf_handle,
+                                                                 *model_mesh, *candidate_mesh);

-			printf("model %Lg, cand %Lg, abs %Lg, rel %Lg\n", (ModelScalar)field_error.model, (ModelScalar)field_error.candidate, (ModelScalar)field_error.abs_error, (ModelScalar)field_error.rel_error);
+                    printf("model %Lg, cand %Lg, abs %Lg, rel %Lg\n",
+                           (ModelScalar)field_error.model, (ModelScalar)field_error.candidate,
+                           (ModelScalar)field_error.abs_error, (ModelScalar)field_error.rel_error);

                    char field_path[MAX_PATH_LEN];
-                    sprintf(field_path, "%s_integrationstep_%s_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type], vtxbuf_names[(VertexBufferHandle)vtxbuf_handle]);
+                    sprintf(field_path, "%s_integrationstep_%s_%s.testresult",
+                            AC_DOUBLE_PRECISION ? "double" : "float",
+                            init_type_names[(InitType)init_type],
+                            vtxbuf_names[(VertexBufferHandle)vtxbuf_handle]);
                    print_error_to_file(field_path, n, field_error);
                }
            }
--- a/src/standalone/benchmark.cc
+++ b/src/standalone/benchmark.cc
@@ -35,10 +35,10 @@
 #include "model/model_rk3.h"
 #include "timer_hires.h"

-#include <vector>
+#include "src/core/errchk.h"
 #include <algorithm>
 #include <math.h>
-#include "src/core/errchk.h"
+#include <vector>

 static bool
 smaller_than(const double& a, const double& b)
@@ -47,7 +47,8 @@ smaller_than(const double& a, const double& b)
 }

 static int
-write_runningtimes(const char* path, const int n, const double min, const double max, const double median, const double perc)
+write_runningtimes(const char* path, const int n, const double min, const double max,
+                   const double median, const double perc)
 {
    FILE* fp;
    fp = fopen(path, "a");
@@ -80,7 +81,8 @@ int
 run_benchmark(void)
 {
    char runningtime_path[256];
-    sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
+    sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float",
+            GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");

    FILE* fp;
    fp = fopen(runningtime_path, "w");
@@ -88,7 +90,8 @@ run_benchmark(void)
    if (fp != NULL) {
        fprintf(fp, "n, min, max, median, perc\n");
        fclose(fp);
-    } else {
+    }
+    else {
        return EXIT_FAILURE;
    }

@@ -113,7 +116,6 @@ run_benchmark(void)
        std::vector<double> results;
        results.reserve(NUM_ITERS);

-
        // Warmup
        for (int i = 0; i < 10; ++i) {
            acIntegrate(0);
@@ -128,7 +130,8 @@ run_benchmark(void)
            acIntegrateStep(2, FLT_EPSILON);
 #else // GEN_BENCHMARK_FULL
            // const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
-            const AcReal dt   = AcReal(1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
+            const AcReal dt = AcReal(
+                1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
            acIntegrate(dt);
 #endif
            acSynchronize();
@@ -139,13 +142,19 @@ run_benchmark(void)

 #define NTH_PERCENTILE (0.95)
        std::sort(results.begin(), results.end(), smaller_than);
-        write_runningtimes(runningtime_path, n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
+        write_runningtimes(runningtime_path, n, results[0], results[results.size() - 1],
+                           results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);

        char percentile_path[256];
-        sprintf(percentile_path, "%d_%s_%s_percentiles.out", n, AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
+        sprintf(percentile_path, "%d_%s_%s_percentiles.out", n,
+                AC_DOUBLE_PRECISION ? "double" : "float",
+                GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
        write_percentiles(percentile_path, NUM_ITERS, results);

-        printf("%s running time %g ms, (%dth percentile, nx = %d) \n", GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep", double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]);
+        printf("%s running time %g ms, (%dth percentile, nx = %d) \n",
+               GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep",
+               double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100),
+               mesh_info.int_params[AC_nx]);

        acStore(mesh);
        acQuit();
@@ -225,7 +234,8 @@ run_benchmark(void)
    return 0;
 }

-#else //////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
+#else
+//////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA



@@ -290,8 +300,8 @@ run_benchmark(void)

        #define NTH_PERCENTILE (0.95)
        std::sort(results.begin(), results.end(), smaller_than);
-        write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
-        write_percentiles(n, NUM_ITERS, results);
+        write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)],
+results[int(NTH_PERCENTILE * NUM_ITERS)]); write_percentiles(n, NUM_ITERS, results);
    }

    return 0;
--- a/src/standalone/config_loader.cc
+++ b/src/standalone/config_loader.cc
@@ -79,8 +79,7 @@ parse_config(const char* path, AcMeshInfo* config)
        int idx = -1;
        if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0)
            config->int_params[idx] = atoi(value);
-        else if ((idx = find_str(keyword, realparam_names,
-                                 NUM_REAL_PARAM_TYPES)) >= 0)
+        else if ((idx = find_str(keyword, realparam_names, NUM_REAL_PARAM_TYPES)) >= 0)
            config->real_params[idx] = AcReal(atof(value));
    }

@@ -99,20 +98,18 @@ update_config(AcMeshInfo* config)

    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
    config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
-    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] +
-                                    config->int_params[AC_nx];
+    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
    config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
-    config->int_params[AC_ny_max] = config->int_params[AC_ny] +
-                                    STENCIL_ORDER / 2;
+    config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
    config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
-    config->int_params[AC_nz_max] = config->int_params[AC_nz] +
-                                    STENCIL_ORDER / 2;
+    config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;

    // Spacing
    config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx];
    config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy];
    config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz];
-    config->real_params[AC_dsmin] = min(config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));
+    config->real_params[AC_dsmin]   = min(
+        config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));

    // Real grid coordanates (DEFINE FOR GRID WITH THE GHOST ZONES)
    config->real_params[AC_xlen] = config->real_params[AC_dsx] * config->int_params[AC_mx];
@@ -125,20 +122,19 @@ update_config(AcMeshInfo* config)

    /* Additional helper params */
    // Int helpers
-    config->int_params[AC_mxy] = config->int_params[AC_mx] *
-                                 config->int_params[AC_my];
-    config->int_params[AC_nxy] = config->int_params[AC_nx] *
-                                 config->int_params[AC_ny];
-    config->int_params[AC_nxyz] = config->int_params[AC_nxy] *
-                                  config->int_params[AC_nz];
+    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
+    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
+    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];

    // Real helpers
    config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] *
                                        config->real_params[AC_cs_sound];

-    config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] / config->real_params[AC_gamma];
+    config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] /
+                                       config->real_params[AC_gamma];

-    AcReal G_CONST_CGS = AcReal(6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module
+    AcReal G_CONST_CGS = AcReal(
+        6.674e-8);                   // g/cm3/s GGS definition //TODO define in a separate module
    AcReal M_sun = AcReal(1.989e33); // g solar mass

    config->real_params[AC_M_star] = config->real_params[AC_M_star] * M_sun /
@@ -147,14 +143,15 @@ update_config(AcMeshInfo* config)
                                       config->real_params[AC_unit_length]) *
                                      config->real_params[AC_unit_density]);

-    config->real_params[AC_G_CONST] = G_CONST_CGS / 
-                                      ( (config->real_params[AC_unit_velocity]*config->real_params[AC_unit_velocity]) /
-                                        (config->real_params[AC_unit_density] *config->real_params[AC_unit_length]) ) ;
+    config->real_params[AC_G_CONST] = G_CONST_CGS / ((config->real_params[AC_unit_velocity] *
+                                                      config->real_params[AC_unit_velocity]) /
+                                                     (config->real_params[AC_unit_density] *
+                                                      config->real_params[AC_unit_length]));

-    config->real_params[AC_GM_star]  = config->real_params[AC_M_star]*config->real_params[AC_G_CONST];
+    config->real_params[AC_GM_star] = config->real_params[AC_M_star] *
+                                      config->real_params[AC_G_CONST];
    config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2) * config->real_params[AC_GM_star]));

-
    const bool print_config = true;
    if (print_config) {
        printf("###############################################################"
--- a/src/standalone/renderer.cc
+++ b/src/standalone/renderer.cc
@@ -82,8 +82,7 @@ static Camera camera = (Camera){(float2){.0f, .0f}, 1.f};
 static inline vec4
 project_ortho(const float2& pos, const float2& bbox, const float2& wdims)
 {
-    const vec4 rect = (vec4){
-        camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x,
+    const vec4 rect = (vec4){camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x,
                             camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y,
                             camera.scale * bbox.x, camera.scale * bbox.y};

@@ -103,9 +102,8 @@ renderer_init(const int& mx, const int& my)
    SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS);

    // Setup window
-    window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED,
-                              SDL_WINDOWPOS_UNDEFINED, window_width,
-                              window_height, SDL_WINDOW_SHOWN);
+    window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
+                              window_width, window_height, SDL_WINDOW_SHOWN);

    // Setup SDL renderer
    renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
@@ -118,24 +116,24 @@ renderer_init(const int& mx, const int& my)
    datasurface_height = my;
    // vec drawing uses the surface of the first component, no memory issues here
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
-        surfaces[i] = SDL_CreateRGBSurfaceWithFormat(
-            0, datasurface_width, datasurface_height, window_bpp,
-            SDL_PIXELFORMAT_RGBA8888);
+        surfaces[i] = SDL_CreateRGBSurfaceWithFormat(0, datasurface_width, datasurface_height,
+                                                     window_bpp, SDL_PIXELFORMAT_RGBA8888);

    camera.pos   = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
-                          -.5f * (num_tiles / tiles_per_row) * datasurface_height + .5f * datasurface_height};
+                          -.5f * (num_tiles / tiles_per_row) * datasurface_height +
+                              .5f * datasurface_height};
    camera.scale = min(window_width / float(datasurface_width * tiles_per_row),
                       window_height / float(datasurface_height * (num_tiles / tiles_per_row)));

    SDL_RendererInfo renderer_info;
    SDL_GetRendererInfo(renderer, &renderer_info);
-    printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width, renderer_info.max_texture_height);
+    printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width,
+           renderer_info.max_texture_height);
    return 0;
 }

 static int
-set_pixel(const int& i, const int& j, const uint32_t& color,
-          SDL_Surface* surface)
+set_pixel(const int& i, const int& j, const uint32_t& color, SDL_Surface* surface)
 {
    uint32_t* pixels           = (uint32_t*)surface->pixels;
    pixels[i + j * surface->w] = color;
@@ -143,8 +141,7 @@ set_pixel(const int& i, const int& j, const uint32_t& color,
 }

 static int
-draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
-                   const int& tile)
+draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, const int& tile)
 {
    const float xoffset = (tile % tiles_per_row) * datasurface_width;
    const float yoffset = -(tile / tiles_per_row) * datasurface_height;
@@ -166,29 +163,23 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,

            const int idx       = AC_VTXBUF_IDX(i, j, k, mesh.info);
            const uint8_t shade = (uint8_t)(
-                255.f *
-                (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) /
-                range);
+                255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) / range);
            uint8_t color[4]            = {0, 0, 0, 255};
            color[tile % 3]             = shade;
-            const uint32_t mapped_color = SDL_MapRGBA(
-                surfaces[vertex_buffer]->format, color[0], color[1], color[2],
-                color[3]);
+            const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer]->format, color[0],
+                                                      color[1], color[2], color[3]);
            set_pixel(i, j, mapped_color, surfaces[vertex_buffer]);
        }
    }

    const float2 pos   = (float2){xoffset, yoffset};
-    const float2 bbox  = (float2){.5f * datasurface_width,
-                                 .5f * datasurface_height};
+    const float2 bbox  = (float2){.5f * datasurface_width, .5f * datasurface_height};
    const float2 wsize = (float2){float(window_width), float(window_height)};
    const vec4 rectf   = project_ortho(pos, bbox, wsize);
-    SDL_Rect rect      = (SDL_Rect){
-        int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
+    SDL_Rect rect      = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
                               int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};

-    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
-                                                    surfaces[vertex_buffer]);
+    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer]);
    SDL_RenderCopy(renderer, tex, NULL, &rect);
    SDL_DestroyTexture(tex);

@@ -196,11 +187,9 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
 }

 static int
-draw_vertex_buffer_vec(const AcMesh& mesh,
-                       const VertexBufferHandle& vertex_buffer_a,
+draw_vertex_buffer_vec(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer_a,
                       const VertexBufferHandle& vertex_buffer_b,
-                       const VertexBufferHandle& vertex_buffer_c,
-                       const int& tile)
+                       const VertexBufferHandle& vertex_buffer_c, const int& tile)
 {
    const float xoffset = (tile % tiles_per_row) * datasurface_width;
    const float yoffset = -(tile / tiles_per_row) * datasurface_height;
@@ -215,14 +204,12 @@ draw_vertex_buffer_vec(const AcMesh& mesh,
            min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b),
                model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c))));
    */
-    const float maxx = float(
-        max(acReduceScal(RTYPE_MAX, vertex_buffer_a),
-            max(acReduceScal(RTYPE_MAX, vertex_buffer_b),
-                acReduceScal(RTYPE_MAX, vertex_buffer_c))));
-    const float minn = float(
-        min(acReduceScal(RTYPE_MIN, vertex_buffer_a),
-            min(acReduceScal(RTYPE_MIN, vertex_buffer_b),
-                acReduceScal(RTYPE_MIN, vertex_buffer_c))));
+    const float maxx  = float(max(
+        acReduceScal(RTYPE_MAX, vertex_buffer_a),
+        max(acReduceScal(RTYPE_MAX, vertex_buffer_b), acReduceScal(RTYPE_MAX, vertex_buffer_c))));
+    const float minn  = float(min(
+        acReduceScal(RTYPE_MIN, vertex_buffer_a),
+        min(acReduceScal(RTYPE_MIN, vertex_buffer_b), acReduceScal(RTYPE_MIN, vertex_buffer_c))));
    const float range = fabsf(maxx - minn);
    const float mid   = maxx - .5f * range;

@@ -233,34 +220,25 @@ draw_vertex_buffer_vec(const AcMesh& mesh,

            const int idx   = AC_VTXBUF_IDX(i, j, k, mesh.info);
            const uint8_t r = (uint8_t)(
-                255.f *
-                (fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) /
-                range);
+                255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) / range);
            const uint8_t g = (uint8_t)(
-                255.f *
-                (fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) /
-                range);
+                255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) / range);
            const uint8_t b = (uint8_t)(
-                255.f *
-                (fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) /
-                range);
-            const uint32_t mapped_color = SDL_MapRGBA(
-                surfaces[vertex_buffer_a]->format, r, g, b, 255);
+                255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) / range);
+            const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer_a]->format, r, g, b,
+                                                      255);
            set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]);
        }
    }

    const float2 pos   = (float2){xoffset, yoffset};
-    const float2 bbox  = (float2){.5f * datasurface_width,
-                                 .5f * datasurface_height};
+    const float2 bbox  = (float2){.5f * datasurface_width, .5f * datasurface_height};
    const float2 wsize = (float2){float(window_width), float(window_height)};
    const vec4 rectf   = project_ortho(pos, bbox, wsize);
-    SDL_Rect rect      = (SDL_Rect){
-        int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
+    SDL_Rect rect      = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
                               int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};

-    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
-                                                    surfaces[vertex_buffer_a]);
+    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer_a]);
    SDL_RenderCopy(renderer, tex, NULL, &rect);
    SDL_DestroyTexture(tex);

@@ -272,13 +250,11 @@ renderer_draw(const AcMesh& mesh)
 {
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
        draw_vertex_buffer(mesh, VertexBufferHandle(i), i);
-    draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ,
-                           NUM_VTXBUF_HANDLES);
+    draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, NUM_VTXBUF_HANDLES);

    // Drawing done, present
    SDL_RenderPresent(renderer);
-    SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b,
-                           color_bg.a);
+    SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b, color_bg.a);
    SDL_RenderClear(renderer);

    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
@@ -404,13 +380,13 @@ run_renderer(void)

 /* Step the simulation */
 #if 1
-        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
-                                        VTXBUF_UUZ);
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
        const AcReal dt   = host_timestep(umax, mesh_info);
        acIntegrate(dt);
 #else
        ModelMesh* model_mesh = modelmesh_create(mesh->info);
-        const AcReal umax = AcReal(model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
+        const AcReal umax     = AcReal(
+            model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
        const AcReal dt = host_timestep(umax, mesh_info);
        acmesh_to_modelmesh(*mesh, model_mesh);
        model_rk3(dt, model_mesh);
--- a/src/standalone/simulation.cc
+++ b/src/standalone/simulation.cc
@@ -63,8 +63,8 @@ print_diagnostics(const AcMesh& mesh, const int& step, const AcReal& dt)
 // Write all setting info into a separate ascii file. This is done to guarantee
 // that we have the data specifi information in the thing, even though in
 // principle these things are in the astaroth.conf.
-static inline 
-void write_mesh_info(const AcMeshInfo* config)
+static inline void
+write_mesh_info(const AcMeshInfo* config)
 {

    FILE* infotxt;
@@ -111,13 +111,11 @@ void write_mesh_info(const AcMeshInfo* config)
    fclose(infotxt);
 }

-
 // This funtion writes a run state into a set of C binaries. For the sake of
 // accuracy, all floating point numbers are to be saved in long double precision
 // regardless of the choise of accuracy during runtime.
 static inline void
-save_mesh(const AcMesh &save_mesh, const int step, 
-          const AcReal t_step)
+save_mesh(const AcMesh& save_mesh, const int step, const AcReal t_step)
 {
    FILE* save_ptr;

@@ -152,11 +150,8 @@ save_mesh(const AcMesh &save_mesh, const int step,
        }
        fclose(save_ptr);
    }
-
 }

-
-
 // This function prints out the diagnostic values to std.out and also saves and
 // appends an ascii file to contain all the result.
 static inline void
@@ -174,11 +169,10 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
    // MV: The ordering in the earlier version was wrong in terms of variable
    // MV: name and its diagnostics.
    printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt));
-    printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total",
-           double(buf_min), double(buf_rms), double(buf_max));
-    fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), 
-           double(buf_min), double(buf_rms), double(buf_max));
-    
+    printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total", double(buf_min),
+           double(buf_rms), double(buf_max));
+    fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), double(buf_min),
+            double(buf_rms), double(buf_max));

    // Calculate rms, min and max from the variables as scalars
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
@@ -213,7 +207,6 @@ run_simulation(void)
    acInit(mesh_info);
    acLoad(*mesh);

-
    FILE* diag_file;
    diag_file = fopen("timeseries.ts", "a");
    // TODO Get time from earlier state.
@@ -222,7 +215,8 @@ run_simulation(void)
    // Generate the title row.
    fprintf(diag_file, "step  t_step  dt  uu_total_min  uu_total_rms  uu_total_max  ");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-        fprintf(diag_file, "%s_min  %s_rms  %s_max  ", vtxbuf_names[i], vtxbuf_names[i], vtxbuf_names[i]);
+        fprintf(diag_file, "%s_min  %s_rms  %s_max  ", vtxbuf_names[i], vtxbuf_names[i],
+                vtxbuf_names[i]);
    }

    fprintf(diag_file, "\n");
@@ -243,8 +237,7 @@ run_simulation(void)

    /* Step the simulation */
    for (int i = 1; i < max_steps; ++i) {
-        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
-                                        VTXBUF_UUZ);
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
        const AcReal dt   = host_timestep(umax, mesh_info);
        acIntegrate(dt);

@@ -266,7 +259,6 @@ run_simulation(void)
                which can be very useful when observing behaviour of turbulent
                simulations. (TODO)
            */
-
        }

        /* Save the simulation state and print diagnostics */
@@ -300,9 +292,7 @@ run_simulation(void)
            save_mesh(*mesh, i, t_step);

            bin_crit_t += bin_save_t;
-
        }
-
    }

    //////Save the final snapshot
@@ -318,25 +308,3 @@ run_simulation(void)

    return 0;
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
--- a/src/standalone/timer_hires.h
+++ b/src/standalone/timer_hires.h
@@ -52,8 +52,7 @@ timer_diff_nsec(const Timer start)
 {
    Timer end;
    timer_reset(&end);
-    const long diff = (end.tv_sec - start.tv_sec) * 1000000000l +
-                      (end.tv_nsec - start.tv_nsec);
+    const long diff = (end.tv_sec - start.tv_sec) * 1000000000l + (end.tv_nsec - start.tv_nsec);
    return diff;
 }