diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 66d4c9d..fc8eb5d 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -316,9 +316,13 @@ traverse(const ASTNode* node)
             if (symbol_table[i].type_qualifier == IN) {
                 printf("const %sData %s = READ(%s%s);\n", translate(symbol_table[i].type_specifier),
                        symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
-            } else if (symbol_table[i].type_qualifier == OUT) {
-                printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
-                //printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
+            }
+            else if (symbol_table[i].type_qualifier == OUT) {
+                printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier),
+                       symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
+                // printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n",
+                // translate(symbol_table[i].type_specifier), symbol_table[i].identifier,
+                // inout_name_prefix, symbol_table[i].identifier);
             }
         }
     }
@@ -326,8 +330,7 @@ traverse(const ASTNode* node)
     // Preprocessed parameter boilerplate
     if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED)
         inside_preprocessed = true;
-    static const char
-        preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
+    static const char preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
     if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
         printf("%s ", preprocessed_parameter_boilerplate);
     // BOILERPLATE END////////////////////////////////////////////////////////
@@ -343,7 +346,6 @@ traverse(const ASTNode* node)
     if (node->type == NODE_FUNCTION_DECLARATION)
         inside_function_declaration = false;
 
-
     // If the node is a subscript expression and the expression list inside it is not empty
     if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
         printf("IDX(");
@@ -354,7 +356,7 @@ traverse(const ASTNode* node)
         if (handle >= 0) { // The variable exists in the symbol table
             const Symbol* symbol = &symbol_table[handle];
 
-            //if (symbol->type_qualifier == OUT) {
+            // if (symbol->type_qualifier == OUT) {
             //    printf("%s%s", inout_name_prefix, symbol->identifier);
             //}
             if (symbol->type_qualifier == UNIFORM) {
@@ -394,14 +396,16 @@ traverse(const ASTNode* node)
     // Postfix logic  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
     // If the node is a subscript expression and the expression list inside it is not empty
     if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
-        printf(")");    // Closing bracket of IDX()
+        printf(")"); // Closing bracket of IDX()
 
     // Generate writeback boilerplate for OUT fields
     if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) {
         for (int i = 0; i < num_symbols; ++i) {
             if (symbol_table[i].type_qualifier == OUT) {
-                printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
-                //printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
+                printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier,
+                       symbol_table[i].identifier);
+                // printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n",
+                // inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
             }
         }
     }
@@ -486,8 +490,8 @@ generate_preprocessed_structures(void)
 
     for (int i = 0; i < num_symbols; ++i) {
         if (symbol_table[i].type_qualifier == PREPROCESSED)
-            printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n", symbol_table[i].identifier,
-                   symbol_table[i].identifier);
+            printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n",
+                   symbol_table[i].identifier, symbol_table[i].identifier);
     }
     printf("return data;\n");
     printf("}\n");
diff --git a/include/astaroth.h b/include/astaroth.h
index 14eed0c..10b883c 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -41,7 +41,6 @@ extern "C" {
 #include <stdlib.h>       // size_t
 #include <vector_types.h> // CUDA vector types (float4, etc)
 
-
 /*
  * =============================================================================
  * Flags for auto-optimization
@@ -59,7 +58,6 @@ extern "C" {
 #define NUM_ITERATIONS (10)
 #define WARP_SIZE (32)
 
-
 /*
  * =============================================================================
  * Compile-time constants used during simulation (user definable)
@@ -75,7 +73,8 @@ extern "C" {
 
 // L-prefix inherited from the old Astaroth, no idea what it means
 // MV: L means a Logical switch variale, something having true of false value.
-#define LFORCING (0) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL)
+// Note: forcing is disabled currently in the files generated by acc (compiler of our DSL)
+#define LFORCING (0)
 #define LINDUCTION (1)
 #define LENTROPY (1)
 #define LTEMPERATURE (0)
@@ -258,28 +257,16 @@ typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
  * Reduction types
  * =============================================================================
  */
-typedef enum {
-    RTYPE_MAX,
-    RTYPE_MIN,
-    RTYPE_RMS,
-    RTYPE_RMS_EXP,
-    NUM_REDUCTION_TYPES
-} ReductionType;
+typedef enum { RTYPE_MAX, RTYPE_MIN, RTYPE_RMS, RTYPE_RMS_EXP, NUM_REDUCTION_TYPES } ReductionType;
 
 /*
  * =============================================================================
  * Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH)
  * =============================================================================
  */
-typedef enum {
-    AC_FOR_INT_PARAM_TYPES(AC_GEN_ID),
-    NUM_INT_PARAM_TYPES
-} AcIntParam;
+typedef enum { AC_FOR_INT_PARAM_TYPES(AC_GEN_ID), NUM_INT_PARAM_TYPES } AcIntParam;
 
-typedef enum {
-    AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID),
-    NUM_REAL_PARAM_TYPES
-} AcRealParam;
+typedef enum { AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID), NUM_REAL_PARAM_TYPES } AcRealParam;
 
 extern const char* intparam_names[];  // Defined in astaroth.cu
 extern const char* realparam_names[]; // Defined in astaroth.cu
@@ -294,9 +281,7 @@ typedef struct {
  * Definitions for the enums and structs for AcMesh (DO NOT TOUCH)
  * =============================================================================
  */
-typedef enum {
-    AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES
-} VertexBufferHandle;
+typedef enum { AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES } VertexBufferHandle;
 
 extern const char* vtxbuf_names[]; // Defined in astaroth.cu
 
@@ -316,22 +301,20 @@ typedef struct {
     AcMeshInfo info;
 } AcMesh;
 
-#define AC_VTXBUF_SIZE(mesh_info)                                              \
-    ((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] *      \
+#define AC_VTXBUF_SIZE(mesh_info)                                                                  \
+    ((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] *                          \
               mesh_info.int_params[AC_mz]))
 
-#define AC_VTXBUF_SIZE_BYTES(mesh_info)                                        \
-    (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
+#define AC_VTXBUF_SIZE_BYTES(mesh_info) (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
 
-#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info)                                   \
-    (mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] *               \
-     mesh_info.int_params[AC_nz])
+#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info)                                                       \
+    (mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * mesh_info.int_params[AC_nz])
 
-#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info)                             \
+#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info)                                                 \
     (sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info))
 
-#define AC_VTXBUF_IDX(i, j, k, mesh_info)                                      \
-    ((i) + (j)*mesh_info.int_params[AC_mx] +                                   \
+#define AC_VTXBUF_IDX(i, j, k, mesh_info)                                                          \
+    ((i) + (j)*mesh_info.int_params[AC_mx] +                                                       \
      (k)*mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my])
 
 /*
diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu
index 58b551b..4692df3 100644
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -28,33 +28,24 @@
 #include "errchk.h"
 
 #include "device.cuh"
-#include "math_utils.h" // sum for reductions
+#include "math_utils.h"               // sum for reductions
 #include "standalone/config_loader.h" // update_config
 
-const char* intparam_names[]      = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
-const char* realparam_names[]     = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
-const char* vtxbuf_names[]        = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
+const char* intparam_names[]  = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
+const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
+const char* vtxbuf_names[]    = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
 
-
-static const int MAX_NUM_DEVICES = 32;
-static int num_devices = 1;
+static const int MAX_NUM_DEVICES       = 32;
+static int num_devices                 = 1;
 static Device devices[MAX_NUM_DEVICES] = {};
 
 static Grid
 createGrid(const AcMeshInfo& config)
 {
     Grid grid;
-    grid.m = (int3) {
-        config.int_params[AC_mx],
-        config.int_params[AC_my],
-        config.int_params[AC_mz]
-    };
 
-    grid.n = (int3) {
-        config.int_params[AC_nx],
-        config.int_params[AC_ny],
-        config.int_params[AC_nz]
-    };
+    grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
+    grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
 
     return grid;
 }
@@ -71,8 +62,7 @@ gridIdx(const Grid& grid, const int i, const int j, const int k)
 static int3
 gridIdx3d(const Grid& grid, const int idx)
 {
-    return (int3){idx % grid.m.x,
-                 (idx % (grid.m.x * grid.m.y)) / grid.m.x,
+    return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
                   idx / (grid.m.x * grid.m.y)};
 }
 
@@ -119,10 +109,12 @@ acInit(const AcMeshInfo& config)
     ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
     ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
 
-    printf("Grid m "); printInt3(grid.m); printf("\n");
-    printf("Grid n "); printInt3(grid.n); printf("\n");
+    // clang-format off
+    printf("Grid m ");   printInt3(grid.m);    printf("\n");
+    printf("Grid n ");   printInt3(grid.n);    printf("\n");
     printf("Subrid m "); printInt3(subgrid.m); printf("\n");
     printf("Subrid n "); printInt3(subgrid.n); printf("\n");
+    // clang-format on
 
     // Initialize the devices
     for (int i = 0; i < num_devices; ++i) {
@@ -202,8 +194,10 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
         */
         if (db.z >= da.z) {
             const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
-            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
-            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n");
+            const int3 da_local  = (int3){
+                da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
+            // printf("\n");
             copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
         }
         printf("\n");
@@ -236,8 +230,10 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
         */
         if (db.z >= da.z) {
             const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
-            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
-            // printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n");
+            const int3 da_local  = (int3){
+                da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
+            // printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local);
+            // printf("\n");
             copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
         }
         printf("\n");
@@ -262,10 +258,9 @@ acStore(AcMesh* host_mesh)
 AcResult
 acIntegrateStep(const int& isubstep, const AcReal& dt)
 {
-    const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2};
-    const int3 end   = (int3){STENCIL_ORDER/2 + subgrid.n.x,
-                              STENCIL_ORDER/2 + subgrid.n.y,
-                              STENCIL_ORDER/2 + subgrid.n.z};
+    const int3 start = (int3){STENCIL_ORDER / 2, STENCIL_ORDER / 2, STENCIL_ORDER / 2};
+    const int3 end   = (int3){STENCIL_ORDER / 2 + subgrid.n.x, STENCIL_ORDER / 2 + subgrid.n.y,
+                            STENCIL_ORDER / 2 + subgrid.n.z};
     for (int i = 0; i < num_devices; ++i) {
         rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
     }
@@ -278,121 +273,125 @@ acBoundcondStep(void)
 {
     acSynchronize();
     if (num_devices == 1) {
-        boundcondStep(devices[0], STREAM_PRIMARY,
-                      (int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
-    } else {
+        boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0},
+                      (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
+    }
+    else {
         // Local boundary conditions
         for (int i = 0; i < num_devices; ++i) {
-            const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE
+            const int3 d0 = (int3){0, 0, STENCIL_ORDER / 2}; // DECOMPOSITION OFFSET HERE
             const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
             boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
         }
 
-/*
-// ===MIIKKANOTE START==========================================
-%JP: The old way for computing boundary conditions conflicts with the
-way we have to do things with multiple GPUs.
+        /*
+        // ===MIIKKANOTE START==========================================
+        %JP: The old way for computing boundary conditions conflicts with the
+        way we have to do things with multiple GPUs.
 
-The older approach relied on unified memory, which represented the whole
-memory area as one huge mesh instead of several smaller ones. However, unified memory
-in its current state is more meant for quick prototyping when performance is not an issue.
-Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
-when managing the memory explicitly.
+        The older approach relied on unified memory, which represented the whole
+        memory area as one huge mesh instead of several smaller ones. However, unified memory
+        in its current state is more meant for quick prototyping when performance is not an issue.
+        Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
+        than when managing the memory explicitly.
 
-In this new approach, I have simplified the multi- and single-GPU layers significantly.
-Quick rundown:
-	New struct: Grid. There are two global variables, "grid" and "subgrid", which
-	contain the extents of the whole simulation domain and the decomposed grids, respectively.
-	To simplify thing, we require that each GPU is assigned the same amount of work,
-	therefore each GPU in the node is assigned and "subgrid.m" -sized block of data
-	to work with.
+        In this new approach, I have simplified the multi- and single-GPU layers significantly.
+        Quick rundown:
+                New struct: Grid. There are two global variables, "grid" and "subgrid", which
+                contain the extents of the whole simulation domain and the decomposed grids,
+        respectively. To simplify thing, we require that each GPU is assigned the same amount of
+        work, therefore each GPU in the node is assigned and "subgrid.m" -sized block of data to
+        work with.
 
-	The whole simulation domain is decomposed with respect to the z dimension.
-	For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
-	contain (nx, ny, nz / num_devices) vertices.
+                The whole simulation domain is decomposed with respect to the z dimension.
+                For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
+                contain (nx, ny, nz / num_devices) vertices.
 
-	An local index (i, j, k) in some subgrid can be mapped to the global grid with
-		global idx = (i, j, k + device_id * subgrid.n.z)
+                An local index (i, j, k) in some subgrid can be mapped to the global grid with
+                        global idx = (i, j, k + device_id * subgrid.n.z)
 
-Terminology:
-	- Single-GPU function: a function defined on the single-GPU layer (device.cu)
+        Terminology:
+                - Single-GPU function: a function defined on the single-GPU layer (device.cu)
 
-Changes required to this commented code block:
-	- The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu
-	  instead. Same holds for any complex index calculations. Instead, the local coordinates
-  	  should be passed as an int3 type without having to consider how the data is actually
-	  laid out in device memory
-	- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
-	  of type "Device" which should be passed to single-GPU functions. In this file, all devices
-	  are stored in a global array "devices[num_devices]".
-	- Every single-GPU function is executed asynchronously by default such that we
-	  can optimize Astaroth by executing memory transactions concurrently with computation.
-	  Therefore a StreamType should be passed as a parameter to single-GPU functions.
-	  Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
-	  as a parameter and commands executing in different streams can be processed
-	  in parallel/concurrently.
+        Changes required to this commented code block:
+                - The thread block dimensions (tpb) are no longer passed to the kernel here but in
+        device.cu instead. Same holds for any complex index calculations. Instead, the local
+        coordinates should be passed as an int3 type without having to consider how the data is
+        actually laid out in device memory
+                - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
+        handle of type "Device" which should be passed to single-GPU functions. In this file, all
+        devices are stored in a global array "devices[num_devices]".
+                - Every single-GPU function is executed asynchronously by default such that we
+                  can optimize Astaroth by executing memory transactions concurrently with
+        computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
+                  Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
+                  as a parameter and commands executing in different streams can be processed
+                  in parallel/concurrently.
 
 
-Note on periodic boundaries (might be helpful when implementing other boundary conditions):
+        Note on periodic boundaries (might be helpful when implementing other boundary conditions):
 
-	With multiple GPUs, periodic boundary conditions applied on indices ranging from
+                With multiple GPUs, periodic boundary conditions applied on indices ranging from
 
-		(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
+                        (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
+        STENCIL_ORDER/2)
 
-	on a single device are "local", in the sense that they can be computed without having
-	to exchange data with neighboring GPUs. Special care is needed only for transferring
-	the data to the fron and back plates outside this range. In the solution we use here,
-	we solve the local boundaries first, and then just exchange the front and back plates
-	in a "ring", like so
-				device_id
-		    (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
+                on a single device are "local", in the sense that they can be computed without
+        having to exchange data with neighboring GPUs. Special care is needed only for transferring
+                the data to the fron and back plates outside this range. In the solution we use
+        here, we solve the local boundaries first, and then just exchange the front and back plates
+                in a "ring", like so
+                                        device_id
+                            (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
 
 
-// ======MIIKKANOTE END==========================================
+        // ======MIIKKANOTE END==========================================
 
-<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
-                                                          moved into device.cu, function boundCondStep()
-                                                          In astaroth.cu, we use acBoundcondStep()
-                                                          just to distribute the work and manage
-                                                          communication between GPUs.
+        <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
+                                                                  moved into device.cu, function
+        boundCondStep() In astaroth.cu, we use acBoundcondStep() just to distribute the work and
+        manage communication between GPUs.
 
-    printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS);
+            printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y,
+        best_dims.z, double(best_time) / NUM_ITERATIONS);
 
-    exit(0);
-    #else
+            exit(0);
+            #else
 
 
-        const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
+                const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
 
-        const int3 start = (int3){0, 0, device_id * depth};
-        const int3 end = (int3){mesh_info.int_params[AC_mx],
-                                mesh_info.int_params[AC_my],
-                                min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
+                const int3 start = (int3){0, 0, device_id * depth};
+                const int3 end = (int3){mesh_info.int_params[AC_mx],
+                                        mesh_info.int_params[AC_my],
+                                        min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
 
-        const dim3 tpb(8,2,8);
+                const dim3 tpb(8,2,8);
 
-        // TODO uses the default stream currently
-        if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
-            wedge_boundconds(0, tpb, start, end, d_buffer);
-        } else {
-            for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
-                periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
-<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-*/
+                // TODO uses the default stream currently
+                if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
+                    wedge_boundconds(0, tpb, start, end, d_buffer);
+                } else {
+                    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
+                        periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
+        <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+        */
         // Exchange halos
         for (int i = 0; i < num_devices; ++i) {
-            const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2;
+            const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER / 2;
             // ...|ooooxxx|... -> xxx|ooooooo|...
             {
-                const int3 src = (int3) {0, 0, subgrid.n.z};
-                const int3 dst = (int3) {0, 0, 0};
-                copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices);
+                const int3 src = (int3){0, 0, subgrid.n.z};
+                const int3 dst = (int3){0, 0, 0};
+                copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src,
+                                       devices[(i + 1) % num_devices], dst, num_vertices);
             }
             // ...|ooooooo|xxx <- ...|xxxoooo|...
             {
-                const int3 src = (int3) {0, 0, STENCIL_ORDER/2};
-                const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z};
-                copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices);
+                const int3 src = (int3){0, 0, STENCIL_ORDER / 2};
+                const int3 dst = (int3){0, 0, STENCIL_ORDER / 2 + subgrid.n.z};
+                copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src,
+                                       devices[i], dst, num_vertices);
             }
         }
     }
@@ -427,26 +426,28 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
     for (int i = 1; i < n; ++i) {
         if (rtype == RTYPE_MAX) {
             res = max(res, results[i]);
-        } else if (rtype == RTYPE_MIN) {
+        }
+        else if (rtype == RTYPE_MIN) {
             res = min(res, results[i]);
-        } else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        }
+        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
             res = sum(res, results[i]);
-        } else {
+        }
+        else {
             ERROR("Invalid rtype");
         }
     }
 
     if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
         const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
-        res = sqrt(inv_n * res);
+        res                = sqrt(inv_n * res);
     }
 
     return res;
 }
 
 AcReal
-acReduceScal(const ReductionType& rtype,
-             const VertexBufferHandle& vtxbuffer_handle)
+acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
 {
     AcReal results[num_devices];
     for (int i = 0; i < num_devices; ++i) {
@@ -457,8 +458,8 @@ acReduceScal(const ReductionType& rtype,
 }
 
 AcReal
-acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
-            const VertexBufferHandle& b, const VertexBufferHandle& c)
+acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
+            const VertexBufferHandle& c)
 {
     AcReal results[num_devices];
     for (int i = 0; i < num_devices; ++i) {
diff --git a/src/core/device.cu b/src/core/device.cu
index 1ef2648..6df10f7 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -36,7 +36,7 @@ typedef struct {
 __constant__ AcMeshInfo d_mesh_info;
 __constant__ int3 d_multigpu_offset;
 __constant__ Grid globalGrid;
-#define DCONST_INT(X)  (d_mesh_info.int_params[X])
+#define DCONST_INT(X) (d_mesh_info.int_params[X])
 #define DCONST_REAL(X) (d_mesh_info.real_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
 #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
@@ -76,46 +76,46 @@ printDeviceInfo(const Device device)
     printf("    Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
     printf("    Stream processors: %d\n", props.multiProcessorCount);
     printf("    SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
-    printf("    Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
+    printf(
+        "    Compute mode: %d\n",
+        (int)props
+            .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
     // Memory
     printf("  Global memory\n");
     printf("    Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
     printf("    Memory Bus Width (bits): %d\n", props.memoryBusWidth);
     printf("    Peak Memory Bandwidth (GiB/s): %f\n",
-           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth /
-               (8. * 1024. * 1024. * 1024.));
+           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
     printf("    ECC enabled: %d\n", props.ECCEnabled);
     // Memory usage
     size_t free_bytes, total_bytes;
     cudaMemGetInfo(&free_bytes, &total_bytes);
     const size_t used_bytes = total_bytes - free_bytes;
-    printf("    Total global mem: %.2f GiB\n",
-           props.totalGlobalMem / (1024.0 * 1024 * 1024));
+    printf("    Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
     printf("    Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
-    printf("    Gmem memory free (GiB): %.2f\n",
-           free_bytes / (1024.0 * 1024 * 1024));
-    printf("    Gmem memory total (GiB): %.2f\n",
-           total_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
     printf("  Caches\n");
     printf("    Local L1 cache supported: %d\n", props.localL1CacheSupported);
     printf("    Global L1 cache supported: %d\n", props.globalL1CacheSupported);
     printf("    L2 size: %d KiB\n", props.l2CacheSize / (1024));
     printf("    Total const mem: %ld KiB\n", props.totalConstMem / (1024));
-    printf("    Shared mem per block: %ld KiB\n",
-           props.sharedMemPerBlock / (1024));
+    printf("    Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
     printf("  Other\n");
     printf("    Warp size: %d\n", props.warpSize);
     // printf("    Single to double perf. ratio: %dx\n",
     // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
     // versions
-    printf("    Stream priorities supported: %d\n",
-           props.streamPrioritiesSupported);
+    printf("    Stream priorities supported: %d\n", props.streamPrioritiesSupported);
     printf("--------------------------------------------------\n");
 
     return AC_SUCCESS;
 }
 
-static __global__ void dummy_kernel(void) {}
+static __global__ void
+dummy_kernel(void)
+{
+}
 
 AcResult
 createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
@@ -124,10 +124,10 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
     cudaDeviceReset();
 
     // Create Device
-    struct device_s* device = (struct device_s*) malloc(sizeof(*device));
+    struct device_s* device = (struct device_s*)malloc(sizeof(*device));
     ERRCHK_ALWAYS(device);
 
-    device->id = id;
+    device->id           = id;
     device->local_config = device_config;
 
     // Check that the code was compiled for the proper GPU architecture
@@ -150,15 +150,14 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
         ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
         ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
     }
-    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad,
-                                  AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMalloc(&device->reduce_scratchpad, AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
     ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
 
     // Device constants
     ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
                                           cudaMemcpyHostToDevice));
 
-
     // Multi-GPU offset. This is used to compute globalVertexIdx.
     // Might be better to calculate this in astaroth.cu instead of here, s.t.
     // everything related to the decomposition is limited to the multi-GPU layer
@@ -166,7 +165,6 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
     ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
                                           sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));
 
-
     printf("Created device %d (%p)\n", device->id, device);
     *device_handle = device;
     return AC_SUCCESS;
@@ -211,53 +209,44 @@ reduceScal(const Device device, const StreamType stream_type, const ReductionTyp
 {
     cudaSetDevice(device->id);
 
-    const int3 start = (int3) {device->local_config.int_params[AC_nx_min],
-                               device->local_config.int_params[AC_ny_min],
-                               device->local_config.int_params[AC_nz_min]
-    };
+    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
+                              device->local_config.int_params[AC_ny_min],
+                              device->local_config.int_params[AC_nz_min]};
 
-    const int3 end = (int3) {device->local_config.int_params[AC_nx_max],
-                             device->local_config.int_params[AC_ny_max],
-                             device->local_config.int_params[AC_nz_max]
-    };
+    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
+                            device->local_config.int_params[AC_ny_max],
+                            device->local_config.int_params[AC_nz_max]};
 
-    *result = reduce_scal(device->streams[stream_type], rtype,
-                          start, end, device->vba.in[vtxbuf_handle],
-                          device->reduce_scratchpad, device->reduce_result);
+    *result = reduce_scal(device->streams[stream_type], rtype, start, end,
+                          device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
+                          device->reduce_result);
     return AC_SUCCESS;
 }
 
 AcResult
-reduceVec(const Device device, const StreamType stream_type,
-          const ReductionType rtype,
-          const VertexBufferHandle vtxbuf0,
-          const VertexBufferHandle vtxbuf1,
-          const VertexBufferHandle vtxbuf2,
-          AcReal* result)
+reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
+          const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+          const VertexBufferHandle vtxbuf2, AcReal* result)
 {
     cudaSetDevice(device->id);
 
-    const int3 start = (int3) {device->local_config.int_params[AC_nx_min],
-                               device->local_config.int_params[AC_ny_min],
-                               device->local_config.int_params[AC_nz_min]
-    };
+    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
+                              device->local_config.int_params[AC_ny_min],
+                              device->local_config.int_params[AC_nz_min]};
 
-    const int3 end = (int3) {device->local_config.int_params[AC_nx_max],
-                             device->local_config.int_params[AC_ny_max],
-                             device->local_config.int_params[AC_nz_max]
-    };
+    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
+                            device->local_config.int_params[AC_ny_max],
+                            device->local_config.int_params[AC_nz_max]};
 
-    *result = reduce_vec(device->streams[stream_type], rtype, start, end,
-                          device->vba.in[vtxbuf0],
-                          device->vba.in[vtxbuf1],
-                          device->vba.in[vtxbuf2],
-                          device->reduce_scratchpad, device->reduce_result);
+    *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
+                         device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
+                         device->reduce_scratchpad, device->reduce_result);
     return AC_SUCCESS;
 }
 
 AcResult
-rkStep(const Device device, const StreamType stream_type, const int step_number,
-       const int3& start, const int3& end, const AcReal dt)
+rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
+       const int3& end, const AcReal dt)
 {
     cudaSetDevice(device->id);
     rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
@@ -270,65 +259,62 @@ synchronize(const Device device, const StreamType stream_type)
     cudaSetDevice(device->id);
     if (stream_type == STREAM_ALL) {
         cudaDeviceSynchronize();
-    } else {
+    }
+    else {
         cudaStreamSynchronize(device->streams[stream_type]);
     }
     return AC_SUCCESS;
 }
 
 static AcResult
-loadWithOffset(const Device device, const StreamType stream_type,
-               const AcReal* src, const size_t bytes, AcReal* dst)
+loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
+               const size_t bytes, AcReal* dst)
 {
     cudaSetDevice(device->id);
-    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice,
-                                device->streams[stream_type]));
+    ERRCHK_CUDA(
+        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
     return AC_SUCCESS;
 }
 
 static AcResult
-storeWithOffset(const Device device, const StreamType stream_type,
-                const AcReal* src, const size_t bytes, AcReal* dst)
+storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
+                const size_t bytes, AcReal* dst)
 {
     cudaSetDevice(device->id);
-    ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost,
-                                device->streams[stream_type]));
+    ERRCHK_CUDA(
+        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
     return AC_SUCCESS;
 }
 
 AcResult
-copyMeshToDevice(const Device device, const StreamType stream_type,
-                 const AcMesh& host_mesh, const int3& src, const int3& dst,
-                 const int num_vertices)
+copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
+                 const int3& src, const int3& dst, const int num_vertices)
 {
     const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
     const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-        loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], num_vertices * sizeof(AcReal),
-                       &device->vba.in[i][dst_idx]);
+        loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
+                       num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
     }
     return AC_SUCCESS;
 }
 
 AcResult
-copyMeshToHost(const Device device, const StreamType stream_type,
-               const int3& src, const int3& dst, const int num_vertices,
-               AcMesh* host_mesh)
+copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
+               const int num_vertices, AcMesh* host_mesh)
 {
     const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
     const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
-                        num_vertices * sizeof(AcReal),
-                        &host_mesh->vertex_buffer[i][dst_idx]);
+                        num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
     }
     return AC_SUCCESS;
 }
 
 AcResult
-copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type,
-                       const int3& src, Device dst_device, const int3& dst,
-                       const int num_vertices)
+copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
+                       Device dst_device, const int3& dst, const int num_vertices)
 {
     cudaSetDevice(src_device->id);
     const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
@@ -348,7 +334,7 @@ swapBuffers(const Device device)
 {
     cudaSetDevice(device->id);
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-        AcReal* tmp     = device->vba.in[i];
+        AcReal* tmp        = device->vba.in[i];
         device->vba.in[i]  = device->vba.out[i];
         device->vba.out[i] = tmp;
     }
@@ -364,8 +350,8 @@ loadDeviceConstant(const Device device, const AcIntParam param, const int value)
     // Therefore we have to obfuscate the code a bit and compute the offset address before
     // invoking cudaMemcpyToSymbol.
     const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value),
-                                          offset, cudaMemcpyHostToDevice));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
     return AC_SUCCESS;
 }
 
@@ -374,8 +360,8 @@ loadDeviceConstant(const Device device, const AcRealParam param, const AcReal va
 {
     cudaSetDevice(device->id);
     const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value),
-                                          offset, cudaMemcpyHostToDevice));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
     return AC_SUCCESS;
 }
 
@@ -383,7 +369,7 @@ AcResult
 loadGlobalGrid(const Device device, const Grid grid)
 {
     cudaSetDevice(device->id);
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid),
-                                          0, cudaMemcpyHostToDevice));
+    ERRCHK_CUDA_ALWAYS(
+        cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
     return AC_SUCCESS;
 }
diff --git a/src/core/device.cuh b/src/core/device.cuh
index 35d5209..28dbd50 100644
--- a/src/core/device.cuh
+++ b/src/core/device.cuh
@@ -27,12 +27,14 @@
 #pragma once
 #include "astaroth.h"
 
+// clang-format off
 typedef enum {
-  STREAM_PRIMARY,
-  STREAM_SECONDARY,
-  NUM_STREAM_TYPES,
-  STREAM_ALL
+    STREAM_PRIMARY,
+    STREAM_SECONDARY,
+    NUM_STREAM_TYPES,
+    STREAM_ALL
 } StreamType;
+// clang-format on
 
 typedef struct {
     int3 m;
@@ -52,20 +54,17 @@ AcResult createDevice(const int id, const AcMeshInfo device_config, Device* devi
 AcResult destroyDevice(Device device);
 
 /** */
-AcResult boundcondStep(const Device device, const StreamType stream_type,
-                       const int3& start, const int3& end);
+AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start,
+                       const int3& end);
 
 /** */
 AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
                     const VertexBufferHandle vtxbuf_handle, AcReal* result);
 
 /** */
-AcResult reduceVec(const Device device, const StreamType stream_type,
-                   const ReductionType rtype,
-                   const VertexBufferHandle vec0,
-                   const VertexBufferHandle vec1,
-                   const VertexBufferHandle vec2,
-                   AcReal* result);
+AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
+                   const VertexBufferHandle vec0, const VertexBufferHandle vec1,
+                   const VertexBufferHandle vec2, AcReal* result);
 
 /** */
 AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
@@ -81,9 +80,8 @@ AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
                           const int num_vertices);
 
 /** */
-AcResult copyMeshToHost(const Device device, const StreamType stream_type,
-                        const int3& src, const int3& dst, const int num_vertices,
-                        AcMesh* host_mesh);
+AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src,
+                        const int3& dst, const int num_vertices, AcMesh* host_mesh);
 
 /** */
 AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh
index 4028d8f..84c5fda 100644
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -24,7 +24,7 @@
  * Detailed info.
  *
  */
- #pragma once
+#pragma once
 
 __global__ void
 kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
@@ -38,7 +38,7 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
     if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
         return;
 
-    //if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
+    // if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
     //    return;
 
     // If destination index is inside the computational domain, return since
@@ -69,15 +69,15 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
     j_src += DCONST_INT(AC_ny_min);
     k_src += DCONST_INT(AC_nz_min);
 
-    const int src_idx      = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
-    const int dst_idx      = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
-    vtxbuf[dst_idx] = vtxbuf[src_idx];
+    const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
+    const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
+    vtxbuf[dst_idx]   = vtxbuf[src_idx];
 }
 
 void
 periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
 {
-    const dim3 tpb(8,2,8);
+    const dim3 tpb(8, 2, 8);
     const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
                    (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
                    (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
@@ -89,7 +89,6 @@ periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& en
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 #include <assert.h>
 
-
 static __device__ __forceinline__ int
 IDX(const int i)
 {
@@ -120,14 +119,12 @@ create_rotz(const AcReal radians)
     return mat;
 }
 
-
 #if AC_DOUBLE_PRECISION == 0
 #define sin __sinf
 #define cos __cosf
 #define exp __expf
 #define rsqrt rsqrtf // hardware reciprocal sqrt
-#endif // AC_DOUBLE_PRECISION == 0
-
+#endif               // AC_DOUBLE_PRECISION == 0
 
 /*
 typedef struct {
@@ -155,12 +152,11 @@ first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
 #elif STENCIL_ORDER == 6
     const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
 #elif STENCIL_ORDER == 8
-    const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0,
-                                   -1.0 / 280.0};
+    const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, -1.0 / 280.0};
 #endif
 
-    #define MID (STENCIL_ORDER / 2)
-    AcReal res    = 0;
+#define MID (STENCIL_ORDER / 2)
+    AcReal res = 0;
 
 #pragma unroll
     for (int i = 1; i <= MID; ++i)
@@ -175,17 +171,15 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
 #if STENCIL_ORDER == 2
     const AcReal coefficients[] = {-2., 1.};
 #elif STENCIL_ORDER == 4
-    const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0};
+    const AcReal coefficients[] = {-5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0};
 #elif STENCIL_ORDER == 6
-    const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0,
-                                   1.0 / 90.0};
+    const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, 1.0 / 90.0};
 #elif STENCIL_ORDER == 8
-    const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0,
-                                   8.0 / 315.0, -1.0 / 560.0};
+    const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, 8.0 / 315.0, -1.0 / 560.0};
 #endif
 
-    #define MID (STENCIL_ORDER / 2)
-    AcReal res    = coefficients[0] * pencil[MID];
+#define MID (STENCIL_ORDER / 2)
+    AcReal res = coefficients[0] * pencil[MID];
 
 #pragma unroll
     for (int i = 1; i <= MID; ++i)
@@ -196,31 +190,29 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
 
 /** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
 static __device__ __forceinline__ AcReal
-cross_derivative(const AcReal* __restrict__ pencil_a,
-                 const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a,
-                 const AcReal inv_ds_b)
+cross_derivative(const AcReal* __restrict__ pencil_a, const AcReal* __restrict__ pencil_b,
+                 const AcReal inv_ds_a, const AcReal inv_ds_b)
 {
 #if STENCIL_ORDER == 2
     const AcReal coefficients[] = {0, 1.0 / 4.0};
 #elif STENCIL_ORDER == 4
-    const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
+    const AcReal coefficients[] = {
+        0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
 #elif STENCIL_ORDER == 6
     const AcReal fac            = (1. / 720.);
-    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac,
-                                   2.0 * fac};
+    const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, 2.0 * fac};
 #elif STENCIL_ORDER == 8
     const AcReal fac            = (1. / 20160.);
-    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac,
-                                   128. * fac, -9. * fac};
+    const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, 128. * fac, -9. * fac};
 #endif
 
-    #define MID (STENCIL_ORDER / 2)
-    AcReal res    = AcReal(0.);
+#define MID (STENCIL_ORDER / 2)
+    AcReal res = AcReal(0.);
 
-    #pragma unroll
+#pragma unroll
     for (int i = 1; i <= MID; ++i) {
-        res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] -
-                                  pencil_b[MID + i] - pencil_b[MID - i]);
+        res += coefficients[i] *
+               (pencil_a[MID + i] + pencil_a[MID - i] - pencil_b[MID + i] - pencil_b[MID - i]);
     }
     return res * inv_ds_a * inv_ds_b;
 }
@@ -231,7 +223,8 @@ derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
     AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
+        pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
+                                 vertexIdx.z)];
 
     return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
 }
@@ -242,7 +235,8 @@ derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
     AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)];
+        pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
+                                 vertexIdx.z)];
 
     return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
 }
@@ -262,8 +256,7 @@ derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
         pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
                                    vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
 
-    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
-                            DCONST_REAL(AC_inv_dsy));
+    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsy));
 }
 
 static __device__ __forceinline__ AcReal
@@ -281,8 +274,7 @@ derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
         pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
                                    vertexIdx.z + STENCIL_ORDER / 2 - offset)];
 
-    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx),
-                            DCONST_REAL(AC_inv_dsz));
+    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsz));
 }
 
 static __device__ __forceinline__ AcReal
@@ -291,7 +283,8 @@ dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
     AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
+        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
+                                 vertexIdx.z)];
 
     return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
 }
@@ -302,7 +295,8 @@ deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
     AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)];
+        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
+                                 vertexIdx.z)];
 
     return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
 }
@@ -322,8 +316,7 @@ deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
         pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
                                    vertexIdx.z + STENCIL_ORDER / 2 - offset)];
 
-    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy),
-                            DCONST_REAL(AC_inv_dsz));
+    return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy), DCONST_REAL(AC_inv_dsz));
 }
 
 static __device__ __forceinline__ AcReal
@@ -332,7 +325,8 @@ derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
     AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
+        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
+                                 vertexIdx.z + offset - STENCIL_ORDER / 2)];
 
     return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
 }
@@ -343,7 +337,8 @@ derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
     AcReal pencil[STENCIL_ORDER + 1];
 #pragma unroll
     for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
-        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)];
+        pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
+                                 vertexIdx.z + offset - STENCIL_ORDER / 2)];
 
     return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
 }
@@ -401,8 +396,7 @@ operator-(const AcReal3& a)
     return (AcReal3){-a.x, -a.y, -a.z};
 }
 
-static __host__  __device__ __forceinline__ AcReal3
-operator*(const AcReal a, const AcReal3& b)
+static __host__ __device__ __forceinline__ AcReal3 operator*(const AcReal a, const AcReal3& b)
 {
     return (AcReal3){a * b.x, a * b.y, a * b.z};
 }
@@ -443,7 +437,6 @@ is_valid(const AcReal3& a)
     return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
 }
 
-
 /*
  * =============================================================================
  * Level 1 (Stencil Processing Stage)
@@ -476,8 +469,7 @@ laplace_vec(const AcReal3Data& vec)
 static __device__ __forceinline__ AcReal3
 curl(const AcReal3Data& vec)
 {
-    return (AcReal3){gradient(vec.z).y - gradient(vec.y).z,
-                     gradient(vec.x).z - gradient(vec.z).x,
+    return (AcReal3){gradient(vec.z).y - gradient(vec.y).z, gradient(vec.x).z - gradient(vec.z).x,
                      gradient(vec.y).x - gradient(vec.x).y};
 }
 
@@ -520,7 +512,7 @@ contract(const AcMatrix& mat)
 {
     AcReal res = 0;
 
-    #pragma unroll
+#pragma unroll
     for (int i = 0; i < 3; ++i)
         res += dot(mat.row[i], mat.row[i]);
 
@@ -558,12 +550,13 @@ __constant__ AcReal forcing_phi;
 static __device__ __forceinline__ AcReal3
 forcing(const int i, const int j, const int k)
 {
-    #define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
-    #define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
-    #define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
-    const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
-                                    (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
-                                    (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
+#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
+#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
+#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
+    const AcReal3 k_vec = (AcReal3){
+        (i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
+        (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
+        (k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
     AcReal inv_len = reciprocal_len(k_vec);
     if (isnan(inv_len) || isinf(inv_len))
         inv_len = 0;
@@ -571,46 +564,41 @@ forcing(const int i, const int j, const int k)
         inv_len = 2;
     const AcReal k_dot_x = dot(k_vec, forcing_vec);
 
-    const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
+    const AcReal waves = cos(k_dot_x) * cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
 
     return inv_len * inv_len * waves * forcing_vec;
 }
 
-
-// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values in the mesh, then we will inherently lose precision
+// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values
+// in the mesh, then we will inherently lose precision
 #define LNT0 (AcReal(0.0))
 #define LNRHO0 (AcReal(0.0))
 
 #define H_CONST (AcReal(0.0))
 #define C_CONST (AcReal(0.0))
 
-
-
 template <int step_number>
 static __device__ __forceinline__ AcReal
-rk3_integrate(const AcReal state_previous, const AcReal state_current,
-              const AcReal rate_of_change, const AcReal dt)
+rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
+              const AcReal dt)
 {
     // Williamson (1980)
     const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
-    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.),
-                           AcReal(8. / 15.)};
-
+    const AcReal beta[]  = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
 
     // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
     // access (when accessing beta[step_number-1] even when step_number >= 1)
     switch (step_number) {
-        case 0:
-            return state_current + beta[step_number + 1] * rate_of_change * dt;
-        case 1: // Fallthrough
-        case 2:
-            return state_current +
-               beta[step_number + 1] *
-                   (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
-                        (state_current - state_previous) +
-                    rate_of_change * dt);
-        default:
-            return NAN;
+    case 0:
+        return state_current + beta[step_number + 1] * rate_of_change * dt;
+    case 1: // Fallthrough
+    case 2:
+        return state_current +
+               beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
+                                            (state_current - state_previous) +
+                                        rate_of_change * dt);
+    default:
+        return NAN;
     }
 }
 /*
@@ -646,13 +634,14 @@ static __device__ __forceinline__ AcReal3
 rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
               const AcReal3 rate_of_change, const AcReal dt)
 {
-    return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
-                                       rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
-                                       rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
+    return (AcReal3){
+        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
+        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
+        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
 }
 
-#define rk3(state_previous, state_current, rate_of_change, dt)\
-rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
+#define rk3(state_previous, state_current, rate_of_change, dt)                                     \
+    rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
 
 /*
 template <int step_number>
@@ -708,9 +697,8 @@ read_out(const int idx, AcReal* __restrict__ field[], const int handle)
 static __device__ AcReal3
 read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
 {
-    return (AcReal3) { read_out(idx, field, handle.x),
-                                       read_out(idx, field, handle.y),
-                                       read_out(idx, field, handle.z) };
+    return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
+                     read_out(idx, field, handle.z)};
 }
 
 #define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
@@ -718,29 +706,28 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
 #define READ_OUT(handle) (read_out(idx, buffer.out, handle))
 
 // also write for clarity here also, not for the DSL
-//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function
+//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the
+// function
 
-#define GEN_KERNEL_PARAM_BOILERPLATE \
-        const int3 start, const int3 end, VertexBufferArray buffer
+#define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
 
-#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
-        const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\
-                                                            threadIdx.y + blockIdx.y * blockDim.y + start.y,\
-                                                            threadIdx.z + blockIdx.z * blockDim.z + start.z};\
-        const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
-                                            d_multigpu_offset.y + vertexIdx.y, \
-                                            d_multigpu_offset.z + vertexIdx.z}; \
-        if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\
-            return;\
-\
-\
-        assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\
-               vertexIdx.z < DCONST_INT(AC_nz_max));\
-\
-        assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\
-               vertexIdx.z >= DCONST_INT(AC_nz_min));\
-\
-        const int idx          = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
+#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE()                                                 \
+    const int3 vertexIdx       = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,           \
+                                  threadIdx.y + blockIdx.y * blockDim.y + start.y,           \
+                                  threadIdx.z + blockIdx.z * blockDim.z + start.z};          \
+    const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x,                         \
+                                        d_multigpu_offset.y + vertexIdx.y,                         \
+                                        d_multigpu_offset.z + vertexIdx.z};                        \
+    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)                      \
+        return;                                                                                    \
+                                                                                                   \
+    assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&           \
+           vertexIdx.z < DCONST_INT(AC_nz_max));                                                   \
+                                                                                                   \
+    assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&         \
+           vertexIdx.z >= DCONST_INT(AC_nz_min));                                                  \
+                                                                                                   \
+    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
 
 #include "stencil_process.cuh"
 
@@ -757,33 +744,31 @@ randf(void)
 }
 
 AcResult
-rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
-               const AcReal dt, VertexBufferArray* buffer)
+rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start,
+               const int3& end, const AcReal dt, VertexBufferArray* buffer)
 {
     const dim3 tpb(32, 1, 4);
-    /////////////////// Forcing
-    #if LFORCING
+/////////////////// Forcing
+#if LFORCING
     const AcReal ff_scale = AcReal(.2);
-    static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
-    const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8;
-    const AcMatrix rotz = create_rotz(radians);
-    ff = mul(rotz, ff);
+    static AcReal3 ff     = ff_scale * (AcReal3){1, 0, 0};
+    const AcReal radians  = randf() * AcReal(2 * M_PI) / 360 / 8;
+    const AcMatrix rotz   = create_rotz(radians);
+    ff                    = mul(rotz, ff);
     cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
 
-    const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf();
-    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream);
-    #endif // LFORCING
+    const AcReal ff_phi = AcReal(M_PI); // AcReal(2 * M_PI) * randf();
+    cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice,
+                            stream);
+#endif // LFORCING
     //////////////////////////
 
     const int nx = end.x - start.x;
     const int ny = end.y - start.y;
     const int nz = end.z - start.z;
 
-    const dim3 bpg(
-        (unsigned int)ceil(nx / AcReal(tpb.x)),
-        (unsigned int)ceil(ny / AcReal(tpb.y)),
-        (unsigned int)ceil(nz / AcReal(tpb.z)));
-
+    const dim3 bpg((unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(ny / AcReal(tpb.y)),
+                   (unsigned int)ceil(nz / AcReal(tpb.z)));
 
     if (step_number == 0)
         solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
@@ -796,7 +781,6 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
     return AC_SUCCESS;
 }
 
-
 ////////////////REDUCE///////////////////////////
 #include "src/core/math_utils.h" // is_power_of_two
 
@@ -848,22 +832,19 @@ template <FilterFunc filter>
 __global__ void
 kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
 {
-    const int3 src_idx = (int3) {
-        start.x + threadIdx.x + blockIdx.x * blockDim.x,
-        start.y + threadIdx.y + blockIdx.y * blockDim.y,
-        start.z + threadIdx.z + blockIdx.z * blockDim.z
-    };
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
 
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z; //MV: Added this because it was undefined
-    const int3 dst_idx = (int3) {
-        threadIdx.x + blockIdx.x * blockDim.x,
-        threadIdx.y + blockIdx.y * blockDim.y,
-        threadIdx.z + blockIdx.z * blockDim.z
-    };
+    const int nx       = end.x - start.x;
+    const int ny       = end.y - start.y;
+    const int nz       = end.z - start.z; // MV: Added this because it was undefined
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
 
-    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max));
+    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
+           src_idx.z < DCONST_INT(AC_nz_max));
     assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
     assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
 
@@ -872,31 +853,27 @@ kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end,
 
 template <FilterFuncVec filter>
 __global__ void
-kernel_filter_vec(const __restrict__ AcReal* src0,
-                  const __restrict__ AcReal* src1,
-                  const __restrict__ AcReal* src2,
-                  const int3 start, const int3 end, AcReal* dst)
+kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
+                  const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
 {
-    const int3 src_idx = (int3) {
-        start.x + threadIdx.x + blockIdx.x * blockDim.x,
-        start.y + threadIdx.y + blockIdx.y * blockDim.y,
-        start.z + threadIdx.z + blockIdx.z * blockDim.z
-    };
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
 
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z; //MV: Added this because it was undefined
-    const int3 dst_idx = (int3) {
-        threadIdx.x + blockIdx.x * blockDim.x,
-        threadIdx.y + blockIdx.y * blockDim.y,
-        threadIdx.z + blockIdx.z * blockDim.z
-    };
+    const int nx       = end.x - start.x;
+    const int ny       = end.y - start.y;
+    const int nz       = end.z - start.z; // MV: Added this because it was undefined
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
 
-    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max));
+    assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
+           src_idx.z < DCONST_INT(AC_nz_max));
     assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
     assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
 
-    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
+        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
 }
 
 template <ReduceFunc reduce>
@@ -908,7 +885,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
     extern __shared__ AcReal smem[];
     if (idx < num_elems) {
         smem[threadIdx.x] = scratchpad[idx];
-    } else {
+    }
+    else {
         smem[threadIdx.x] = NAN;
     }
     __syncthreads();
@@ -930,9 +908,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
 
 template <ReduceFunc reduce>
 __global__ void
-kernel_reduce_block(const __restrict__ AcReal* scratchpad,
-                    const int num_blocks, const int block_size,
-                    AcReal* result)
+kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
+                    const int block_size, AcReal* result)
 {
     const int idx = threadIdx.x + blockIdx.x * blockDim.x;
     if (idx != 0) {
@@ -946,23 +923,19 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad,
     *result = res;
 }
 
-
 AcReal
-reduce_scal(const cudaStream_t stream, const ReductionType rtype,
-            const int3& start, const int3& end,
-            const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
+reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
 {
-    const unsigned nx = end.x - start.x;
-    const unsigned ny = end.y - start.y;
-    const unsigned nz = end.z - start.z;
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
     const unsigned num_elems = nx * ny * nz;
 
     const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter(
-        (unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-        (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-        (unsigned int)ceil(nz / AcReal(tpb_filter.z))
-    );
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
 
     const int tpb_reduce = 128;
     const int bpg_reduce = num_elems / tpb_reduce;
@@ -974,22 +947,38 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
     ERRCHK(nx * ny * nz % 2 == 0);
 
     if (rtype == RTYPE_MAX) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
+        kernel_filter<dvalue>
+            <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dmax>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_MIN) {
+        kernel_filter<dvalue>
+            <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dmin>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_RMS) {
+        kernel_filter<dsquared>
+            <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dsum>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter<dexp_squared>
+            <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dsum>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else {
         ERROR("Unrecognized rtype");
     }
     AcReal result;
@@ -998,22 +987,19 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
 }
 
 AcReal
-reduce_vec(const cudaStream_t stream, const ReductionType rtype,
-            const int3& start, const int3& end,
-            const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2,
-            AcReal* scratchpad, AcReal* reduce_result)
+reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
+           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
+           AcReal* reduce_result)
 {
-    const unsigned nx = end.x - start.x;
-    const unsigned ny = end.y - start.y;
-    const unsigned nz = end.z - start.z;
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
     const unsigned num_elems = nx * ny * nz;
 
     const dim3 tpb_filter(32, 4, 1);
-    const dim3 bpg_filter(
-        (unsigned int)ceil(nx / AcReal(tpb_filter.x)),
-        (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
-        (unsigned int)ceil(nz / AcReal(tpb_filter.z))
-    );
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
 
     const int tpb_reduce = 128;
     const int bpg_reduce = num_elems / tpb_reduce;
@@ -1025,22 +1011,38 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype,
     ERRCHK(nx * ny * nz % 2 == 0);
 
     if (rtype == RTYPE_MAX) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_MIN) {
-        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS) {
-        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else if (rtype == RTYPE_RMS_EXP) {
-        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
-        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
-        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
-    } else {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
+            vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dmax>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_MIN) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
+            vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dmin>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_RMS) {
+        kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
+            vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dsum>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else if (rtype == RTYPE_RMS_EXP) {
+        kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
+            vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
+            scratchpad, num_elems);
+        kernel_reduce_block<dsum>
+            <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    }
+    else {
         ERROR("Unrecognized rtype");
     }
     AcReal result;
diff --git a/src/standalone/autotest.cc b/src/standalone/autotest.cc
index 7c44be3..107b950 100644
--- a/src/standalone/autotest.cc
+++ b/src/standalone/autotest.cc
@@ -53,56 +53,58 @@
 #define GEN_TEST_RESULT (1) // Generate a test file always during testing
 
 typedef struct {
-	int x, y, z;
+    int x, y, z;
 } vec3i;
 
 typedef struct {
-	AcReal x, y, z;
+    AcReal x, y, z;
 } vec3r;
 
-
 typedef struct {
-	ModelScalar model;
-	AcReal candidate;
-	ModelScalar error;
+    ModelScalar model;
+    AcReal candidate;
+    ModelScalar error;
 } ErrorInfo;
 
 #define QUICK_TEST (0)
 #define THOROUGH_TEST (1)
 #define TEST_TYPE QUICK_TEST
 
-static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE, INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW};
+static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE,
+                                      INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW};
 // #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 
-#if TEST_TYPE == QUICK_TEST // REGULAR TEST START HERE --------------------------------------------------------------------------------------------------------------
-	static inline ModelScalar
+#if TEST_TYPE ==                                                                                   \
+    QUICK_TEST // REGULAR TEST START HERE
+               // --------------------------------------------------------------------------------------------------------------
+static inline ModelScalar
 get_absolute_error(const ModelScalar& model, const AcReal& candidate)
 {
-	return fabsl(candidate - model);
+    return fabsl(candidate - model);
 }
 
-	static inline ModelScalar
+static inline ModelScalar
 get_acceptable_absolute_error(const ModelScalar& range)
 {
-	// This is the upper limit, which assumes that both the min and max values
-	// are used in a calculation (which inherently leads to cancellation).
-	//
-	// AFAIK if this breaks, there is definitely something wrong with the code.
-	// Otherwise the error is so small it's indistiguishable from inherent
-	// inaccuracies in floating-point arithmetic.
-	return range * AC_REAL_EPSILON;
+    // This is the upper limit, which assumes that both the min and max values
+    // are used in a calculation (which inherently leads to cancellation).
+    //
+    // AFAIK if this breaks, there is definitely something wrong with the code.
+    // Otherwise the error is so small it's indistiguishable from inherent
+    // inaccuracies in floating-point arithmetic.
+    return range * AC_REAL_EPSILON;
 }
 
-	static inline ModelScalar
+static inline ModelScalar
 get_acceptable_relative_error(void)
 {
-	return 30; // machine epsilons
+    return 30; // machine epsilons
 }
 
-	static inline ModelScalar
+static inline ModelScalar
 get_relative_error(const ModelScalar& model, const AcReal& candidate)
 {
-	ModelScalar error = NAN;
+    ModelScalar error = NAN;
 
 #if 0
 	const ModelScalar abs_epsilon = get_acceptable_absolute_error(range);
@@ -121,190 +123,182 @@ get_relative_error(const ModelScalar& model, const AcReal& candidate)
 		error = fabsl(1.0l - candidate / model);
 	}
 #endif
-	error = fabsl(1.0l - candidate / model);
+    error = fabsl(1.0l - candidate / model);
 
-	// Return the relative error as multiples of the machine epsilon
-	// See Sect. Relative Error and Ulps in
-	// What Every Computer Scientist Should Know About Floating-Point Arithmetic
-	// By David Goldberg (1991)
-	return error / AC_REAL_EPSILON;
+    // Return the relative error as multiples of the machine epsilon
+    // See Sect. Relative Error and Ulps in
+    // What Every Computer Scientist Should Know About Floating-Point Arithmetic
+    // By David Goldberg (1991)
+    return error / AC_REAL_EPSILON;
 }
 
-	static bool
+static bool
 verify(const ModelScalar& model, const AcReal& cand, const ModelScalar& range)
 {
-	if (!is_valid(model) || !is_valid(cand))
-		return false;
+    if (!is_valid(model) || !is_valid(cand))
+        return false;
 
-	const ModelScalar relative_error = get_relative_error(model, cand);
-	if (relative_error < get_acceptable_relative_error())
-		return true;
+    const ModelScalar relative_error = get_relative_error(model, cand);
+    if (relative_error < get_acceptable_relative_error())
+        return true;
 
-	const ModelScalar absolute_error = get_absolute_error(model, cand);
-	if (absolute_error < get_acceptable_absolute_error(range))
-		return true;
+    const ModelScalar absolute_error = get_absolute_error(model, cand);
+    if (absolute_error < get_acceptable_absolute_error(range))
+        return true;
 
-	return false;
+    return false;
 }
 
-	static ModelScalar
+static ModelScalar
 get_reduction_range(const ModelMesh& mesh)
 {
-	ERRCHK(NUM_VTXBUF_HANDLES >= 3);
+    ERRCHK(NUM_VTXBUF_HANDLES >= 3);
 
-	const ModelScalar max0     = model_reduce_scal(mesh, RTYPE_MAX,
-			VertexBufferHandle(0));
-	const ModelScalar max1     = model_reduce_scal(mesh, RTYPE_MAX,
-			VertexBufferHandle(1));
-	const ModelScalar max2     = model_reduce_scal(mesh, RTYPE_MAX,
-			VertexBufferHandle(2));
-	const ModelScalar max_scal = max(max0, max(max1, max2));
+    const ModelScalar max0     = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(0));
+    const ModelScalar max1     = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(1));
+    const ModelScalar max2     = model_reduce_scal(mesh, RTYPE_MAX, VertexBufferHandle(2));
+    const ModelScalar max_scal = max(max0, max(max1, max2));
 
-	const ModelScalar min0     = model_reduce_scal(mesh, RTYPE_MIN,
-			VertexBufferHandle(0));
-	const ModelScalar min1     = model_reduce_scal(mesh, RTYPE_MIN,
-			VertexBufferHandle(1));
-	const ModelScalar min2     = model_reduce_scal(mesh, RTYPE_MIN,
-			VertexBufferHandle(2));
-	const ModelScalar min_scal = min(min0, min(min1, min2));
+    const ModelScalar min0     = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(0));
+    const ModelScalar min1     = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(1));
+    const ModelScalar min2     = model_reduce_scal(mesh, RTYPE_MIN, VertexBufferHandle(2));
+    const ModelScalar min_scal = min(min0, min(min1, min2));
 
-	return max_scal - min_scal;
-}
-
-	static void
-print_debug_info(const ModelScalar& model, const AcReal& candidate,
-		const ModelScalar& range)
-{
-	printf("MeshPointInfo\n");
-	printf("\tModel: %e\n", double(model));
-	printf("\tCandidate: %e\n", double(candidate));
-	printf("\tRange: %e\n", double(range));
-
-	printf("\tAbsolute error: %Le (max acceptable: %Le)\n",
-			get_absolute_error(model, candidate),
-			get_acceptable_absolute_error(range));
-	printf("\tRelative error: %Le (max acceptable: %Le)\n",
-			get_relative_error(model, candidate),
-			get_acceptable_relative_error());
-	printf("\tIs acceptable: %d\n", verify(model, candidate, range));
+    return max_scal - min_scal;
 }
 
 static void
-print_result(const ModelScalar& model, const AcReal& candidate,
-		const ModelScalar& range, const char* name = "???")
+print_debug_info(const ModelScalar& model, const AcReal& candidate, const ModelScalar& range)
 {
-	const ModelScalar rel_err = get_relative_error(model, candidate);
-	const ModelScalar abs_err = get_absolute_error(model, candidate);
-	if (!verify(model, candidate, range)) {
-		printf("\t%-12s... ", name);
-		printf(RED "FAIL! " RESET);
-	}
-	else {
-		printf("\t%-12s... ", name);
-		printf(GRN "OK! " RESET);
-	}
+    printf("MeshPointInfo\n");
+    printf("\tModel: %e\n", double(model));
+    printf("\tCandidate: %e\n", double(candidate));
+    printf("\tRange: %e\n", double(range));
 
-	printf("(relative error: %.3Lg \u03B5, absolute error: %Lg)\n", rel_err, abs_err);
-	/*
-	// DEPRECATED: TODO remove
-	if (rel_err < get_acceptable_relative_error())
-	printf("(relative error: %Lg \u03B5, max accepted %Lg)\n", rel_err,
-	get_acceptable_relative_error());
-	else
-	printf("(absolute error: %Lg, max accepted %Lg)\n", abs_err,
-	get_acceptable_absolute_error(range));
-	 */
+    printf("\tAbsolute error: %Le (max acceptable: %Le)\n", get_absolute_error(model, candidate),
+           get_acceptable_absolute_error(range));
+    printf("\tRelative error: %Le (max acceptable: %Le)\n", get_relative_error(model, candidate),
+           get_acceptable_relative_error());
+    printf("\tIs acceptable: %d\n", verify(model, candidate, range));
 }
 
-	static int
+static void
+print_result(const ModelScalar& model, const AcReal& candidate, const ModelScalar& range,
+             const char* name = "???")
+{
+    const ModelScalar rel_err = get_relative_error(model, candidate);
+    const ModelScalar abs_err = get_absolute_error(model, candidate);
+    if (!verify(model, candidate, range)) {
+        printf("\t%-12s... ", name);
+        printf(RED "FAIL! " RESET);
+    }
+    else {
+        printf("\t%-12s... ", name);
+        printf(GRN "OK! " RESET);
+    }
+
+    printf("(relative error: %.3Lg \u03B5, absolute error: %Lg)\n", rel_err, abs_err);
+    /*
+    // DEPRECATED: TODO remove
+    if (rel_err < get_acceptable_relative_error())
+    printf("(relative error: %Lg \u03B5, max accepted %Lg)\n", rel_err,
+    get_acceptable_relative_error());
+    else
+    printf("(absolute error: %Lg, max accepted %Lg)\n", abs_err,
+    get_acceptable_absolute_error(range));
+     */
+}
+
+static int
 check_reductions(const AcMeshInfo& config)
 {
-	printf("Testing reductions\n");
-	int num_failures = 0;
+    printf("Testing reductions\n");
+    int num_failures = 0;
 
-	// Init CPU meshes
-	AcMesh* mesh = acmesh_create(config);
-	ModelMesh* modelmesh = modelmesh_create(config);
+    // Init CPU meshes
+    AcMesh* mesh         = acmesh_create(config);
+    ModelMesh* modelmesh = modelmesh_create(config);
 
-	// Init GPU meshes
-	acInit(config);
+    // Init GPU meshes
+    acInit(config);
 
-	for (unsigned int i = 0; i < ARRAY_SIZE(test_cases); ++i) {
+    for (unsigned int i = 0; i < ARRAY_SIZE(test_cases); ++i) {
         const InitType itype = test_cases[i];
         printf("Checking %s...\n", init_type_names[InitType(itype)]);
 
-		// Init the mesh and figure out the acceptable range for error
-		acmesh_init_to(InitType(itype), mesh);
+        // Init the mesh and figure out the acceptable range for error
+        acmesh_init_to(InitType(itype), mesh);
 
-		acmesh_to_modelmesh(*mesh, modelmesh);
-		const ModelScalar range = get_reduction_range(*modelmesh);
+        acmesh_to_modelmesh(*mesh, modelmesh);
+        const ModelScalar range = get_reduction_range(*modelmesh);
 
-		acLoad(*mesh);
+        acLoad(*mesh);
 
-		for (int rtype = 0; rtype < NUM_REDUCTION_TYPES; ++rtype) {
-			const VertexBufferHandle ftype = VTXBUF_UUX;
+        for (int rtype = 0; rtype < NUM_REDUCTION_TYPES; ++rtype) {
+            const VertexBufferHandle ftype = VTXBUF_UUX;
 
-			// Scal
-			ModelScalar model = model_reduce_scal(*modelmesh, ReductionType(rtype),
-					VertexBufferHandle(ftype));
-			AcReal candidate  = acReduceScal(ReductionType(rtype),
-					VertexBufferHandle(ftype));
-			print_result(model, candidate, range, "UUX scal");
+            // Scal
+            ModelScalar model = model_reduce_scal(*modelmesh, ReductionType(rtype),
+                                                  VertexBufferHandle(ftype));
+            AcReal candidate  = acReduceScal(ReductionType(rtype), VertexBufferHandle(ftype));
+            print_result(model, candidate, range, "UUX scal");
 
-			bool is_acceptable = verify(model, candidate, range);
-			if (!is_acceptable) {
-				++num_failures;
+            bool is_acceptable = verify(model, candidate, range);
+            if (!is_acceptable) {
+                ++num_failures;
 
-				// Print debug info
-				printf("Scalar reduction type %d FAIL\n", rtype);
-				print_debug_info(model, candidate, range);
-			}
+                // Print debug info
+                printf("Scalar reduction type %d FAIL\n", rtype);
+                print_debug_info(model, candidate, range);
+            }
 
-			// Vec
-			model = model_reduce_vec(*modelmesh, ReductionType(rtype), VTXBUF_UUX,
-					VTXBUF_UUY, VTXBUF_UUZ);
-			candidate = acReduceVec(ReductionType(rtype), VTXBUF_UUX,
-					VTXBUF_UUY, VTXBUF_UUZ);
-			print_result(model, candidate, range, "UUXYZ vec");
+            // Vec
+            model     = model_reduce_vec(*modelmesh, ReductionType(rtype), VTXBUF_UUX, VTXBUF_UUY,
+                                     VTXBUF_UUZ);
+            candidate = acReduceVec(ReductionType(rtype), VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+            print_result(model, candidate, range, "UUXYZ vec");
 
-			is_acceptable = verify(model, candidate, range);
-			if (!is_acceptable) {
-				++num_failures;
+            is_acceptable = verify(model, candidate, range);
+            if (!is_acceptable) {
+                ++num_failures;
 
-				// Print debug info
-				printf("Vector reduction type %d FAIL\n", rtype);
-				print_debug_info(model, candidate, range);
-			}
-		}
+                // Print debug info
+                printf("Vector reduction type %d FAIL\n", rtype);
+                print_debug_info(model, candidate, range);
+            }
+        }
 
-		printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range));
-	}
-	acQuit();
-	modelmesh_destroy(modelmesh);
-	acmesh_destroy(mesh);
+        printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n",
+               get_acceptable_relative_error(), get_acceptable_absolute_error(range));
+    }
+    acQuit();
+    modelmesh_destroy(modelmesh);
+    acmesh_destroy(mesh);
 
-	return num_failures;
+    return num_failures;
 }
 
 /** Finds the maximum and minimum in all meshes and computes the range.
  * Note! Potentially dangerous if all meshes do not interact with each other.
  * Otherwise the range may be too high.
  */
-	static ModelScalar
+static ModelScalar
 get_data_range(const ModelMesh& model)
 {
-	ModelScalar vertex_buffer_max_all = -INFINITY;
-	ModelScalar vertex_buffer_min_all = INFINITY;
-	for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
-		const ModelScalar vertex_buffer_max = model_reduce_scal(model, RTYPE_MAX, VertexBufferHandle(w));
-		const ModelScalar vertex_buffer_min = model_reduce_scal(model, RTYPE_MIN, VertexBufferHandle(w));
+    ModelScalar vertex_buffer_max_all = -INFINITY;
+    ModelScalar vertex_buffer_min_all = INFINITY;
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        const ModelScalar vertex_buffer_max = model_reduce_scal(model, RTYPE_MAX,
+                                                                VertexBufferHandle(w));
+        const ModelScalar vertex_buffer_min = model_reduce_scal(model, RTYPE_MIN,
+                                                                VertexBufferHandle(w));
 
-		if (vertex_buffer_max > vertex_buffer_max_all)
-			vertex_buffer_max_all = vertex_buffer_max;
-		if (vertex_buffer_min < vertex_buffer_min_all)
-			vertex_buffer_min_all = vertex_buffer_min;
-	}
-	return fabsl(vertex_buffer_max_all - vertex_buffer_min_all);
+        if (vertex_buffer_max > vertex_buffer_max_all)
+            vertex_buffer_max_all = vertex_buffer_max;
+        if (vertex_buffer_min < vertex_buffer_min_all)
+            vertex_buffer_min_all = vertex_buffer_min;
+    }
+    return fabsl(vertex_buffer_max_all - vertex_buffer_min_all);
 }
 
 // #define GEN_TEST_RESULT
@@ -312,380 +306,398 @@ get_data_range(const ModelMesh& model)
 static FILE* test_result = NULL;
 #endif
 
-	static bool
+static bool
 verify_meshes(const ModelMesh& model, const AcMesh& candidate)
 {
-	bool retval = true;
+    bool retval = true;
 
 #if GEN_TEST_RESULT == 1
-	ErrorInfo err = ErrorInfo();
+    ErrorInfo err = ErrorInfo();
 #endif
 
-	const ModelScalar range = get_data_range(model);
-	for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
-		const size_t n = AC_VTXBUF_SIZE(model.info);
+    const ModelScalar range = get_data_range(model);
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
+        const size_t n = AC_VTXBUF_SIZE(model.info);
 
-		// Maximum errors
-		ErrorInfo max_abs_error = ErrorInfo();
-		ErrorInfo max_rel_error = ErrorInfo();
+        // Maximum errors
+        ErrorInfo max_abs_error = ErrorInfo();
+        ErrorInfo max_rel_error = ErrorInfo();
 
-		for (size_t i = 0; i < n; ++i) {
-			const ModelScalar model_val = model.vertex_buffer[VertexBufferHandle(w)][i];
-			const AcReal cand_val = candidate.vertex_buffer[VertexBufferHandle(w)][i];
+        for (size_t i = 0; i < n; ++i) {
+            const ModelScalar model_val = model.vertex_buffer[VertexBufferHandle(w)][i];
+            const AcReal cand_val       = candidate.vertex_buffer[VertexBufferHandle(w)][i];
 
-			if (!verify(model_val, cand_val, range)) {
-				const int i0 = i % model.info.int_params[AC_mx];
-				const int j0 = ((i % (model.info.int_params[AC_mx] *
-								model.info.int_params[AC_my])) /
-						model.info.int_params[AC_mx]);
-				const int k0 = i / (model.info.int_params[AC_mx] *
-						model.info.int_params[AC_my]);
-				printf("Index (%d, %d, %d)\n", i0, j0, k0);
-				print_debug_info(model_val, cand_val, range);
-				retval = false;
-			}
+            if (!verify(model_val, cand_val, range)) {
+                const int i0 = i % model.info.int_params[AC_mx];
+                const int j0 = ((i %
+                                 (model.info.int_params[AC_mx] * model.info.int_params[AC_my])) /
+                                model.info.int_params[AC_mx]);
+                const int k0 = i / (model.info.int_params[AC_mx] * model.info.int_params[AC_my]);
+                printf("Index (%d, %d, %d)\n", i0, j0, k0);
+                print_debug_info(model_val, cand_val, range);
+                retval = false;
+            }
 
-			const ModelScalar abs_error = get_absolute_error(model_val,
-					cand_val);
-			if (abs_error > max_abs_error.error) {
-				max_abs_error.error     = abs_error;
-				max_abs_error.model     = model_val;
-				max_abs_error.candidate = cand_val;
-			}
+            const ModelScalar abs_error = get_absolute_error(model_val, cand_val);
+            if (abs_error > max_abs_error.error) {
+                max_abs_error.error     = abs_error;
+                max_abs_error.model     = model_val;
+                max_abs_error.candidate = cand_val;
+            }
 
-			const ModelScalar rel_error = get_relative_error(model_val, cand_val);
-			if (rel_error > max_rel_error.error) {
-				max_rel_error.error     = rel_error;
-				max_rel_error.model     = model_val;
-				max_rel_error.candidate = cand_val;
-			}
+            const ModelScalar rel_error = get_relative_error(model_val, cand_val);
+            if (rel_error > max_rel_error.error) {
+                max_rel_error.error     = rel_error;
+                max_rel_error.model     = model_val;
+                max_rel_error.candidate = cand_val;
+            }
 
 #if GEN_TEST_RESULT == 1
-			if (abs_error > err.error) {
-				err.error = abs_error;
-				err.model = model_val;
-				err.candidate = cand_val;
-			}
+            if (abs_error > err.error) {
+                err.error     = abs_error;
+                err.model     = model_val;
+                err.candidate = cand_val;
+            }
 #endif
-		}
-		//print_result(max_rel_error.model, max_rel_error.candidate, range, vtxbuf_names[VertexBufferHandle(w)]);
-		print_result(max_abs_error.model, max_abs_error.candidate, range, vtxbuf_names[VertexBufferHandle(w)]);
-	}
+        }
+        // print_result(max_rel_error.model, max_rel_error.candidate, range,
+        // vtxbuf_names[VertexBufferHandle(w)]);
+        print_result(max_abs_error.model, max_abs_error.candidate, range,
+                     vtxbuf_names[VertexBufferHandle(w)]);
+    }
 
 #if GEN_TEST_RESULT == 1
-	const ModelScalar rel_err = get_relative_error(err.model, err.candidate);
-	const ModelScalar abs_err = get_absolute_error(err.model, err.candidate);
-	fprintf(test_result, "%.3Lg & %.3Lg\n", abs_err, rel_err);
+    const ModelScalar rel_err = get_relative_error(err.model, err.candidate);
+    const ModelScalar abs_err = get_absolute_error(err.model, err.candidate);
+    fprintf(test_result, "%.3Lg & %.3Lg\n", abs_err, rel_err);
 #endif
 
-	printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n", get_acceptable_relative_error(), get_acceptable_absolute_error(range));
+    printf("Acceptable relative error: < %Lg \u03B5, absolute error < %Lg\n",
+           get_acceptable_relative_error(), get_acceptable_absolute_error(range));
 
-	return retval;
+    return retval;
 }
 
-	int
+int
 check_rk3(const AcMeshInfo& mesh_info)
 {
-	const int num_iterations = 1; // Note: should work up to at least 15 steps
-	printf("Testing RK3 (running %d steps before checking the result)\n",
-			num_iterations);
-	int num_failures = 0;
+    const int num_iterations = 1; // Note: should work up to at least 15 steps
+    printf("Testing RK3 (running %d steps before checking the result)\n", num_iterations);
+    int num_failures = 0;
 
-	// Init CPU meshes
-	AcMesh* gpu_mesh   = acmesh_create(mesh_info);
-	ModelMesh* model_mesh = modelmesh_create(mesh_info);
+    // Init CPU meshes
+    AcMesh* gpu_mesh      = acmesh_create(mesh_info);
+    ModelMesh* model_mesh = modelmesh_create(mesh_info);
 
-	// Init GPU meshes
-	acInit(mesh_info);
+    // Init GPU meshes
+    acInit(mesh_info);
 
-	for (unsigned int i = 0; i < ARRAY_SIZE(test_cases); ++i) {
+    for (unsigned int i = 0; i < ARRAY_SIZE(test_cases); ++i) {
         const InitType itype = test_cases[i];
-		printf("Checking %s...\n", init_type_names[InitType(itype)]);
+        printf("Checking %s...\n", init_type_names[InitType(itype)]);
 
-		// Init the mesh and figure out the acceptable range for error
-		acmesh_init_to(InitType(itype), gpu_mesh);
+        // Init the mesh and figure out the acceptable range for error
+        acmesh_init_to(InitType(itype), gpu_mesh);
 
-		acLoad(*gpu_mesh);
-		acmesh_to_modelmesh(*gpu_mesh, model_mesh);
+        acLoad(*gpu_mesh);
+        acmesh_to_modelmesh(*gpu_mesh, model_mesh);
 
-		acBoundcondStep();
-		boundconds(model_mesh->info, model_mesh);
+        acBoundcondStep();
+        boundconds(model_mesh->info, model_mesh);
 
-		for (int i = 0; i < num_iterations; ++i) {
-			//const AcReal umax = AcReal(acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
-            //const AcReal dt   = host_timestep(umax, mesh_info);
-			const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities
+        for (int i = 0; i < num_iterations; ++i) {
+            // const AcReal umax = AcReal(acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
+            // VTXBUF_UUZ));
+            // const AcReal dt   = host_timestep(umax, mesh_info);
+            const AcReal dt = AcReal(1e-2); // Use a small constant timestep to avoid instabilities
 
-			acIntegrate(dt);
-			acBoundcondStep();
-			acSynchronize();
+            acIntegrate(dt);
+            acBoundcondStep();
+            acSynchronize();
 
-			model_rk3(dt, model_mesh);
-			boundconds(model_mesh->info, model_mesh);
-		}
-		acStore(gpu_mesh);
+            model_rk3(dt, model_mesh);
+            boundconds(model_mesh->info, model_mesh);
+        }
+        acStore(gpu_mesh);
 
-		bool is_acceptable = verify_meshes(*model_mesh, *gpu_mesh);
-		if (!is_acceptable) {
-			++num_failures;
-		}
-	}
+        bool is_acceptable = verify_meshes(*model_mesh, *gpu_mesh);
+        if (!is_acceptable) {
+            ++num_failures;
+        }
+    }
 
-	acQuit();
-	acmesh_destroy(gpu_mesh);
-	modelmesh_destroy(model_mesh);
+    acQuit();
+    acmesh_destroy(gpu_mesh);
+    modelmesh_destroy(model_mesh);
 
-	return num_failures;
+    return num_failures;
 }
 
-	int
+int
 run_autotest(void)
 {
 #if GEN_TEST_RESULT == 1
-	char testresult_path[256];
-	sprintf(testresult_path, "%s_fullstep_testresult.out", AC_DOUBLE_PRECISION ? "double" : "float");
+    char testresult_path[256];
+    sprintf(testresult_path, "%s_fullstep_testresult.out",
+            AC_DOUBLE_PRECISION ? "double" : "float");
 
-	test_result = fopen(testresult_path, "w");
-	ERRCHK(test_result);
+    test_result = fopen(testresult_path, "w");
+    ERRCHK(test_result);
 
-	fprintf(test_result, "n, max abs error, corresponding rel error\n");
+    fprintf(test_result, "n, max abs error, corresponding rel error\n");
 #endif
 
-	/* Parse configs */
-	AcMeshInfo config;
-	load_config(&config);
+    /* Parse configs */
+    AcMeshInfo config;
+    load_config(&config);
 
-	if (STENCIL_ORDER > 6)
-		printf("WARNING!!! If the stencil order is larger than the computational domain some vertices may be done twice (f.ex. doing inner and outer domains separately and some of the front/back/left/right/etc slabs collide). The mesh must be large enough s.t. this doesn't happen.");
+    if (STENCIL_ORDER > 6)
+        printf("WARNING!!! If the stencil order is larger than the computational domain some "
+               "vertices may be done twice (f.ex. doing inner and outer domains separately and "
+               "some of the front/back/left/right/etc slabs collide). The mesh must be large "
+               "enough s.t. this doesn't happen.");
 
-    const vec3i test_dims[] = {
-        {32, 32, 32},
-        {64, 32, 32},
-        {32, 64, 32},
-        {32, 32, 64},
-        {64, 64, 32},
-        {64, 32, 64},
-        {32, 64, 64}
-    };
+    const vec3i test_dims[] = {{32, 32, 32}, {64, 32, 32}, {32, 64, 32}, {32, 32, 64},
+                               {64, 64, 32}, {64, 32, 64}, {32, 64, 64}};
 
-	int num_failures = 0;
-	for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
-		config.int_params[AC_nx] = test_dims[i].x;
-		config.int_params[AC_ny] = test_dims[i].y;
-		config.int_params[AC_nz] = test_dims[i].z;
-		update_config(&config);
+    int num_failures = 0;
+    for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
+        config.int_params[AC_nx] = test_dims[i].x;
+        config.int_params[AC_ny] = test_dims[i].y;
+        config.int_params[AC_nz] = test_dims[i].z;
+        update_config(&config);
 
-		printf("Testing mesh (%d, %d, %d):\n", //
-				test_dims[i].x, test_dims[i].y, test_dims[i].z);
+        printf("Testing mesh (%d, %d, %d):\n", //
+               test_dims[i].x, test_dims[i].y, test_dims[i].z);
 
-		num_failures += check_reductions(config);
-		fflush(stdout);
-	}
+        num_failures += check_reductions(config);
+        fflush(stdout);
+    }
 
-	for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
-		config.int_params[AC_nx] = test_dims[i].x;
-		config.int_params[AC_ny] = test_dims[i].y;
-		config.int_params[AC_nz] = test_dims[i].z;
-		update_config(&config);
+    for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
+        config.int_params[AC_nx] = test_dims[i].x;
+        config.int_params[AC_ny] = test_dims[i].y;
+        config.int_params[AC_nz] = test_dims[i].z;
+        update_config(&config);
 
-		printf("Testing mesh (%d, %d, %d):\n", //
-				test_dims[i].x, test_dims[i].y, test_dims[i].z);
+        printf("Testing mesh (%d, %d, %d):\n", //
+               test_dims[i].x, test_dims[i].y, test_dims[i].z);
 
-		num_failures += check_rk3(config);
-		fflush(stdout);
-	}
+        num_failures += check_rk3(config);
+        fflush(stdout);
+    }
 
-	printf("\n--------Testing done---------\n");
-	printf("Failures found: %d\n", num_failures);
+    printf("\n--------Testing done---------\n");
+    printf("Failures found: %d\n", num_failures);
 
 #if GEN_TEST_RESULT == 1
-	fflush(test_result);
-	fclose(test_result);
+    fflush(test_result);
+    fclose(test_result);
 #endif
 
-	if (num_failures > 0)
-		return EXIT_FAILURE;
-	else
-		return EXIT_SUCCESS;
+    if (num_failures > 0)
+        return EXIT_FAILURE;
+    else
+        return EXIT_SUCCESS;
 }
 
-#elif TEST_TYPE == THOROUGH_TEST // GEN TEST FILE START HERE --------------------------------------------------------------------------------------------------------------
+#elif TEST_TYPE ==                                                                                 \
+    THOROUGH_TEST // GEN TEST FILE START HERE
+                  // --------------------------------------------------------------------------------------------------------------
 typedef struct {
-	ModelScalar model;
-	AcReal candidate;
-	ModelScalar abs_error;
-	ModelScalar ulp_error;
-	ModelScalar rel_error;
-	ModelScalar maximum_magnitude;
-	ModelScalar minimum_magnitude;
+    ModelScalar model;
+    AcReal candidate;
+    ModelScalar abs_error;
+    ModelScalar ulp_error;
+    ModelScalar rel_error;
+    ModelScalar maximum_magnitude;
+    ModelScalar minimum_magnitude;
 } Error;
 
-Error get_error(ModelScalar model, AcReal candidate)
+Error
+get_error(ModelScalar model, AcReal candidate)
 {
-	Error error;
+    Error error;
+    error.abs_error = 0;
+
+    error.model     = model;
+    error.candidate = candidate;
+
+    if (error.model == error.candidate || fabsl(model - candidate) == 0) { // If exact
         error.abs_error = 0;
+        error.rel_error = 0;
+        error.ulp_error = 0;
+    }
+    else if (!is_valid(error.model) || !is_valid(error.candidate)) {
+        error.abs_error = INFINITY;
+        error.rel_error = INFINITY;
+        error.ulp_error = INFINITY;
+    }
+    else {
+        const int base = 2;
+        const int p    = sizeof(AcReal) == 4 ? 24 : 53; // Bits in the significant
 
-	error.model = model;
-	error.candidate = candidate;
+        const ModelScalar e = floorl(logl(fabsl(error.model)) / logl(2));
 
-	if (error.model == error.candidate || fabsl(model - candidate) == 0) { // If exact
-		error.abs_error = 0;
-		error.rel_error = 0;
-		error.ulp_error = 0;
-	} else if (!is_valid(error.model) || !is_valid(error.candidate)) {
-		error.abs_error = INFINITY;
-		error.rel_error = INFINITY;
-		error.ulp_error = INFINITY;
-	} else {
-		const int base = 2;
-		const int p = sizeof(AcReal) == 4 ? 24 : 53; // Bits in the significant
+        const ModelScalar ulp             = powl(base, e - (p - 1));
+        const ModelScalar machine_epsilon = 0.5 * powl(base, -(p - 1));
+        error.abs_error                   = fabsl(model - candidate);
+        error.ulp_error                   = error.abs_error / ulp;
+        error.rel_error                   = fabsl(1.0l - candidate / model) / machine_epsilon;
+    }
 
-		const ModelScalar e = floorl(logl(fabsl(error.model)) / logl(2));
-
-		const ModelScalar ulp = powl(base, e - (p-1));
-		const ModelScalar machine_epsilon = 0.5 * powl(base, -(p-1));
-		error.abs_error = fabsl(model - candidate);
-		error.ulp_error	= error.abs_error / ulp;
-		error.rel_error = fabsl(1.0l - candidate / model) / machine_epsilon;
-	}
-
-	return error;
+    return error;
 }
 
-Error get_max_abs_error_mesh(const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
+Error
+get_max_abs_error_mesh(const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
 {
-	Error error;
-        error.abs_error = -1;
+    Error error;
+    error.abs_error = -1;
 
-	for (size_t j = 0; j < NUM_VTXBUF_HANDLES; ++j) {
-		for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) {
-			Error curr_error = get_error(model_mesh.vertex_buffer[j][i], candidate_mesh.vertex_buffer[j][i]);
-			if (curr_error.abs_error > error.abs_error)
-				error = curr_error;
-		}
-	}
+    for (size_t j = 0; j < NUM_VTXBUF_HANDLES; ++j) {
+        for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) {
+            Error curr_error = get_error(model_mesh.vertex_buffer[j][i],
+                                         candidate_mesh.vertex_buffer[j][i]);
+            if (curr_error.abs_error > error.abs_error)
+                error = curr_error;
+        }
+    }
 
-	error.maximum_magnitude = -1; // Not calculated.
-	error.minimum_magnitude = -1; // Not calculated.
+    error.maximum_magnitude = -1; // Not calculated.
+    error.minimum_magnitude = -1; // Not calculated.
 
-	return error;
+    return error;
 }
 
 static ModelScalar
 get_maximum_magnitude(const ModelScalar* field, const AcMeshInfo info)
 {
-	ModelScalar maximum = -INFINITY;
+    ModelScalar maximum = -INFINITY;
 
-	for (size_t i = 0; i < AC_VTXBUF_SIZE(info); ++i)
-		maximum = max(maximum, fabsl(field[i]));
+    for (size_t i = 0; i < AC_VTXBUF_SIZE(info); ++i)
+        maximum = max(maximum, fabsl(field[i]));
 
-	return maximum;
+    return maximum;
 }
 
-
 static ModelScalar
 get_minimum_magnitude(const ModelScalar* field, const AcMeshInfo info)
 {
-	ModelScalar minimum = INFINITY;
+    ModelScalar minimum = INFINITY;
 
-	for (size_t i = 0; i < AC_VTXBUF_SIZE(info); ++i)
-		minimum = min(minimum, fabsl(field[i]));
+    for (size_t i = 0; i < AC_VTXBUF_SIZE(info); ++i)
+        minimum = min(minimum, fabsl(field[i]));
 
-	return minimum;
+    return minimum;
 }
 
-Error get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const ModelMesh& model_mesh, const AcMesh& candidate_mesh)
+Error
+get_max_abs_error_vtxbuf(const VertexBufferHandle vtxbuf_handle, const ModelMesh& model_mesh,
+                         const AcMesh& candidate_mesh)
 {
-	ModelScalar* model_vtxbuf = model_mesh.vertex_buffer[vtxbuf_handle];
-	AcReal* candidate_vtxbuf = candidate_mesh.vertex_buffer[vtxbuf_handle];
+    ModelScalar* model_vtxbuf = model_mesh.vertex_buffer[vtxbuf_handle];
+    AcReal* candidate_vtxbuf  = candidate_mesh.vertex_buffer[vtxbuf_handle];
 
-	Error error;
-        error.abs_error = -1;
+    Error error;
+    error.abs_error = -1;
 
-	for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) {
+    for (size_t i = 0; i < AC_VTXBUF_SIZE(model_mesh.info); ++i) {
 
-		Error curr_error = get_error(model_vtxbuf[i], candidate_vtxbuf[i]);
+        Error curr_error = get_error(model_vtxbuf[i], candidate_vtxbuf[i]);
 
-		if (curr_error.abs_error > error.abs_error)
-			error = curr_error;
-	}
+        if (curr_error.abs_error > error.abs_error)
+            error = curr_error;
+    }
 
+    error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
+    error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);
 
-	error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
-	error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);
-
-	return error;
+    return error;
 }
 
 void
 print_error_to_file(const char* path, const int n, const Error error)
 {
     FILE* file = fopen(path, "a");
-    fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.ulp_error, error.abs_error, error.rel_error, error.maximum_magnitude, error.minimum_magnitude);
-    //fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.maximum_magnitude, error.minimum_magnitude, error.abs_error, error.ulp_error, error.rel_error);
+    fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.ulp_error, error.abs_error,
+            error.rel_error, error.maximum_magnitude, error.minimum_magnitude);
+    // fprintf(file, "%d, %Lg, %Lg, %Lg, %Lg, %Lg\n", n, error.maximum_magnitude,
+    // error.minimum_magnitude, error.abs_error, error.ulp_error, error.rel_error);
     fclose(file);
 }
 
 #define MAX_PATH_LEN (256)
 
-int run_autotest(void)
+int
+run_autotest(void)
 {
 
 #define N_MIN (32)
 #define N_MAX (512)
-	for (int n = N_MIN; n <= N_MAX; n += N_MIN) {
-		AcMeshInfo config;
-		load_config(&config);
-		config.int_params[AC_nx] = config.int_params[AC_ny] = config.int_params[AC_nz] = n;
-		update_config(&config);
+    for (int n = N_MIN; n <= N_MAX; n += N_MIN) {
+        AcMeshInfo config;
+        load_config(&config);
+        config.int_params[AC_nx] = config.int_params[AC_ny] = config.int_params[AC_nz] = n;
+        update_config(&config);
 
-		// Init host
-		AcMesh* candidate_mesh = acmesh_create(config);
-		ModelMesh* model_mesh = modelmesh_create(config);
+        // Init host
+        AcMesh* candidate_mesh = acmesh_create(config);
+        ModelMesh* model_mesh  = modelmesh_create(config);
 
-		// Init device
-		acInit(config);
+        // Init device
+        acInit(config);
 
-		// Check all initial conditions
+        // Check all initial conditions
         for (int i = 0; i < ARRAY_SIZE(test_cases); ++i) {
             const InitType init_type = test_cases[i];
-			acmesh_init_to((InitType)init_type, candidate_mesh);
-			acmesh_to_modelmesh(*candidate_mesh, model_mesh);   // Load to Host
-			acLoad(*candidate_mesh);                             // Load to Device
+            acmesh_init_to((InitType)init_type, candidate_mesh);
+            acmesh_to_modelmesh(*candidate_mesh, model_mesh); // Load to Host
+            acLoad(*candidate_mesh);                          // Load to Device
 
-			boundconds(model_mesh->info, model_mesh);
-			acBoundcondStep();
+            boundconds(model_mesh->info, model_mesh);
+            acBoundcondStep();
 
             { // Check boundconds
                 acStore(candidate_mesh);
                 Error boundcond_error = get_max_abs_error_mesh(*model_mesh, *candidate_mesh);
                 char boundcond_path[MAX_PATH_LEN];
-                sprintf(boundcond_path, "%s_boundcond_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
+                sprintf(boundcond_path, "%s_boundcond_%s.testresult",
+                        AC_DOUBLE_PRECISION ? "double" : "float",
+                        init_type_names[(InitType)init_type]);
                 print_error_to_file(boundcond_path, n, boundcond_error);
             }
 
             { // Check scalar max reduction
-                ModelScalar model = model_reduce_scal(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX);
-                AcReal candidate = acReduceScal((ReductionType)RTYPE_MAX, VTXBUF_UUX);
+                ModelScalar model         = model_reduce_scal(*model_mesh, (ReductionType)RTYPE_MAX,
+                                                      VTXBUF_UUX);
+                AcReal candidate          = acReduceScal((ReductionType)RTYPE_MAX, VTXBUF_UUX);
                 Error scalar_reduce_error = get_error(model, candidate);
                 char scalar_reduce_path[MAX_PATH_LEN];
-                sprintf(scalar_reduce_path, "%s_scalar_reduce_%s.testresult",  AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
+                sprintf(scalar_reduce_path, "%s_scalar_reduce_%s.testresult",
+                        AC_DOUBLE_PRECISION ? "double" : "float",
+                        init_type_names[(InitType)init_type]);
                 print_error_to_file(scalar_reduce_path, n, scalar_reduce_error);
             }
 
             { // Check vector max reduction
-                ModelScalar model = model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
-                AcReal candidate = acReduceVec((ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+                ModelScalar model = model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX,
+                                                     VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+                AcReal candidate  = acReduceVec((ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
+                                               VTXBUF_UUZ);
                 Error vector_reduce_error = get_error(model, candidate);
                 char vector_reduce_path[MAX_PATH_LEN];
-                sprintf(vector_reduce_path, "%s_vector_reduce_%s.testresult",  AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type]);
+                sprintf(vector_reduce_path, "%s_vector_reduce_%s.testresult",
+                        AC_DOUBLE_PRECISION ? "double" : "float",
+                        init_type_names[(InitType)init_type]);
                 print_error_to_file(vector_reduce_path, n, vector_reduce_error);
             }
 
             // Time advance
             {
-                const AcReal umax =  (AcReal)model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
-                const AcReal dt = host_timestep(umax, config);
+                const AcReal umax = (AcReal)model_reduce_vec(*model_mesh, (ReductionType)RTYPE_MAX,
+                                                             VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+                const AcReal dt   = host_timestep(umax, config);
 
                 // Host integration step
                 model_rk3(dt, model_mesh);
@@ -699,25 +711,31 @@ int run_autotest(void)
 
                 // Check fields
                 for (int vtxbuf_handle = 0; vtxbuf_handle < NUM_VTXBUF_HANDLES; ++vtxbuf_handle) {
-                    Error field_error = get_max_abs_error_vtxbuf((VertexBufferHandle)vtxbuf_handle, *model_mesh, *candidate_mesh);
+                    Error field_error = get_max_abs_error_vtxbuf((VertexBufferHandle)vtxbuf_handle,
+                                                                 *model_mesh, *candidate_mesh);
 
-			printf("model %Lg, cand %Lg, abs %Lg, rel %Lg\n", (ModelScalar)field_error.model, (ModelScalar)field_error.candidate, (ModelScalar)field_error.abs_error, (ModelScalar)field_error.rel_error);
+                    printf("model %Lg, cand %Lg, abs %Lg, rel %Lg\n",
+                           (ModelScalar)field_error.model, (ModelScalar)field_error.candidate,
+                           (ModelScalar)field_error.abs_error, (ModelScalar)field_error.rel_error);
 
                     char field_path[MAX_PATH_LEN];
-                    sprintf(field_path, "%s_integrationstep_%s_%s.testresult", AC_DOUBLE_PRECISION ? "double" : "float", init_type_names[(InitType)init_type], vtxbuf_names[(VertexBufferHandle)vtxbuf_handle]);
+                    sprintf(field_path, "%s_integrationstep_%s_%s.testresult",
+                            AC_DOUBLE_PRECISION ? "double" : "float",
+                            init_type_names[(InitType)init_type],
+                            vtxbuf_names[(VertexBufferHandle)vtxbuf_handle]);
                     print_error_to_file(field_path, n, field_error);
                 }
             }
-		}
+        }
 
-		// Deallocate host
-		acmesh_destroy(candidate_mesh);
-		modelmesh_destroy(model_mesh);
+        // Deallocate host
+        acmesh_destroy(candidate_mesh);
+        modelmesh_destroy(model_mesh);
 
-		// Deallocate device
-		acQuit();
-	}
+        // Deallocate device
+        acQuit();
+    }
 
-	return 0;
+    return 0;
 }
 #endif
diff --git a/src/standalone/benchmark.cc b/src/standalone/benchmark.cc
index a5f163a..05f4b32 100644
--- a/src/standalone/benchmark.cc
+++ b/src/standalone/benchmark.cc
@@ -35,10 +35,10 @@
 #include "model/model_rk3.h"
 #include "timer_hires.h"
 
-#include <vector>
+#include "src/core/errchk.h"
 #include <algorithm>
 #include <math.h>
-#include "src/core/errchk.h"
+#include <vector>
 
 static bool
 smaller_than(const double& a, const double& b)
@@ -47,7 +47,8 @@ smaller_than(const double& a, const double& b)
 }
 
 static int
-write_runningtimes(const char* path, const int n, const double min, const double max, const double median, const double perc)
+write_runningtimes(const char* path, const int n, const double min, const double max,
+                   const double median, const double perc)
 {
     FILE* fp;
     fp = fopen(path, "a");
@@ -80,7 +81,8 @@ int
 run_benchmark(void)
 {
     char runningtime_path[256];
-    sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
+    sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float",
+            GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
 
     FILE* fp;
     fp = fopen(runningtime_path, "w");
@@ -88,13 +90,14 @@ run_benchmark(void)
     if (fp != NULL) {
         fprintf(fp, "n, min, max, median, perc\n");
         fclose(fp);
-    } else {
+    }
+    else {
         return EXIT_FAILURE;
     }
 
-    #define N_STEP_SIZE (128)
-    #define MAX_MESH_DIM (128)
-    #define NUM_ITERS (100)
+#define N_STEP_SIZE (128)
+#define MAX_MESH_DIM (128)
+#define NUM_ITERS (100)
     for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) {
         /* Parse configs */
         AcMeshInfo mesh_info;
@@ -113,7 +116,6 @@ run_benchmark(void)
         std::vector<double> results;
         results.reserve(NUM_ITERS);
 
-
         // Warmup
         for (int i = 0; i < 10; ++i) {
             acIntegrate(0);
@@ -124,28 +126,35 @@ run_benchmark(void)
         for (int i = 0; i < NUM_ITERS; ++i) {
 
             timer_reset(&t);
-            #if GEN_BENCHMARK_RK3 == 1
+#if GEN_BENCHMARK_RK3 == 1
             acIntegrateStep(2, FLT_EPSILON);
-            #else // GEN_BENCHMARK_FULL
-            //const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
-            const AcReal dt   = AcReal(1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
+#else // GEN_BENCHMARK_FULL
+            // const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+            const AcReal dt = AcReal(
+                1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
             acIntegrate(dt);
-            #endif
+#endif
             acSynchronize();
 
             const double ms_elapsed = timer_diff_nsec(t) / 1e6;
             results.push_back(ms_elapsed);
         }
 
-        #define NTH_PERCENTILE (0.95)
+#define NTH_PERCENTILE (0.95)
         std::sort(results.begin(), results.end(), smaller_than);
-        write_runningtimes(runningtime_path, n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
+        write_runningtimes(runningtime_path, n, results[0], results[results.size() - 1],
+                           results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
 
         char percentile_path[256];
-        sprintf(percentile_path, "%d_%s_%s_percentiles.out", n, AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
+        sprintf(percentile_path, "%d_%s_%s_percentiles.out", n,
+                AC_DOUBLE_PRECISION ? "double" : "float",
+                GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
         write_percentiles(percentile_path, NUM_ITERS, results);
 
-        printf("%s running time %g ms, (%dth percentile, nx = %d) \n", GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep", double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]);
+        printf("%s running time %g ms, (%dth percentile, nx = %d) \n",
+               GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep",
+               double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100),
+               mesh_info.int_params[AC_nx]);
 
         acStore(mesh);
         acQuit();
@@ -225,7 +234,8 @@ run_benchmark(void)
     return 0;
 }
 
-#else //////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
+#else
+//////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
 
 
 
@@ -290,8 +300,8 @@ run_benchmark(void)
 
         #define NTH_PERCENTILE (0.95)
         std::sort(results.begin(), results.end(), smaller_than);
-        write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
-        write_percentiles(n, NUM_ITERS, results);
+        write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)],
+results[int(NTH_PERCENTILE * NUM_ITERS)]); write_percentiles(n, NUM_ITERS, results);
     }
 
     return 0;
diff --git a/src/standalone/config_loader.cc b/src/standalone/config_loader.cc
index 6054802..637bcd4 100644
--- a/src/standalone/config_loader.cc
+++ b/src/standalone/config_loader.cc
@@ -63,7 +63,7 @@ parse_config(const char* path, AcMeshInfo* config)
 {
     FILE* fp;
     fp = fopen(path, "r");
-    // For knowing which .conf file will be used 
+    // For knowing which .conf file will be used
     printf("Config file path: \n %s \n ", path);
     ERRCHK(fp != NULL);
 
@@ -79,8 +79,7 @@ parse_config(const char* path, AcMeshInfo* config)
         int idx = -1;
         if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0)
             config->int_params[idx] = atoi(value);
-        else if ((idx = find_str(keyword, realparam_names,
-                                 NUM_REAL_PARAM_TYPES)) >= 0)
+        else if ((idx = find_str(keyword, realparam_names, NUM_REAL_PARAM_TYPES)) >= 0)
             config->real_params[idx] = AcReal(atof(value));
     }
 
@@ -92,68 +91,66 @@ update_config(AcMeshInfo* config)
 {
     config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
     ///////////// PAD TEST
-    //config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
+    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
     ///////////// PAD TEST
     config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
     config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
 
     // Bounds for the computational domain, i.e. nx_min <= i < nx_max
     config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
-    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] +
-                                    config->int_params[AC_nx];
+    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
     config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
-    config->int_params[AC_ny_max] = config->int_params[AC_ny] +
-                                    STENCIL_ORDER / 2;
+    config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
     config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
-    config->int_params[AC_nz_max] = config->int_params[AC_nz] +
-                                    STENCIL_ORDER / 2;
+    config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;
 
     // Spacing
     config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx];
     config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy];
     config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz];
-    config->real_params[AC_dsmin] = min(config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));
+    config->real_params[AC_dsmin]   = min(
+        config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));
 
     // Real grid coordanates (DEFINE FOR GRID WITH THE GHOST ZONES)
-    config->real_params[AC_xlen] = config->real_params[AC_dsx]*config->int_params[AC_mx]; 
-    config->real_params[AC_ylen] = config->real_params[AC_dsy]*config->int_params[AC_my];
-    config->real_params[AC_zlen] = config->real_params[AC_dsz]*config->int_params[AC_mz];
+    config->real_params[AC_xlen] = config->real_params[AC_dsx] * config->int_params[AC_mx];
+    config->real_params[AC_ylen] = config->real_params[AC_dsy] * config->int_params[AC_my];
+    config->real_params[AC_zlen] = config->real_params[AC_dsz] * config->int_params[AC_mz];
 
-    config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen];  
-    config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen]; 
-    config->real_params[AC_zorig] = AcReal(.5) * config->real_params[AC_zlen]; 
+    config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen];
+    config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen];
+    config->real_params[AC_zorig] = AcReal(.5) * config->real_params[AC_zlen];
 
     /* Additional helper params */
     // Int helpers
-    config->int_params[AC_mxy] = config->int_params[AC_mx] *
-                                 config->int_params[AC_my];
-    config->int_params[AC_nxy] = config->int_params[AC_nx] *
-                                 config->int_params[AC_ny];
-    config->int_params[AC_nxyz] = config->int_params[AC_nxy] *
-                                  config->int_params[AC_nz];
+    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
+    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
+    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
 
     // Real helpers
     config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] *
                                         config->real_params[AC_cs_sound];
 
-    config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] / config->real_params[AC_gamma];
+    config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] /
+                                       config->real_params[AC_gamma];
 
-    AcReal G_CONST_CGS = AcReal(6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module
-    AcReal M_sun       = AcReal(1.989e33);  // g solar mass
+    AcReal G_CONST_CGS = AcReal(
+        6.674e-8);                   // g/cm3/s GGS definition //TODO define in a separate module
+    AcReal M_sun = AcReal(1.989e33); // g solar mass
 
-    config->real_params[AC_M_star] = config->real_params[AC_M_star]*M_sun / 
-                                     ( (config->real_params[AC_unit_length]*
-                                        config->real_params[AC_unit_length]*
-                                        config->real_params[AC_unit_length]) * 
-                                        config->real_params[AC_unit_density] ) ;
+    config->real_params[AC_M_star] = config->real_params[AC_M_star] * M_sun /
+                                     ((config->real_params[AC_unit_length] *
+                                       config->real_params[AC_unit_length] *
+                                       config->real_params[AC_unit_length]) *
+                                      config->real_params[AC_unit_density]);
 
-    config->real_params[AC_G_CONST] = G_CONST_CGS / 
-                                      ( (config->real_params[AC_unit_velocity]*config->real_params[AC_unit_velocity]) /
-                                        (config->real_params[AC_unit_density] *config->real_params[AC_unit_length]) ) ;
-
-    config->real_params[AC_GM_star]  = config->real_params[AC_M_star]*config->real_params[AC_G_CONST];
-    config->real_params[AC_sq2GM_star]  = AcReal(sqrt(AcReal(2)*config->real_params[AC_GM_star]));
+    config->real_params[AC_G_CONST] = G_CONST_CGS / ((config->real_params[AC_unit_velocity] *
+                                                      config->real_params[AC_unit_velocity]) /
+                                                     (config->real_params[AC_unit_density] *
+                                                      config->real_params[AC_unit_length]));
 
+    config->real_params[AC_GM_star] = config->real_params[AC_M_star] *
+                                      config->real_params[AC_G_CONST];
+    config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2) * config->real_params[AC_GM_star]));
 
     const bool print_config = true;
     if (print_config) {
diff --git a/src/standalone/renderer.cc b/src/standalone/renderer.cc
index 7f5f3e7..9024627 100644
--- a/src/standalone/renderer.cc
+++ b/src/standalone/renderer.cc
@@ -50,12 +50,12 @@ static const int window_bpp = 32; // Bits per pixel
 SDL_Surface* surfaces[NUM_VTXBUF_HANDLES];
 static int datasurface_width  = -1;
 static int datasurface_height = -1;
-static int k_slice = 0;
-static int k_slice_max = 0;
+static int k_slice            = 0;
+static int k_slice_max        = 0;
 
 // Colors
-static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255};
-static const int num_tiles = NUM_VTXBUF_HANDLES + 1;
+static SDL_Color color_bg      = (SDL_Color){30, 30, 35, 255};
+static const int num_tiles     = NUM_VTXBUF_HANDLES + 1;
 static const int tiles_per_row = 3;
 
 /*
@@ -82,10 +82,9 @@ static Camera camera = (Camera){(float2){.0f, .0f}, 1.f};
 static inline vec4
 project_ortho(const float2& pos, const float2& bbox, const float2& wdims)
 {
-    const vec4 rect = (vec4){
-        camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x,
-        camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y,
-        camera.scale * bbox.x, camera.scale * bbox.y};
+    const vec4 rect = (vec4){camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x,
+                             camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y,
+                             camera.scale * bbox.x, camera.scale * bbox.y};
 
     return rect;
 }
@@ -103,13 +102,12 @@ renderer_init(const int& mx, const int& my)
     SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS);
 
     // Setup window
-    window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED,
-                              SDL_WINDOWPOS_UNDEFINED, window_width,
-                              window_height, SDL_WINDOW_SHOWN);
+    window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
+                              window_width, window_height, SDL_WINDOW_SHOWN);
 
     // Setup SDL renderer
     renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
-    //SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP);
+    // SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP);
     SDL_GetWindowSize(window, &window_width, &window_height);
 
     SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering
@@ -118,24 +116,24 @@ renderer_init(const int& mx, const int& my)
     datasurface_height = my;
     // vec drawing uses the surface of the first component, no memory issues here
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
-        surfaces[i] = SDL_CreateRGBSurfaceWithFormat(
-            0, datasurface_width, datasurface_height, window_bpp,
-            SDL_PIXELFORMAT_RGBA8888);
+        surfaces[i] = SDL_CreateRGBSurfaceWithFormat(0, datasurface_width, datasurface_height,
+                                                     window_bpp, SDL_PIXELFORMAT_RGBA8888);
 
-    camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
-                          -.5f * (num_tiles / tiles_per_row) * datasurface_height + .5f * datasurface_height};
+    camera.pos   = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
+                          -.5f * (num_tiles / tiles_per_row) * datasurface_height +
+                              .5f * datasurface_height};
     camera.scale = min(window_width / float(datasurface_width * tiles_per_row),
-                       window_height / float(datasurface_height * (num_tiles/tiles_per_row)));
+                       window_height / float(datasurface_height * (num_tiles / tiles_per_row)));
 
     SDL_RendererInfo renderer_info;
     SDL_GetRendererInfo(renderer, &renderer_info);
-    printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width, renderer_info.max_texture_height);
+    printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width,
+           renderer_info.max_texture_height);
     return 0;
 }
 
 static int
-set_pixel(const int& i, const int& j, const uint32_t& color,
-          SDL_Surface* surface)
+set_pixel(const int& i, const int& j, const uint32_t& color, SDL_Surface* surface)
 {
     uint32_t* pixels           = (uint32_t*)surface->pixels;
     pixels[i + j * surface->w] = color;
@@ -143,22 +141,21 @@ set_pixel(const int& i, const int& j, const uint32_t& color,
 }
 
 static int
-draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
-                   const int& tile)
+draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, const int& tile)
 {
     const float xoffset = (tile % tiles_per_row) * datasurface_width;
-    const float yoffset = - (tile / tiles_per_row) * datasurface_height;
+    const float yoffset = -(tile / tiles_per_row) * datasurface_height;
 
     /*
     const float max = float(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer));
     const float min = float(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer));
     */
-    const float max = float(acReduceScal(RTYPE_MAX, vertex_buffer));
-    const float min = float(acReduceScal(RTYPE_MIN, vertex_buffer));
+    const float max   = float(acReduceScal(RTYPE_MAX, vertex_buffer));
+    const float min   = float(acReduceScal(RTYPE_MIN, vertex_buffer));
     const float range = fabsf(max - min);
     const float mid   = max - .5f * range;
 
-    const int k = k_slice; //mesh.info.int_params[AC_mz] / 2;
+    const int k = k_slice; // mesh.info.int_params[AC_mz] / 2;
 
     for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
         for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
@@ -166,29 +163,23 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
 
             const int idx       = AC_VTXBUF_IDX(i, j, k, mesh.info);
             const uint8_t shade = (uint8_t)(
-                255.f *
-                (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) /
-                range);
+                255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) / range);
             uint8_t color[4]            = {0, 0, 0, 255};
             color[tile % 3]             = shade;
-            const uint32_t mapped_color = SDL_MapRGBA(
-                surfaces[vertex_buffer]->format, color[0], color[1], color[2],
-                color[3]);
+            const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer]->format, color[0],
+                                                      color[1], color[2], color[3]);
             set_pixel(i, j, mapped_color, surfaces[vertex_buffer]);
         }
     }
 
     const float2 pos   = (float2){xoffset, yoffset};
-    const float2 bbox  = (float2){.5f * datasurface_width,
-                                 .5f * datasurface_height};
+    const float2 bbox  = (float2){.5f * datasurface_width, .5f * datasurface_height};
     const float2 wsize = (float2){float(window_width), float(window_height)};
     const vec4 rectf   = project_ortho(pos, bbox, wsize);
-    SDL_Rect rect      = (SDL_Rect){
-        int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
-        int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
+    SDL_Rect rect      = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
+                               int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
 
-    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
-                                                    surfaces[vertex_buffer]);
+    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer]);
     SDL_RenderCopy(renderer, tex, NULL, &rect);
     SDL_DestroyTexture(tex);
 
@@ -196,14 +187,12 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
 }
 
 static int
-draw_vertex_buffer_vec(const AcMesh& mesh,
-                       const VertexBufferHandle& vertex_buffer_a,
+draw_vertex_buffer_vec(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer_a,
                        const VertexBufferHandle& vertex_buffer_b,
-                       const VertexBufferHandle& vertex_buffer_c,
-                       const int& tile)
+                       const VertexBufferHandle& vertex_buffer_c, const int& tile)
 {
     const float xoffset = (tile % tiles_per_row) * datasurface_width;
-    const float yoffset = - (tile / tiles_per_row) * datasurface_height;
+    const float yoffset = -(tile / tiles_per_row) * datasurface_height;
 
     /*
     const float maxx = float(
@@ -215,52 +204,41 @@ draw_vertex_buffer_vec(const AcMesh& mesh,
             min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b),
                 model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c))));
     */
-    const float maxx = float(
-        max(acReduceScal(RTYPE_MAX, vertex_buffer_a),
-            max(acReduceScal(RTYPE_MAX, vertex_buffer_b),
-                acReduceScal(RTYPE_MAX, vertex_buffer_c))));
-    const float minn = float(
-        min(acReduceScal(RTYPE_MIN, vertex_buffer_a),
-            min(acReduceScal(RTYPE_MIN, vertex_buffer_b),
-                acReduceScal(RTYPE_MIN, vertex_buffer_c))));
+    const float maxx  = float(max(
+        acReduceScal(RTYPE_MAX, vertex_buffer_a),
+        max(acReduceScal(RTYPE_MAX, vertex_buffer_b), acReduceScal(RTYPE_MAX, vertex_buffer_c))));
+    const float minn  = float(min(
+        acReduceScal(RTYPE_MIN, vertex_buffer_a),
+        min(acReduceScal(RTYPE_MIN, vertex_buffer_b), acReduceScal(RTYPE_MIN, vertex_buffer_c))));
     const float range = fabsf(maxx - minn);
     const float mid   = maxx - .5f * range;
 
-    const int k = k_slice; //mesh.info.int_params[AC_mz] / 2;
+    const int k = k_slice; // mesh.info.int_params[AC_mz] / 2;
     for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
         for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
             ERRCHK(i < datasurface_width && j < datasurface_height);
 
             const int idx   = AC_VTXBUF_IDX(i, j, k, mesh.info);
             const uint8_t r = (uint8_t)(
-                255.f *
-                (fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) /
-                range);
+                255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) / range);
             const uint8_t g = (uint8_t)(
-                255.f *
-                (fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) /
-                range);
+                255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) / range);
             const uint8_t b = (uint8_t)(
-                255.f *
-                (fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) /
-                range);
-            const uint32_t mapped_color = SDL_MapRGBA(
-                surfaces[vertex_buffer_a]->format, r, g, b, 255);
+                255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) / range);
+            const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer_a]->format, r, g, b,
+                                                      255);
             set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]);
         }
     }
 
     const float2 pos   = (float2){xoffset, yoffset};
-    const float2 bbox  = (float2){.5f * datasurface_width,
-                                 .5f * datasurface_height};
+    const float2 bbox  = (float2){.5f * datasurface_width, .5f * datasurface_height};
     const float2 wsize = (float2){float(window_width), float(window_height)};
     const vec4 rectf   = project_ortho(pos, bbox, wsize);
-    SDL_Rect rect      = (SDL_Rect){
-        int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
-        int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
+    SDL_Rect rect      = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
+                               int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
 
-    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer,
-                                                    surfaces[vertex_buffer_a]);
+    SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer_a]);
     SDL_RenderCopy(renderer, tex, NULL, &rect);
     SDL_DestroyTexture(tex);
 
@@ -272,13 +250,11 @@ renderer_draw(const AcMesh& mesh)
 {
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
         draw_vertex_buffer(mesh, VertexBufferHandle(i), i);
-    draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ,
-                           NUM_VTXBUF_HANDLES);
+    draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, NUM_VTXBUF_HANDLES);
 
     // Drawing done, present
     SDL_RenderPresent(renderer);
-    SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b,
-                           color_bg.a);
+    SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b, color_bg.a);
     SDL_RenderClear(renderer);
 
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
@@ -404,14 +380,14 @@ run_renderer(void)
 
 /* Step the simulation */
 #if 1
-        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
-                                        VTXBUF_UUZ);
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
         const AcReal dt   = host_timestep(umax, mesh_info);
         acIntegrate(dt);
 #else
         ModelMesh* model_mesh = modelmesh_create(mesh->info);
-        const AcReal umax = AcReal(model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
-        const AcReal dt   = host_timestep(umax, mesh_info);
+        const AcReal umax     = AcReal(
+            model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
+        const AcReal dt = host_timestep(umax, mesh_info);
         acmesh_to_modelmesh(*mesh, model_mesh);
         model_rk3(dt, model_mesh);
         modelmesh_to_acmesh(*model_mesh, mesh);
@@ -425,7 +401,7 @@ run_renderer(void)
         /* Render */
         const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
         if (timer_diff_sec >= desired_frame_time) {
-            //acStore(mesh);
+            // acStore(mesh);
             const int num_vertices = mesh->info.int_params[AC_mxy];
             const int3 dst         = (int3){0, 0, k_slice};
             acStoreWithOffset(dst, num_vertices, mesh);
diff --git a/src/standalone/simulation.cc b/src/standalone/simulation.cc
index efa198a..4a4b6a4 100644
--- a/src/standalone/simulation.cc
+++ b/src/standalone/simulation.cc
@@ -60,23 +60,23 @@ print_diagnostics(const AcMesh& mesh, const int& step, const AcReal& dt)
 }
 */
 
-//Write all setting info into a separate ascii file. This is done to guarantee
-//that we have the data specifi information in the thing, even though in
-//principle these things are in the astaroth.conf.
-static inline 
-void write_mesh_info(const AcMeshInfo* config)
+// Write all setting info into a separate ascii file. This is done to guarantee
+// that we have the data specifi information in the thing, even though in
+// principle these things are in the astaroth.conf.
+static inline void
+write_mesh_info(const AcMeshInfo* config)
 {
- 
+
     FILE* infotxt;
 
-    infotxt = fopen("purge.sh","w");
+    infotxt = fopen("purge.sh", "w");
     fprintf(infotxt, "#!/bin/bash\n");
     fprintf(infotxt, "rm *.list *.mesh *.ts purge.sh\n");
-    fclose(infotxt);   
+    fclose(infotxt);
 
-    infotxt = fopen("mesh_info.list","w");
+    infotxt = fopen("mesh_info.list", "w");
 
-    //Total grid dimensions
+    // Total grid dimensions
     fprintf(infotxt, "int  AC_mx        %i \n", config->int_params[AC_mx]);
     fprintf(infotxt, "int  AC_my        %i \n", config->int_params[AC_my]);
     fprintf(infotxt, "int  AC_mz        %i \n", config->int_params[AC_mz]);
@@ -96,30 +96,28 @@ void write_mesh_info(const AcMeshInfo* config)
     fprintf(infotxt, "real AC_inv_dsx   %e \n", (double)config->real_params[AC_inv_dsx]);
     fprintf(infotxt, "real AC_inv_dsy   %e \n", (double)config->real_params[AC_inv_dsy]);
     fprintf(infotxt, "real AC_inv_dsz   %e \n", (double)config->real_params[AC_inv_dsz]);
-    fprintf(infotxt, "real AC_dsmin     %e \n", (double)config->real_params[AC_dsmin  ]);
+    fprintf(infotxt, "real AC_dsmin     %e \n", (double)config->real_params[AC_dsmin]);
 
     /* Additional helper params */
     // Int helpers
-    fprintf(infotxt, "int  AC_mxy       %i \n", config->int_params[AC_mxy ]);
-    fprintf(infotxt, "int  AC_nxy       %i \n", config->int_params[AC_nxy ]);
+    fprintf(infotxt, "int  AC_mxy       %i \n", config->int_params[AC_mxy]);
+    fprintf(infotxt, "int  AC_nxy       %i \n", config->int_params[AC_nxy]);
     fprintf(infotxt, "int  AC_nxyz      %i \n", config->int_params[AC_nxyz]);
 
     // Real helpers
     fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]);
-    fprintf(infotxt, "real AC_cv_sound  %e \n", (double)config->real_params[AC_cv_sound ]);
+    fprintf(infotxt, "real AC_cv_sound  %e \n", (double)config->real_params[AC_cv_sound]);
 
     fclose(infotxt);
 }
 
-
-//This funtion writes a run state into a set of C binaries. For the sake of
-//accuracy, all floating point numbers are to be saved in long double precision
-//regardless of the choise of accuracy during runtime. 
+// This funtion writes a run state into a set of C binaries. For the sake of
+// accuracy, all floating point numbers are to be saved in long double precision
+// regardless of the choise of accuracy during runtime.
 static inline void
-save_mesh(const AcMesh &save_mesh, const int step, 
-          const AcReal t_step)
+save_mesh(const AcMesh& save_mesh, const int step, const AcReal t_step)
 {
-    FILE* save_ptr;  
+    FILE* save_ptr;
 
     for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
         const size_t n = AC_VTXBUF_SIZE(save_mesh.info);
@@ -128,7 +126,7 @@ save_mesh(const AcMesh &save_mesh, const int step,
         char cstep[10];
         char bin_filename[80] = "\0";
 
-        //sprintf(bin_filename, "");
+        // sprintf(bin_filename, "");
 
         sprintf(cstep, "%d", step);
 
@@ -139,30 +137,27 @@ save_mesh(const AcMesh &save_mesh, const int step,
 
         printf("Savefile %s \n", bin_filename);
 
-        save_ptr = fopen(bin_filename,"wb");
+        save_ptr = fopen(bin_filename, "wb");
 
-        //Start file with time stamp
-        long double write_long_buf =  (long double) t_step;
+        // Start file with time stamp
+        long double write_long_buf = (long double)t_step;
         fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
-        //Grid data
+        // Grid data
         for (size_t i = 0; i < n; ++i) {
-            const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i];
-            long double write_long_buf =  (long double) point_val;
+            const AcReal point_val     = save_mesh.vertex_buffer[VertexBufferHandle(w)][i];
+            long double write_long_buf = (long double)point_val;
             fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
         }
         fclose(save_ptr);
     }
-
 }
 
-
-
 // This function prints out the diagnostic values to std.out and also saves and
-// appends an ascii file to contain all the result. 
+// appends an ascii file to contain all the result.
 static inline void
-print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *diag_file)
+print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE* diag_file)
 {
-    
+
     AcReal buf_rms, buf_max, buf_min;
     const int max_name_width = 16;
 
@@ -172,20 +167,19 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
     buf_rms = acReduceVec(RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
 
     // MV: The ordering in the earlier version was wrong in terms of variable
-    // MV: name and its diagnostics. 
+    // MV: name and its diagnostics.
     printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt));
-    printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total",
-           double(buf_min), double(buf_rms), double(buf_max));
-    fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), 
-           double(buf_min), double(buf_rms), double(buf_max));
-    
+    printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total", double(buf_min),
+           double(buf_rms), double(buf_max));
+    fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), double(buf_min),
+            double(buf_rms), double(buf_max));
 
     // Calculate rms, min and max from the variables as scalars
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
         buf_max = acReduceScal(RTYPE_MAX, VertexBufferHandle(i));
         buf_min = acReduceScal(RTYPE_MIN, VertexBufferHandle(i));
         buf_rms = acReduceScal(RTYPE_RMS, VertexBufferHandle(i));
-        
+
         printf("  %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, vtxbuf_names[i],
                double(buf_min), double(buf_rms), double(buf_max));
         fprintf(diag_file, "%e %e %e ", double(buf_min), double(buf_rms), double(buf_max));
@@ -194,11 +188,11 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
     fprintf(diag_file, "\n");
 }
 
-    /* 
-        MV NOTE: At the moment I have no clear idea how to calculate magnetic
-        diagnostic variables from grid. Vector potential measures have a limited
-        value. TODO: Smart way to get brms, bmin and bmax.
-    */ 
+/*
+    MV NOTE: At the moment I have no clear idea how to calculate magnetic
+    diagnostic variables from grid. Vector potential measures have a limited
+    value. TODO: Smart way to get brms, bmin and bmax.
+*/
 
 int
 run_simulation(void)
@@ -213,16 +207,16 @@ run_simulation(void)
     acInit(mesh_info);
     acLoad(*mesh);
 
-
-    FILE *diag_file;
+    FILE* diag_file;
     diag_file = fopen("timeseries.ts", "a");
-    // TODO Get time from earlier state. 
+    // TODO Get time from earlier state.
     AcReal t_step = 0.0;
 
     // Generate the title row.
     fprintf(diag_file, "step  t_step  dt  uu_total_min  uu_total_rms  uu_total_max  ");
     for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
-        fprintf(diag_file, "%s_min  %s_rms  %s_max  ", vtxbuf_names[i], vtxbuf_names[i], vtxbuf_names[i]);
+        fprintf(diag_file, "%s_min  %s_rms  %s_max  ", vtxbuf_names[i], vtxbuf_names[i],
+                vtxbuf_names[i]);
     }
 
     fprintf(diag_file, "\n");
@@ -234,56 +228,54 @@ run_simulation(void)
     acStore(mesh);
     save_mesh(*mesh, 0, t_step);
 
-    const int max_steps = mesh_info.int_params[AC_max_steps];
+    const int max_steps  = mesh_info.int_params[AC_max_steps];
     const int save_steps = mesh_info.int_params[AC_save_steps];
-    const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; //TODO Get from mesh_info
+    const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; // TODO Get from mesh_info
 
     AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t];
     AcReal bin_crit_t = bin_save_t;
 
     /* Step the simulation */
     for (int i = 1; i < max_steps; ++i) {
-        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY,
-                                        VTXBUF_UUZ);
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
         const AcReal dt   = host_timestep(umax, mesh_info);
         acIntegrate(dt);
 
-        t_step += dt; 
+        t_step += dt;
 
         /* Save the simulation state and print diagnostics */
         if ((i % save_steps) == 0) {
 
             /*
-		print_diagnostics() writes out both std.out printout from the
-		results and saves the diagnostics into a table for ascii file
+                print_diagnostics() writes out both std.out printout from the
+                results and saves the diagnostics into a table for ascii file
                 timeseries.ts.
             */
 
             print_diagnostics(i, dt, t_step, diag_file);
 
             /*
-		We would also might want an XY-average calculating funtion,
-		which can be very useful when observing behaviour of turbulent
+                We would also might want an XY-average calculating funtion,
+                which can be very useful when observing behaviour of turbulent
                 simulations. (TODO)
             */
-
         }
 
         /* Save the simulation state and print diagnostics */
         if ((i % bin_save_steps) == 0 || t_step >= bin_crit_t) {
 
             /*
-		This loop saves the data into simple C binaries which can be
+                This loop saves the data into simple C binaries which can be
                 used for analysing the data snapshots closely.
- 
-                Saving simulation state should happen in a separate stage. We do 
-                not want to save it as often as diagnostics. The file format 
-                should IDEALLY be HDF5 which has become a well supported, portable and 
-		reliable data format when it comes to HPC applications.
-		However, implementing it will have to for more simpler approach
+
+                Saving simulation state should happen in a separate stage. We do
+                not want to save it as often as diagnostics. The file format
+                should IDEALLY be HDF5 which has become a well supported, portable and
+                reliable data format when it comes to HPC applications.
+                However, implementing it will have to for more simpler approach
                 to function. (TODO?)
             */
-                
+
             /*
                 The updated mesh will be located on the GPU. Also all calls
                 to the astaroth interface (functions beginning with ac*) are
@@ -299,10 +291,8 @@ run_simulation(void)
 
             save_mesh(*mesh, i, t_step);
 
-            bin_crit_t += bin_save_t; 
-
+            bin_crit_t += bin_save_t;
         }
-
     }
 
     //////Save the final snapshot
@@ -318,25 +308,3 @@ run_simulation(void)
 
     return 0;
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/src/standalone/timer_hires.h b/src/standalone/timer_hires.h
index 6b9723b..3a79419 100644
--- a/src/standalone/timer_hires.h
+++ b/src/standalone/timer_hires.h
@@ -52,8 +52,7 @@ timer_diff_nsec(const Timer start)
 {
     Timer end;
     timer_reset(&end);
-    const long diff = (end.tv_sec - start.tv_sec) * 1000000000l +
-                      (end.tv_nsec - start.tv_nsec);
+    const long diff = (end.tv_sec - start.tv_sec) * 1000000000l + (end.tv_nsec - start.tv_nsec);
     return diff;
 }