Autoformatted all CUDA/C/C++ code

This commit is contained in:
jpekkila
2019-06-18 16:42:56 +03:00
parent 6fdc4cddb2
commit 8864266042
12 changed files with 1053 additions and 1111 deletions

View File

@@ -316,9 +316,13 @@ traverse(const ASTNode* node)
if (symbol_table[i].type_qualifier == IN) { if (symbol_table[i].type_qualifier == IN) {
printf("const %sData %s = READ(%s%s);\n", translate(symbol_table[i].type_specifier), printf("const %sData %s = READ(%s%s);\n", translate(symbol_table[i].type_specifier),
symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
} else if (symbol_table[i].type_qualifier == OUT) { }
printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); else if (symbol_table[i].type_qualifier == OUT) {
//printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n", translate(symbol_table[i].type_specifier), symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier); printf("%s %s = READ_OUT(%s%s);", translate(symbol_table[i].type_specifier),
symbol_table[i].identifier, inout_name_prefix, symbol_table[i].identifier);
// printf("%s %s = buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)];\n",
// translate(symbol_table[i].type_specifier), symbol_table[i].identifier,
// inout_name_prefix, symbol_table[i].identifier);
} }
} }
} }
@@ -326,8 +330,7 @@ traverse(const ASTNode* node)
// Preprocessed parameter boilerplate // Preprocessed parameter boilerplate
if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED) if (node->type == NODE_TYPE_QUALIFIER && node->token == PREPROCESSED)
inside_preprocessed = true; inside_preprocessed = true;
static const char static const char preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
preprocessed_parameter_boilerplate[] = "const int3 vertexIdx, ";
if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION) if (inside_preprocessed && node->type == NODE_FUNCTION_PARAMETER_DECLARATION)
printf("%s ", preprocessed_parameter_boilerplate); printf("%s ", preprocessed_parameter_boilerplate);
// BOILERPLATE END//////////////////////////////////////////////////////// // BOILERPLATE END////////////////////////////////////////////////////////
@@ -343,7 +346,6 @@ traverse(const ASTNode* node)
if (node->type == NODE_FUNCTION_DECLARATION) if (node->type == NODE_FUNCTION_DECLARATION)
inside_function_declaration = false; inside_function_declaration = false;
// If the node is a subscript expression and the expression list inside it is not empty // If the node is a subscript expression and the expression list inside it is not empty
if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs) if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
printf("IDX("); printf("IDX(");
@@ -354,7 +356,7 @@ traverse(const ASTNode* node)
if (handle >= 0) { // The variable exists in the symbol table if (handle >= 0) { // The variable exists in the symbol table
const Symbol* symbol = &symbol_table[handle]; const Symbol* symbol = &symbol_table[handle];
//if (symbol->type_qualifier == OUT) { // if (symbol->type_qualifier == OUT) {
// printf("%s%s", inout_name_prefix, symbol->identifier); // printf("%s%s", inout_name_prefix, symbol->identifier);
//} //}
if (symbol->type_qualifier == UNIFORM) { if (symbol->type_qualifier == UNIFORM) {
@@ -394,14 +396,16 @@ traverse(const ASTNode* node)
// Postfix logic %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // Postfix logic %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
// If the node is a subscript expression and the expression list inside it is not empty // If the node is a subscript expression and the expression list inside it is not empty
if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs) if (node->type == NODE_MULTIDIM_SUBSCRIPT_EXPRESSION && node->rhs)
printf(")"); // Closing bracket of IDX() printf(")"); // Closing bracket of IDX()
// Generate writeback boilerplate for OUT fields // Generate writeback boilerplate for OUT fields
if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) { if (inside_kernel && node->type == NODE_COMPOUND_STATEMENT) {
for (int i = 0; i < num_symbols; ++i) { for (int i = 0; i < num_symbols; ++i) {
if (symbol_table[i].type_qualifier == OUT) { if (symbol_table[i].type_qualifier == OUT) {
printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier); printf("WRITE_OUT(%s%s, %s);\n", inout_name_prefix, symbol_table[i].identifier,
//printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n", inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier); symbol_table[i].identifier);
// printf("buffer.out[%s%s][IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z)] = %s;\n",
// inout_name_prefix, symbol_table[i].identifier, symbol_table[i].identifier);
} }
} }
} }
@@ -486,8 +490,8 @@ generate_preprocessed_structures(void)
for (int i = 0; i < num_symbols; ++i) { for (int i = 0; i < num_symbols; ++i) {
if (symbol_table[i].type_qualifier == PREPROCESSED) if (symbol_table[i].type_qualifier == PREPROCESSED)
printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n", symbol_table[i].identifier, printf("data.%s = preprocessed_%s(vertexIdx, buf[handle]);\n",
symbol_table[i].identifier); symbol_table[i].identifier, symbol_table[i].identifier);
} }
printf("return data;\n"); printf("return data;\n");
printf("}\n"); printf("}\n");

View File

@@ -41,7 +41,6 @@ extern "C" {
#include <stdlib.h> // size_t #include <stdlib.h> // size_t
#include <vector_types.h> // CUDA vector types (float4, etc) #include <vector_types.h> // CUDA vector types (float4, etc)
/* /*
* ============================================================================= * =============================================================================
* Flags for auto-optimization * Flags for auto-optimization
@@ -59,7 +58,6 @@ extern "C" {
#define NUM_ITERATIONS (10) #define NUM_ITERATIONS (10)
#define WARP_SIZE (32) #define WARP_SIZE (32)
/* /*
* ============================================================================= * =============================================================================
* Compile-time constants used during simulation (user definable) * Compile-time constants used during simulation (user definable)
@@ -75,7 +73,8 @@ extern "C" {
// L-prefix inherited from the old Astaroth, no idea what it means // L-prefix inherited from the old Astaroth, no idea what it means
// MV: L means a Logical switch variable, something having true or false value. // MV: L means a Logical switch variable, something having true or false value.
#define LFORCING (0) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL) // Note: forcing is disabled currently in the files generated by acc (compiler of our DSL)
#define LFORCING (0)
#define LINDUCTION (1) #define LINDUCTION (1)
#define LENTROPY (1) #define LENTROPY (1)
#define LTEMPERATURE (0) #define LTEMPERATURE (0)
@@ -258,28 +257,16 @@ typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
* Reduction types * Reduction types
* ============================================================================= * =============================================================================
*/ */
typedef enum { typedef enum { RTYPE_MAX, RTYPE_MIN, RTYPE_RMS, RTYPE_RMS_EXP, NUM_REDUCTION_TYPES } ReductionType;
RTYPE_MAX,
RTYPE_MIN,
RTYPE_RMS,
RTYPE_RMS_EXP,
NUM_REDUCTION_TYPES
} ReductionType;
/* /*
* ============================================================================= * =============================================================================
* Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH) * Definitions for the enums and structs for AcMeshInfo (DO NOT TOUCH)
* ============================================================================= * =============================================================================
*/ */
typedef enum { typedef enum { AC_FOR_INT_PARAM_TYPES(AC_GEN_ID), NUM_INT_PARAM_TYPES } AcIntParam;
AC_FOR_INT_PARAM_TYPES(AC_GEN_ID),
NUM_INT_PARAM_TYPES
} AcIntParam;
typedef enum { typedef enum { AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID), NUM_REAL_PARAM_TYPES } AcRealParam;
AC_FOR_REAL_PARAM_TYPES(AC_GEN_ID),
NUM_REAL_PARAM_TYPES
} AcRealParam;
extern const char* intparam_names[]; // Defined in astaroth.cu extern const char* intparam_names[]; // Defined in astaroth.cu
extern const char* realparam_names[]; // Defined in astaroth.cu extern const char* realparam_names[]; // Defined in astaroth.cu
@@ -294,9 +281,7 @@ typedef struct {
* Definitions for the enums and structs for AcMesh (DO NOT TOUCH) * Definitions for the enums and structs for AcMesh (DO NOT TOUCH)
* ============================================================================= * =============================================================================
*/ */
typedef enum { typedef enum { AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES } VertexBufferHandle;
AC_FOR_VTXBUF_HANDLES(AC_GEN_ID) NUM_VTXBUF_HANDLES
} VertexBufferHandle;
extern const char* vtxbuf_names[]; // Defined in astaroth.cu extern const char* vtxbuf_names[]; // Defined in astaroth.cu
@@ -316,22 +301,20 @@ typedef struct {
AcMeshInfo info; AcMeshInfo info;
} AcMesh; } AcMesh;
#define AC_VTXBUF_SIZE(mesh_info) \ #define AC_VTXBUF_SIZE(mesh_info) \
((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] * \ ((size_t)(mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my] * \
mesh_info.int_params[AC_mz])) mesh_info.int_params[AC_mz]))
#define AC_VTXBUF_SIZE_BYTES(mesh_info) \ #define AC_VTXBUF_SIZE_BYTES(mesh_info) (sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
(sizeof(AcReal) * AC_VTXBUF_SIZE(mesh_info))
#define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info) \ #define AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info) \
(mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * \ (mesh_info.int_params[AC_nx] * mesh_info.int_params[AC_ny] * mesh_info.int_params[AC_nz])
mesh_info.int_params[AC_nz])
#define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info) \ #define AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(mesh_info) \
(sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info)) (sizeof(AcReal) * AC_VTXBUF_COMPDOMAIN_SIZE(mesh_info))
#define AC_VTXBUF_IDX(i, j, k, mesh_info) \ #define AC_VTXBUF_IDX(i, j, k, mesh_info) \
((i) + (j)*mesh_info.int_params[AC_mx] + \ ((i) + (j)*mesh_info.int_params[AC_mx] + \
(k)*mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my]) (k)*mesh_info.int_params[AC_mx] * mesh_info.int_params[AC_my])
/* /*

View File

@@ -28,33 +28,24 @@
#include "errchk.h" #include "errchk.h"
#include "device.cuh" #include "device.cuh"
#include "math_utils.h" // sum for reductions #include "math_utils.h" // sum for reductions
#include "standalone/config_loader.h" // update_config #include "standalone/config_loader.h" // update_config
const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)}; const char* intparam_names[] = {AC_FOR_INT_PARAM_TYPES(AC_GEN_STR)};
const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)}; const char* realparam_names[] = {AC_FOR_REAL_PARAM_TYPES(AC_GEN_STR)};
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
static const int MAX_NUM_DEVICES = 32;
static const int MAX_NUM_DEVICES = 32; static int num_devices = 1;
static int num_devices = 1;
static Device devices[MAX_NUM_DEVICES] = {}; static Device devices[MAX_NUM_DEVICES] = {};
static Grid static Grid
createGrid(const AcMeshInfo& config) createGrid(const AcMeshInfo& config)
{ {
Grid grid; Grid grid;
grid.m = (int3) {
config.int_params[AC_mx],
config.int_params[AC_my],
config.int_params[AC_mz]
};
grid.n = (int3) { grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
config.int_params[AC_nx], grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
config.int_params[AC_ny],
config.int_params[AC_nz]
};
return grid; return grid;
} }
@@ -71,8 +62,7 @@ gridIdx(const Grid& grid, const int i, const int j, const int k)
static int3 static int3
gridIdx3d(const Grid& grid, const int idx) gridIdx3d(const Grid& grid, const int idx)
{ {
return (int3){idx % grid.m.x, return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
(idx % (grid.m.x * grid.m.y)) / grid.m.x,
idx / (grid.m.x * grid.m.y)}; idx / (grid.m.x * grid.m.y)};
} }
@@ -119,10 +109,12 @@ acInit(const AcMeshInfo& config)
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER); ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER); ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
printf("Grid m "); printInt3(grid.m); printf("\n"); // clang-format off
printf("Grid n "); printInt3(grid.n); printf("\n"); printf("Grid m "); printInt3(grid.m); printf("\n");
printf("Grid n "); printInt3(grid.n); printf("\n");
printf("Subrid m "); printInt3(subgrid.m); printf("\n"); printf("Subrid m "); printInt3(subgrid.m); printf("\n");
printf("Subrid n "); printInt3(subgrid.n); printf("\n"); printf("Subrid n "); printInt3(subgrid.n); printf("\n");
// clang-format on
// Initialize the devices // Initialize the devices
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
@@ -202,8 +194,10 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
*/ */
if (db.z >= da.z) { if (db.z >= da.z) {
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da); const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE const int3 da_local = (int3){
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); printf("\n"); da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
// printf("\n");
copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells); copyMeshToDevice(devices[i], STREAM_PRIMARY, host_mesh, da, da_local, copy_cells);
} }
printf("\n"); printf("\n");
@@ -236,8 +230,10 @@ acStoreWithOffset(const int3& src, const int num_vertices, AcMesh* host_mesh)
*/ */
if (db.z >= da.z) { if (db.z >= da.z) {
const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da); const int copy_cells = gridIdxx(subgrid, db) - gridIdxx(subgrid, da);
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE const int3 da_local = (int3){
// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local); printf("\n"); da.x, da.y, da.z - i * grid.n.z / num_devices}; // DECOMPOSITION OFFSET HERE
// printf("\t\tcopy %d cells from local index ", copy_cells); printInt3(da_local);
// printf("\n");
copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh); copyMeshToHost(devices[i], STREAM_PRIMARY, da_local, da, copy_cells, host_mesh);
} }
printf("\n"); printf("\n");
@@ -262,10 +258,9 @@ acStore(AcMesh* host_mesh)
AcResult AcResult
acIntegrateStep(const int& isubstep, const AcReal& dt) acIntegrateStep(const int& isubstep, const AcReal& dt)
{ {
const int3 start = (int3){STENCIL_ORDER/2, STENCIL_ORDER/2, STENCIL_ORDER/2}; const int3 start = (int3){STENCIL_ORDER / 2, STENCIL_ORDER / 2, STENCIL_ORDER / 2};
const int3 end = (int3){STENCIL_ORDER/2 + subgrid.n.x, const int3 end = (int3){STENCIL_ORDER / 2 + subgrid.n.x, STENCIL_ORDER / 2 + subgrid.n.y,
STENCIL_ORDER/2 + subgrid.n.y, STENCIL_ORDER / 2 + subgrid.n.z};
STENCIL_ORDER/2 + subgrid.n.z};
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt); rkStep(devices[i], STREAM_PRIMARY, isubstep, start, end, dt);
} }
@@ -278,121 +273,125 @@ acBoundcondStep(void)
{ {
acSynchronize(); acSynchronize();
if (num_devices == 1) { if (num_devices == 1) {
boundcondStep(devices[0], STREAM_PRIMARY, boundcondStep(devices[0], STREAM_PRIMARY, (int3){0, 0, 0},
(int3){0, 0, 0}, (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z}); (int3){subgrid.m.x, subgrid.m.y, subgrid.m.z});
} else { }
else {
// Local boundary conditions // Local boundary conditions
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, STENCIL_ORDER/2}; // DECOMPOSITION OFFSET HERE const int3 d0 = (int3){0, 0, STENCIL_ORDER / 2}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], STREAM_PRIMARY, d0, d1); boundcondStep(devices[i], STREAM_PRIMARY, d0, d1);
} }
/* /*
// ===MIIKKANOTE START========================================== // ===MIIKKANOTE START==========================================
%JP: The old way for computing boundary conditions conflicts with the %JP: The old way for computing boundary conditions conflicts with the
way we have to do things with multiple GPUs. way we have to do things with multiple GPUs.
The older approach relied on unified memory, which represented the whole The older approach relied on unified memory, which represented the whole
memory area as one huge mesh instead of several smaller ones. However, unified memory memory area as one huge mesh instead of several smaller ones. However, unified memory
in its current state is more meant for quick prototyping when performance is not an issue. in its current state is more meant for quick prototyping when performance is not an issue.
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
when managing the memory explicitly. than when managing the memory explicitly.
In this new approach, I have simplified the multi- and single-GPU layers significantly. In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown: Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids, respectively. contain the extents of the whole simulation domain and the decomposed grids,
To simplify things, we require that each GPU is assigned the same amount of respectively. To simplify things, we require that each GPU is assigned the same amount of
therefore each GPU in the node is assigned a "subgrid.m" -sized block of data work, therefore each GPU in the node is assigned a "subgrid.m" -sized block of data to
to work with. work with.
The whole simulation domain is decomposed with respect to the z dimension. The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices. contain (nx, ny, nz / num_devices) vertices.
A local index (i, j, k) in some subgrid can be mapped to the global grid with A local index (i, j, k) in some subgrid can be mapped to the global grid with
global idx = (i, j, k + device_id * subgrid.n.z) global idx = (i, j, k + device_id * subgrid.n.z)
Terminology: Terminology:
- Single-GPU function: a function defined on the single-GPU layer (device.cu) - Single-GPU function: a function defined on the single-GPU layer (device.cu)
Changes required to this commented code block: Changes required to this commented code block:
- The thread block dimensions (tpb) are no longer passed to the kernel here but in device.cu - The thread block dimensions (tpb) are no longer passed to the kernel here but in
instead. Same holds for any complex index calculations. Instead, the local coordinates device.cu instead. Same holds for any complex index calculations. Instead, the local
should be passed as an int3 type without having to consider how the data is actually coordinates should be passed as an int3 type without having to consider how the data is
laid out in device memory actually laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
of type "Device" which should be passed to single-GPU functions. In this file, all devices handle of type "Device" which should be passed to single-GPU functions. In this file, all
are stored in a global array "devices[num_devices]". devices are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we - Every single-GPU function is executed asynchronously by default such that we
can optimize Astaroth by executing memory transactions concurrently with computation. can optimize Astaroth by executing memory transactions concurrently with
Therefore a StreamType should be passed as a parameter to single-GPU functions. computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
as a parameter and commands executing in different streams can be processed as a parameter and commands executing in different streams can be processed
in parallel/concurrently. in parallel/concurrently.
Note on periodic boundaries (might be helpful when implementing other boundary conditions): Note on periodic boundaries (might be helpful when implementing other boundary conditions):
With multiple GPUs, periodic boundary conditions applied on indices ranging from With multiple GPUs, periodic boundary conditions applied on indices ranging from
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2) (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without having on a single device are "local", in the sense that they can be computed without
to exchange data with neighboring GPUs. Special care is needed only for transferring having to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use the data to the front and back
we solve the local boundaries first, and then just exchange the front and back plates here, we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so in a "ring", like so
device_id device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0) (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
// ======MIIKKANOTE END========================================== // ======MIIKKANOTE END==========================================
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
moved into device.cu, function boundCondStep() moved into device.cu, function
In astaroth.cu, we use acBoundcondStep() boundCondStep() In astaroth.cu, we use acBoundcondStep() just to distribute the work and
just to distribute the work and manage manage communication between GPUs.
communication between GPUs.
printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, double(best_time) / NUM_ITERATIONS); printf("Boundconds best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y,
best_dims.z, double(best_time) / NUM_ITERATIONS);
exit(0); exit(0);
#else #else
const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices); const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
const int3 start = (int3){0, 0, device_id * depth}; const int3 start = (int3){0, 0, device_id * depth};
const int3 end = (int3){mesh_info.int_params[AC_mx], const int3 end = (int3){mesh_info.int_params[AC_mx],
mesh_info.int_params[AC_my], mesh_info.int_params[AC_my],
min((device_id+1) * depth, mesh_info.int_params[AC_mz])}; min((device_id+1) * depth, mesh_info.int_params[AC_mz])};
const dim3 tpb(8,2,8); const dim3 tpb(8,2,8);
// TODO uses the default stream currently // TODO uses the default stream currently
if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
wedge_boundconds(0, tpb, start, end, d_buffer); wedge_boundconds(0, tpb, start, end, d_buffer);
} else { } else {
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
periodic_boundconds(0, tpb, start, end, d_buffer.in[i]); periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
*/ */
// Exchange halos // Exchange halos
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER/2; const int num_vertices = subgrid.m.x * subgrid.m.y * STENCIL_ORDER / 2;
// ...|ooooxxx|... -> xxx|ooooooo|... // ...|ooooxxx|... -> xxx|ooooooo|...
{ {
const int3 src = (int3) {0, 0, subgrid.n.z}; const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3) {0, 0, 0}; const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src, devices[(i+1) % num_devices], dst, num_vertices); copyMeshDeviceToDevice(devices[i], STREAM_PRIMARY, src,
devices[(i + 1) % num_devices], dst, num_vertices);
} }
// ...|ooooooo|xxx <- ...|xxxoooo|... // ...|ooooooo|xxx <- ...|xxxoooo|...
{ {
const int3 src = (int3) {0, 0, STENCIL_ORDER/2}; const int3 src = (int3){0, 0, STENCIL_ORDER / 2};
const int3 dst = (int3) {0, 0, STENCIL_ORDER/2 + subgrid.n.z}; const int3 dst = (int3){0, 0, STENCIL_ORDER / 2 + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i+1) % num_devices], STREAM_PRIMARY, src, devices[i], dst, num_vertices); copyMeshDeviceToDevice(devices[(i + 1) % num_devices], STREAM_PRIMARY, src,
devices[i], dst, num_vertices);
} }
} }
} }
@@ -427,26 +426,28 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) { if (rtype == RTYPE_MAX) {
res = max(res, results[i]); res = max(res, results[i]);
} else if (rtype == RTYPE_MIN) { }
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]); res = min(res, results[i]);
} else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { }
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
res = sum(res, results[i]); res = sum(res, results[i]);
} else { }
else {
ERROR("Invalid rtype"); ERROR("Invalid rtype");
} }
} }
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z); const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res); res = sqrt(inv_n * res);
} }
return res; return res;
} }
AcReal AcReal
acReduceScal(const ReductionType& rtype, acReduceScal(const ReductionType& rtype, const VertexBufferHandle& vtxbuffer_handle)
const VertexBufferHandle& vtxbuffer_handle)
{ {
AcReal results[num_devices]; AcReal results[num_devices];
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
@@ -457,8 +458,8 @@ acReduceScal(const ReductionType& rtype,
} }
AcReal AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a, const VertexBufferHandle& b,
const VertexBufferHandle& b, const VertexBufferHandle& c) const VertexBufferHandle& c)
{ {
AcReal results[num_devices]; AcReal results[num_devices];
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {

View File

@@ -36,7 +36,7 @@ typedef struct {
__constant__ AcMeshInfo d_mesh_info; __constant__ AcMeshInfo d_mesh_info;
__constant__ int3 d_multigpu_offset; __constant__ int3 d_multigpu_offset;
__constant__ Grid globalGrid; __constant__ Grid globalGrid;
#define DCONST_INT(X) (d_mesh_info.int_params[X]) #define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X]) #define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
@@ -76,46 +76,46 @@ printDeviceInfo(const Device device)
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
printf(" Stream processors: %d\n", props.multiProcessorCount); printf(" Stream processors: %d\n", props.multiProcessorCount);
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio); printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
printf(" Compute mode: %d\n", (int)props.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0 printf(
" Compute mode: %d\n",
(int)props
.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
// Memory // Memory
printf(" Global memory\n"); printf(" Global memory\n");
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000)); printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth); printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
printf(" Peak Memory Bandwidth (GiB/s): %f\n", printf(" Peak Memory Bandwidth (GiB/s): %f\n",
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / 2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
(8. * 1024. * 1024. * 1024.));
printf(" ECC enabled: %d\n", props.ECCEnabled); printf(" ECC enabled: %d\n", props.ECCEnabled);
// Memory usage // Memory usage
size_t free_bytes, total_bytes; size_t free_bytes, total_bytes;
cudaMemGetInfo(&free_bytes, &total_bytes); cudaMemGetInfo(&free_bytes, &total_bytes);
const size_t used_bytes = total_bytes - free_bytes; const size_t used_bytes = total_bytes - free_bytes;
printf(" Total global mem: %.2f GiB\n", printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
props.totalGlobalMem / (1024.0 * 1024 * 1024));
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024)); printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory free (GiB): %.2f\n", printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
free_bytes / (1024.0 * 1024 * 1024)); printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory total (GiB): %.2f\n",
total_bytes / (1024.0 * 1024 * 1024));
printf(" Caches\n"); printf(" Caches\n");
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported); printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported); printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024)); printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
printf(" Shared mem per block: %ld KiB\n", printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
props.sharedMemPerBlock / (1024));
printf(" Other\n"); printf(" Other\n");
printf(" Warp size: %d\n", props.warpSize); printf(" Warp size: %d\n", props.warpSize);
// printf(" Single to double perf. ratio: %dx\n", // printf(" Single to double perf. ratio: %dx\n",
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
// versions // versions
printf(" Stream priorities supported: %d\n", printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
props.streamPrioritiesSupported);
printf("--------------------------------------------------\n"); printf("--------------------------------------------------\n");
return AC_SUCCESS; return AC_SUCCESS;
} }
static __global__ void dummy_kernel(void) {} static __global__ void
dummy_kernel(void)
{
}
AcResult AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle) createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
@@ -124,10 +124,10 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
cudaDeviceReset(); cudaDeviceReset();
// Create Device // Create Device
struct device_s* device = (struct device_s*) malloc(sizeof(*device)); struct device_s* device = (struct device_s*)malloc(sizeof(*device));
ERRCHK_ALWAYS(device); ERRCHK_ALWAYS(device);
device->id = id; device->id = id;
device->local_config = device_config; device->local_config = device_config;
// Check that the code was compiled for the proper GPU architecture // Check that the code was compiled for the proper GPU architecture
@@ -150,15 +150,14 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes)); ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes)); ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
} }
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_scratchpad, ERRCHK_CUDA_ALWAYS(
AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config))); cudaMalloc(&device->reduce_scratchpad, AC_VTXBUF_COMPDOMAIN_SIZE_BYTES(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal))); ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
// Device constants // Device constants
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
cudaMemcpyHostToDevice)); cudaMemcpyHostToDevice));
// Multi-GPU offset. This is used to compute globalVertexIdx. // Multi-GPU offset. This is used to compute globalVertexIdx.
// Might be better to calculate this in astaroth.cu instead of here, s.t. // Might be better to calculate this in astaroth.cu instead of here, s.t.
// everything related to the decomposition is limited to the multi-GPU layer // everything related to the decomposition is limited to the multi-GPU layer
@@ -166,7 +165,6 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset, ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice)); sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));
printf("Created device %d (%p)\n", device->id, device); printf("Created device %d (%p)\n", device->id, device);
*device_handle = device; *device_handle = device;
return AC_SUCCESS; return AC_SUCCESS;
@@ -211,53 +209,44 @@ reduceScal(const Device device, const StreamType stream_type, const ReductionTyp
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
const int3 start = (int3) {device->local_config.int_params[AC_nx_min], const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min], device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min] device->local_config.int_params[AC_nz_min]};
};
const int3 end = (int3) {device->local_config.int_params[AC_nx_max], const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max], device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max] device->local_config.int_params[AC_nz_max]};
};
*result = reduce_scal(device->streams[stream_type], rtype, *result = reduce_scal(device->streams[stream_type], rtype, start, end,
start, end, device->vba.in[vtxbuf_handle], device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
device->reduce_scratchpad, device->reduce_result); device->reduce_result);
return AC_SUCCESS; return AC_SUCCESS;
} }
AcResult AcResult
reduceVec(const Device device, const StreamType stream_type, reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
const ReductionType rtype, const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf2, AcReal* result)
const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2,
AcReal* result)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
const int3 start = (int3) {device->local_config.int_params[AC_nx_min], const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min], device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min] device->local_config.int_params[AC_nz_min]};
};
const int3 end = (int3) {device->local_config.int_params[AC_nx_max], const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max], device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max] device->local_config.int_params[AC_nz_max]};
};
*result = reduce_vec(device->streams[stream_type], rtype, start, end, *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf0], device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->vba.in[vtxbuf1], device->reduce_scratchpad, device->reduce_result);
device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS; return AC_SUCCESS;
} }
AcResult AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number, rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
const int3& start, const int3& end, const AcReal dt) const int3& end, const AcReal dt)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba); rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
@@ -270,65 +259,62 @@ synchronize(const Device device, const StreamType stream_type)
cudaSetDevice(device->id); cudaSetDevice(device->id);
if (stream_type == STREAM_ALL) { if (stream_type == STREAM_ALL) {
cudaDeviceSynchronize(); cudaDeviceSynchronize();
} else { }
else {
cudaStreamSynchronize(device->streams[stream_type]); cudaStreamSynchronize(device->streams[stream_type]);
} }
return AC_SUCCESS; return AC_SUCCESS;
} }
static AcResult static AcResult
loadWithOffset(const Device device, const StreamType stream_type, loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
const AcReal* src, const size_t bytes, AcReal* dst) const size_t bytes, AcReal* dst)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, ERRCHK_CUDA(
device->streams[stream_type])); cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
return AC_SUCCESS; return AC_SUCCESS;
} }
static AcResult static AcResult
storeWithOffset(const Device device, const StreamType stream_type, storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
const AcReal* src, const size_t bytes, AcReal* dst) const size_t bytes, AcReal* dst)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
ERRCHK_CUDA(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, ERRCHK_CUDA(
device->streams[stream_type])); cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
return AC_SUCCESS; return AC_SUCCESS;
} }
AcResult AcResult
copyMeshToDevice(const Device device, const StreamType stream_type, copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
const AcMesh& host_mesh, const int3& src, const int3& dst, const int3& src, const int3& dst, const int num_vertices)
const int num_vertices)
{ {
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info); const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config); const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], num_vertices * sizeof(AcReal), loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
&device->vba.in[i][dst_idx]); num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
} }
return AC_SUCCESS; return AC_SUCCESS;
} }
AcResult AcResult
copyMeshToHost(const Device device, const StreamType stream_type, copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
const int3& src, const int3& dst, const int num_vertices, const int num_vertices, AcMesh* host_mesh)
AcMesh* host_mesh)
{ {
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config); const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info); const size_t dst_idx = AC_VTXBUF_IDX(dst.x, dst.y, dst.z, host_mesh->info);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
storeWithOffset(device, stream_type, &device->vba.in[i][src_idx], storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
num_vertices * sizeof(AcReal), num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
&host_mesh->vertex_buffer[i][dst_idx]);
} }
return AC_SUCCESS; return AC_SUCCESS;
} }
AcResult AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
const int3& src, Device dst_device, const int3& dst, Device dst_device, const int3& dst, const int num_vertices)
const int num_vertices)
{ {
cudaSetDevice(src_device->id); cudaSetDevice(src_device->id);
const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config); const size_t src_idx = AC_VTXBUF_IDX(src.x, src.y, src.z, src_device->local_config);
@@ -348,7 +334,7 @@ swapBuffers(const Device device)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i]; AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i]; device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp; device->vba.out[i] = tmp;
} }
@@ -364,8 +350,8 @@ loadDeviceConstant(const Device device, const AcIntParam param, const int value)
// Therefore we have to obfuscate the code a bit and compute the offset address before // Therefore we have to obfuscate the code a bit and compute the offset address before
// invoking cudaMemcpyToSymbol. // invoking cudaMemcpyToSymbol.
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info; const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), ERRCHK_CUDA_ALWAYS(
offset, cudaMemcpyHostToDevice)); cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
return AC_SUCCESS; return AC_SUCCESS;
} }
@@ -374,8 +360,8 @@ loadDeviceConstant(const Device device, const AcRealParam param, const AcReal va
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), ERRCHK_CUDA_ALWAYS(
offset, cudaMemcpyHostToDevice)); cudaMemcpyToSymbol(d_mesh_info, &value, sizeof(value), offset, cudaMemcpyHostToDevice));
return AC_SUCCESS; return AC_SUCCESS;
} }
@@ -383,7 +369,7 @@ AcResult
loadGlobalGrid(const Device device, const Grid grid) loadGlobalGrid(const Device device, const Grid grid)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), ERRCHK_CUDA_ALWAYS(
0, cudaMemcpyHostToDevice)); cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
return AC_SUCCESS; return AC_SUCCESS;
} }

View File

@@ -27,12 +27,14 @@
#pragma once #pragma once
#include "astaroth.h" #include "astaroth.h"
// clang-format off
typedef enum { typedef enum {
STREAM_PRIMARY, STREAM_PRIMARY,
STREAM_SECONDARY, STREAM_SECONDARY,
NUM_STREAM_TYPES, NUM_STREAM_TYPES,
STREAM_ALL STREAM_ALL
} StreamType; } StreamType;
// clang-format on
typedef struct { typedef struct {
int3 m; int3 m;
@@ -52,20 +54,17 @@ AcResult createDevice(const int id, const AcMeshInfo device_config, Device* devi
AcResult destroyDevice(Device device); AcResult destroyDevice(Device device);
/** */ /** */
AcResult boundcondStep(const Device device, const StreamType stream_type, AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start,
const int3& start, const int3& end); const int3& end);
/** */ /** */
AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype, AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result); const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */ /** */
AcResult reduceVec(const Device device, const StreamType stream_type, AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
const ReductionType rtype, const VertexBufferHandle vec0, const VertexBufferHandle vec1,
const VertexBufferHandle vec0, const VertexBufferHandle vec2, AcReal* result);
const VertexBufferHandle vec1,
const VertexBufferHandle vec2,
AcReal* result);
/** */ /** */
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number, AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
@@ -81,9 +80,8 @@ AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
const int num_vertices); const int num_vertices);
/** */ /** */
AcResult copyMeshToHost(const Device device, const StreamType stream_type, AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src,
const int3& src, const int3& dst, const int num_vertices, const int3& dst, const int num_vertices, AcMesh* host_mesh);
AcMesh* host_mesh);
/** */ /** */
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx, AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,

View File

@@ -24,7 +24,7 @@
* Detailed info. * Detailed info.
* *
*/ */
#pragma once #pragma once
__global__ void __global__ void
kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf) kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
@@ -38,7 +38,7 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z) if (i_dst >= end.x || j_dst >= end.y || k_dst >= end.z)
return; return;
//if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz)) // if (i_dst >= DCONST_INT(AC_mx) || j_dst >= DCONST_INT(AC_my) || k_dst >= DCONST_INT(AC_mz))
// return; // return;
// If destination index is inside the computational domain, return since // If destination index is inside the computational domain, return since
@@ -69,15 +69,15 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
j_src += DCONST_INT(AC_ny_min); j_src += DCONST_INT(AC_ny_min);
k_src += DCONST_INT(AC_nz_min); k_src += DCONST_INT(AC_nz_min);
const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src); const int src_idx = DEVICE_VTXBUF_IDX(i_src, j_src, k_src);
const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst); const int dst_idx = DEVICE_VTXBUF_IDX(i_dst, j_dst, k_dst);
vtxbuf[dst_idx] = vtxbuf[src_idx]; vtxbuf[dst_idx] = vtxbuf[src_idx];
} }
void void
periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf) periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
{ {
const dim3 tpb(8,2,8); const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y), (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z)); (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
@@ -89,7 +89,6 @@ periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& en
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
#include <assert.h> #include <assert.h>
static __device__ __forceinline__ int static __device__ __forceinline__ int
IDX(const int i) IDX(const int i)
{ {
@@ -120,14 +119,12 @@ create_rotz(const AcReal radians)
return mat; return mat;
} }
#if AC_DOUBLE_PRECISION == 0 #if AC_DOUBLE_PRECISION == 0
#define sin __sinf #define sin __sinf
#define cos __cosf #define cos __cosf
#define exp __expf #define exp __expf
#define rsqrt rsqrtf // hardware reciprocal sqrt #define rsqrt rsqrtf // hardware reciprocal sqrt
#endif // AC_DOUBLE_PRECISION == 0 #endif // AC_DOUBLE_PRECISION == 0
/* /*
typedef struct { typedef struct {
@@ -155,12 +152,11 @@ first_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
#elif STENCIL_ORDER == 6 #elif STENCIL_ORDER == 6
const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0}; const AcReal coefficients[] = {0, 3.0 / 4.0, -3.0 / 20.0, 1.0 / 60.0};
#elif STENCIL_ORDER == 8 #elif STENCIL_ORDER == 8
const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, const AcReal coefficients[] = {0, 4.0 / 5.0, -1.0 / 5.0, 4.0 / 105.0, -1.0 / 280.0};
-1.0 / 280.0};
#endif #endif
#define MID (STENCIL_ORDER / 2) #define MID (STENCIL_ORDER / 2)
AcReal res = 0; AcReal res = 0;
#pragma unroll #pragma unroll
for (int i = 1; i <= MID; ++i) for (int i = 1; i <= MID; ++i)
@@ -175,17 +171,15 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
#if STENCIL_ORDER == 2 #if STENCIL_ORDER == 2
const AcReal coefficients[] = {-2., 1.}; const AcReal coefficients[] = {-2., 1.};
#elif STENCIL_ORDER == 4 #elif STENCIL_ORDER == 4
const AcReal coefficients[] = {-5.0/2.0, 4.0/3.0, -1.0/12.0}; const AcReal coefficients[] = {-5.0 / 2.0, 4.0 / 3.0, -1.0 / 12.0};
#elif STENCIL_ORDER == 6 #elif STENCIL_ORDER == 6
const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, const AcReal coefficients[] = {-49.0 / 18.0, 3.0 / 2.0, -3.0 / 20.0, 1.0 / 90.0};
1.0 / 90.0};
#elif STENCIL_ORDER == 8 #elif STENCIL_ORDER == 8
const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, const AcReal coefficients[] = {-205.0 / 72.0, 8.0 / 5.0, -1.0 / 5.0, 8.0 / 315.0, -1.0 / 560.0};
8.0 / 315.0, -1.0 / 560.0};
#endif #endif
#define MID (STENCIL_ORDER / 2) #define MID (STENCIL_ORDER / 2)
AcReal res = coefficients[0] * pencil[MID]; AcReal res = coefficients[0] * pencil[MID];
#pragma unroll #pragma unroll
for (int i = 1; i <= MID; ++i) for (int i = 1; i <= MID; ++i)
@@ -196,31 +190,29 @@ second_derivative(const AcReal* __restrict__ pencil, const AcReal inv_ds)
/** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */ /** inv_ds: inverted mesh spacing f.ex. 1. / mesh.int_params[AC_dsx] */
static __device__ __forceinline__ AcReal static __device__ __forceinline__ AcReal
cross_derivative(const AcReal* __restrict__ pencil_a, cross_derivative(const AcReal* __restrict__ pencil_a, const AcReal* __restrict__ pencil_b,
const AcReal* __restrict__ pencil_b, const AcReal inv_ds_a, const AcReal inv_ds_a, const AcReal inv_ds_b)
const AcReal inv_ds_b)
{ {
#if STENCIL_ORDER == 2 #if STENCIL_ORDER == 2
const AcReal coefficients[] = {0, 1.0 / 4.0}; const AcReal coefficients[] = {0, 1.0 / 4.0};
#elif STENCIL_ORDER == 4 #elif STENCIL_ORDER == 4
const AcReal coefficients[] = {0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders const AcReal coefficients[] = {
0, 1.0 / 32.0, 1.0 / 64.0}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6 #elif STENCIL_ORDER == 6
const AcReal fac = (1. / 720.); const AcReal fac = (1. / 720.);
const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, const AcReal coefficients[] = {0.0 * fac, 270.0 * fac, -27.0 * fac, 2.0 * fac};
2.0 * fac};
#elif STENCIL_ORDER == 8 #elif STENCIL_ORDER == 8
const AcReal fac = (1. / 20160.); const AcReal fac = (1. / 20160.);
const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, const AcReal coefficients[] = {0.0 * fac, 8064. * fac, -1008. * fac, 128. * fac, -9. * fac};
128. * fac, -9. * fac};
#endif #endif
#define MID (STENCIL_ORDER / 2) #define MID (STENCIL_ORDER / 2)
AcReal res = AcReal(0.); AcReal res = AcReal(0.);
#pragma unroll #pragma unroll
for (int i = 1; i <= MID; ++i) { for (int i = 1; i <= MID; ++i) {
res += coefficients[i] * (pencil_a[MID + i] + pencil_a[MID - i] - res += coefficients[i] *
pencil_b[MID + i] - pencil_b[MID - i]); (pencil_a[MID + i] + pencil_a[MID - i] - pencil_b[MID + i] - pencil_b[MID - i]);
} }
return res * inv_ds_a * inv_ds_b; return res * inv_ds_a * inv_ds_b;
} }
@@ -231,7 +223,8 @@ derx(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)]; pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsx)); return first_derivative(pencil, DCONST_REAL(AC_inv_dsx));
} }
@@ -242,7 +235,8 @@ derxx(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, vertexIdx.z)]; pencil[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsx)); return second_derivative(pencil, DCONST_REAL(AC_inv_dsx));
} }
@@ -262,8 +256,7 @@ derxy(const int3 vertexIdx, const AcReal* __restrict__ arr)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2,
vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)]; vertexIdx.y + STENCIL_ORDER / 2 - offset, vertexIdx.z)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsy));
DCONST_REAL(AC_inv_dsy));
} }
static __device__ __forceinline__ AcReal static __device__ __forceinline__ AcReal
@@ -281,8 +274,7 @@ derxz(const int3 vertexIdx, const AcReal* __restrict__ arr)
pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y, pencil_b[offset] = arr[IDX(vertexIdx.x + offset - STENCIL_ORDER / 2, vertexIdx.y,
vertexIdx.z + STENCIL_ORDER / 2 - offset)]; vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsx), DCONST_REAL(AC_inv_dsz));
DCONST_REAL(AC_inv_dsz));
} }
static __device__ __forceinline__ AcReal static __device__ __forceinline__ AcReal
@@ -291,7 +283,8 @@ dery(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)]; pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsy)); return first_derivative(pencil, DCONST_REAL(AC_inv_dsy));
} }
@@ -302,7 +295,8 @@ deryy(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, vertexIdx.z)]; pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsy)); return second_derivative(pencil, DCONST_REAL(AC_inv_dsy));
} }
@@ -322,8 +316,7 @@ deryz(const int3 vertexIdx, const AcReal* __restrict__ arr)
pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2, pencil_b[offset] = arr[IDX(vertexIdx.x, vertexIdx.y + offset - STENCIL_ORDER / 2,
vertexIdx.z + STENCIL_ORDER / 2 - offset)]; vertexIdx.z + STENCIL_ORDER / 2 - offset)];
return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy), return cross_derivative(pencil_a, pencil_b, DCONST_REAL(AC_inv_dsy), DCONST_REAL(AC_inv_dsz));
DCONST_REAL(AC_inv_dsz));
} }
static __device__ __forceinline__ AcReal static __device__ __forceinline__ AcReal
@@ -332,7 +325,8 @@ derz(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)]; pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
return first_derivative(pencil, DCONST_REAL(AC_inv_dsz)); return first_derivative(pencil, DCONST_REAL(AC_inv_dsz));
} }
@@ -343,7 +337,8 @@ derzz(const int3 vertexIdx, const AcReal* __restrict__ arr)
AcReal pencil[STENCIL_ORDER + 1]; AcReal pencil[STENCIL_ORDER + 1];
#pragma unroll #pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z + offset - STENCIL_ORDER / 2)]; pencil[offset] = arr[IDX(vertexIdx.x, vertexIdx.y,
vertexIdx.z + offset - STENCIL_ORDER / 2)];
return second_derivative(pencil, DCONST_REAL(AC_inv_dsz)); return second_derivative(pencil, DCONST_REAL(AC_inv_dsz));
} }
@@ -401,8 +396,7 @@ operator-(const AcReal3& a)
return (AcReal3){-a.x, -a.y, -a.z}; return (AcReal3){-a.x, -a.y, -a.z};
} }
static __host__ __device__ __forceinline__ AcReal3 static __host__ __device__ __forceinline__ AcReal3 operator*(const AcReal a, const AcReal3& b)
operator*(const AcReal a, const AcReal3& b)
{ {
return (AcReal3){a * b.x, a * b.y, a * b.z}; return (AcReal3){a * b.x, a * b.y, a * b.z};
} }
@@ -443,7 +437,6 @@ is_valid(const AcReal3& a)
return is_valid(a.x) && is_valid(a.y) && is_valid(a.z); return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
} }
/* /*
* ============================================================================= * =============================================================================
* Level 1 (Stencil Processing Stage) * Level 1 (Stencil Processing Stage)
@@ -476,8 +469,7 @@ laplace_vec(const AcReal3Data& vec)
static __device__ __forceinline__ AcReal3 static __device__ __forceinline__ AcReal3
curl(const AcReal3Data& vec) curl(const AcReal3Data& vec)
{ {
return (AcReal3){gradient(vec.z).y - gradient(vec.y).z, return (AcReal3){gradient(vec.z).y - gradient(vec.y).z, gradient(vec.x).z - gradient(vec.z).x,
gradient(vec.x).z - gradient(vec.z).x,
gradient(vec.y).x - gradient(vec.x).y}; gradient(vec.y).x - gradient(vec.x).y};
} }
@@ -520,7 +512,7 @@ contract(const AcMatrix& mat)
{ {
AcReal res = 0; AcReal res = 0;
#pragma unroll #pragma unroll
for (int i = 0; i < 3; ++i) for (int i = 0; i < 3; ++i)
res += dot(mat.row[i], mat.row[i]); res += dot(mat.row[i], mat.row[i]);
@@ -558,12 +550,13 @@ __constant__ AcReal forcing_phi;
static __device__ __forceinline__ AcReal3 static __device__ __forceinline__ AcReal3
forcing(const int i, const int j, const int k) forcing(const int i, const int j, const int k)
{ {
#define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx)) #define DOMAIN_SIZE_X (DCONST_INT(AC_nx) * DCONST_REAL(AC_dsx))
#define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy)) #define DOMAIN_SIZE_Y (DCONST_INT(AC_ny) * DCONST_REAL(AC_dsy))
#define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz)) #define DOMAIN_SIZE_Z (DCONST_INT(AC_nz) * DCONST_REAL(AC_dsz))
const AcReal3 k_vec = (AcReal3){(i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X, const AcReal3 k_vec = (AcReal3){
(j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y, (i - DCONST_INT(AC_nx_min)) * DCONST_REAL(AC_dsx) - AcReal(.5) * DOMAIN_SIZE_X,
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z}; (j - DCONST_INT(AC_ny_min)) * DCONST_REAL(AC_dsy) - AcReal(.5) * DOMAIN_SIZE_Y,
(k - DCONST_INT(AC_nz_min)) * DCONST_REAL(AC_dsz) - AcReal(.5) * DOMAIN_SIZE_Z};
AcReal inv_len = reciprocal_len(k_vec); AcReal inv_len = reciprocal_len(k_vec);
if (isnan(inv_len) || isinf(inv_len)) if (isnan(inv_len) || isinf(inv_len))
inv_len = 0; inv_len = 0;
@@ -571,46 +564,41 @@ forcing(const int i, const int j, const int k)
inv_len = 2; inv_len = 2;
const AcReal k_dot_x = dot(k_vec, forcing_vec); const AcReal k_dot_x = dot(k_vec, forcing_vec);
const AcReal waves = cos(k_dot_x)*cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi); const AcReal waves = cos(k_dot_x) * cos(forcing_phi) - sin(k_dot_x) * sin(forcing_phi);
return inv_len * inv_len * waves * forcing_vec; return inv_len * inv_len * waves * forcing_vec;
} }
// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values
// Note: LNT0 and LNRHO0 must be set very carefully: if the magnitude is different that other values in the mesh, then we will inherently lose precision // in the mesh, then we will inherently lose precision
#define LNT0 (AcReal(0.0)) #define LNT0 (AcReal(0.0))
#define LNRHO0 (AcReal(0.0)) #define LNRHO0 (AcReal(0.0))
#define H_CONST (AcReal(0.0)) #define H_CONST (AcReal(0.0))
#define C_CONST (AcReal(0.0)) #define C_CONST (AcReal(0.0))
template <int step_number> template <int step_number>
static __device__ __forceinline__ AcReal static __device__ __forceinline__ AcReal
rk3_integrate(const AcReal state_previous, const AcReal state_current, rk3_integrate(const AcReal state_previous, const AcReal state_current, const AcReal rate_of_change,
const AcReal rate_of_change, const AcReal dt) const AcReal dt)
{ {
// Williamson (1980) // Williamson (1980)
const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)}; const AcReal alpha[] = {0, AcReal(.0), AcReal(-5. / 9.), AcReal(-153. / 128.)};
const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), const AcReal beta[] = {0, AcReal(1. / 3.), AcReal(15. / 16.), AcReal(8. / 15.)};
AcReal(8. / 15.)};
// Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds" // Note the indexing: +1 to avoid an unnecessary warning about "out-of-bounds"
// access (when accessing beta[step_number-1] even when step_number >= 1) // access (when accessing beta[step_number-1] even when step_number >= 1)
switch (step_number) { switch (step_number) {
case 0: case 0:
return state_current + beta[step_number + 1] * rate_of_change * dt; return state_current + beta[step_number + 1] * rate_of_change * dt;
case 1: // Fallthrough case 1: // Fallthrough
case 2: case 2:
return state_current + return state_current +
beta[step_number + 1] * beta[step_number + 1] * (alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) *
(alpha[step_number + 1] * (AcReal(1.) / beta[step_number]) * (state_current - state_previous) +
(state_current - state_previous) + rate_of_change * dt);
rate_of_change * dt); default:
default: return NAN;
return NAN;
} }
} }
/* /*
@@ -646,13 +634,14 @@ static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current, rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
const AcReal3 rate_of_change, const AcReal dt) const AcReal3 rate_of_change, const AcReal dt)
{ {
return (AcReal3) { rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt), return (AcReal3){
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt), rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)}; rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
} }
#define rk3(state_previous, state_current, rate_of_change, dt)\ #define rk3(state_previous, state_current, rate_of_change, dt) \
rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt) rk3_integrate<step_number>(state_previous, value(state_current), rate_of_change, dt)
/* /*
template <int step_number> template <int step_number>
@@ -708,9 +697,8 @@ read_out(const int idx, AcReal* __restrict__ field[], const int handle)
static __device__ AcReal3 static __device__ AcReal3
read_out(const int idx, AcReal* __restrict__ field[], const int3 handle) read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
{ {
return (AcReal3) { read_out(idx, field, handle.x), return (AcReal3){read_out(idx, field, handle.x), read_out(idx, field, handle.y),
read_out(idx, field, handle.y), read_out(idx, field, handle.z)};
read_out(idx, field, handle.z) };
} }
#define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value)) #define WRITE_OUT(handle, value) (write(buffer.out, handle, idx, value))
@@ -718,29 +706,28 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
#define READ_OUT(handle) (read_out(idx, buffer.out, handle)) #define READ_OUT(handle) (read_out(idx, buffer.out, handle))
// also write for clarity here also, not for the DSL // also write for clarity here also, not for the DSL
//#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the function //#define WRITE(HANDLE) (write(idx, ...)) s.t. we don't have to insert boilerplat in the mid of the
// function
#define GEN_KERNEL_PARAM_BOILERPLATE \ #define GEN_KERNEL_PARAM_BOILERPLATE const int3 start, const int3 end, VertexBufferArray buffer
const int3 start, const int3 end, VertexBufferArray buffer
#define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \ #define GEN_KERNEL_BUILTIN_VARIABLES_BOILERPLATE() \
const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x,\ const int3 vertexIdx = (int3){threadIdx.x + blockIdx.x * blockDim.x + start.x, \
threadIdx.y + blockIdx.y * blockDim.y + start.y,\ threadIdx.y + blockIdx.y * blockDim.y + start.y, \
threadIdx.z + blockIdx.z * blockDim.z + start.z};\ threadIdx.z + blockIdx.z * blockDim.z + start.z}; \
const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \ const int3 globalVertexIdx = (int3){d_multigpu_offset.x + vertexIdx.x, \
d_multigpu_offset.y + vertexIdx.y, \ d_multigpu_offset.y + vertexIdx.y, \
d_multigpu_offset.z + vertexIdx.z}; \ d_multigpu_offset.z + vertexIdx.z}; \
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)\ if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
return;\ return; \
\ \
\ assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) && \
assert(vertexIdx.x < DCONST_INT(AC_nx_max) && vertexIdx.y < DCONST_INT(AC_ny_max) &&\ vertexIdx.z < DCONST_INT(AC_nz_max)); \
vertexIdx.z < DCONST_INT(AC_nz_max));\ \
\ assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) && \
assert(vertexIdx.x >= DCONST_INT(AC_nx_min) && vertexIdx.y >= DCONST_INT(AC_ny_min) &&\ vertexIdx.z >= DCONST_INT(AC_nz_min)); \
vertexIdx.z >= DCONST_INT(AC_nz_min));\ \
\ const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
#include "stencil_process.cuh" #include "stencil_process.cuh"
@@ -757,33 +744,31 @@ randf(void)
} }
AcResult AcResult
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end, rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start,
const AcReal dt, VertexBufferArray* buffer) const int3& end, const AcReal dt, VertexBufferArray* buffer)
{ {
const dim3 tpb(32, 1, 4); const dim3 tpb(32, 1, 4);
/////////////////// Forcing /////////////////// Forcing
#if LFORCING #if LFORCING
const AcReal ff_scale = AcReal(.2); const AcReal ff_scale = AcReal(.2);
static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0}; static AcReal3 ff = ff_scale * (AcReal3){1, 0, 0};
const AcReal radians = randf() * AcReal(2*M_PI) / 360 / 8; const AcReal radians = randf() * AcReal(2 * M_PI) / 360 / 8;
const AcMatrix rotz = create_rotz(radians); const AcMatrix rotz = create_rotz(radians);
ff = mul(rotz, ff); ff = mul(rotz, ff);
cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream); cudaMemcpyToSymbolAsync(forcing_vec, &ff, sizeof(ff), 0, cudaMemcpyHostToDevice, stream);
const AcReal ff_phi = AcReal(M_PI);//AcReal(2 * M_PI) * randf(); const AcReal ff_phi = AcReal(M_PI); // AcReal(2 * M_PI) * randf();
cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice, stream); cudaMemcpyToSymbolAsync(forcing_phi, &ff_phi, sizeof(ff_phi), 0, cudaMemcpyHostToDevice,
#endif // LFORCING stream);
#endif // LFORCING
////////////////////////// //////////////////////////
const int nx = end.x - start.x; const int nx = end.x - start.x;
const int ny = end.y - start.y; const int ny = end.y - start.y;
const int nz = end.z - start.z; const int nz = end.z - start.z;
const dim3 bpg( const dim3 bpg((unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(ny / AcReal(tpb.y)),
(unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(nz / AcReal(tpb.z)));
(unsigned int)ceil(ny / AcReal(tpb.y)),
(unsigned int)ceil(nz / AcReal(tpb.z)));
if (step_number == 0) if (step_number == 0)
solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt); solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
@@ -796,7 +781,6 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
return AC_SUCCESS; return AC_SUCCESS;
} }
////////////////REDUCE/////////////////////////// ////////////////REDUCE///////////////////////////
#include "src/core/math_utils.h" // is_power_of_two #include "src/core/math_utils.h" // is_power_of_two
@@ -848,22 +832,19 @@ template <FilterFunc filter>
__global__ void __global__ void
kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst) kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end, AcReal* dst)
{ {
const int3 src_idx = (int3) { const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.x + threadIdx.x + blockIdx.x * blockDim.x, start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.y + threadIdx.y + blockIdx.y * blockDim.y, start.z + threadIdx.z + blockIdx.z * blockDim.z};
start.z + threadIdx.z + blockIdx.z * blockDim.z
};
const int nx = end.x - start.x; const int nx = end.x - start.x;
const int ny = end.y - start.y; const int ny = end.y - start.y;
const int nz = end.z - start.z; //MV: Added this because it was undefined const int nz = end.z - start.z; // MV: Added this because it was undefined
const int3 dst_idx = (int3) { const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.x + blockIdx.x * blockDim.x, threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.y + blockIdx.y * blockDim.y, threadIdx.z + blockIdx.z * blockDim.z};
threadIdx.z + blockIdx.z * blockDim.z
};
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max)); assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
src_idx.z < DCONST_INT(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz); assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz); assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
@@ -872,31 +853,27 @@ kernel_filter(const __restrict__ AcReal* src, const int3 start, const int3 end,
template <FilterFuncVec filter> template <FilterFuncVec filter>
__global__ void __global__ void
kernel_filter_vec(const __restrict__ AcReal* src0, kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
const __restrict__ AcReal* src1, const __restrict__ AcReal* src2, const int3 start, const int3 end, AcReal* dst)
const __restrict__ AcReal* src2,
const int3 start, const int3 end, AcReal* dst)
{ {
const int3 src_idx = (int3) { const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
start.x + threadIdx.x + blockIdx.x * blockDim.x, start.y + threadIdx.y + blockIdx.y * blockDim.y,
start.y + threadIdx.y + blockIdx.y * blockDim.y, start.z + threadIdx.z + blockIdx.z * blockDim.z};
start.z + threadIdx.z + blockIdx.z * blockDim.z
};
const int nx = end.x - start.x; const int nx = end.x - start.x;
const int ny = end.y - start.y; const int ny = end.y - start.y;
const int nz = end.z - start.z; //MV: Added this because it was undefined const int nz = end.z - start.z; // MV: Added this because it was undefined
const int3 dst_idx = (int3) { const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
threadIdx.x + blockIdx.x * blockDim.x, threadIdx.y + blockIdx.y * blockDim.y,
threadIdx.y + blockIdx.y * blockDim.y, threadIdx.z + blockIdx.z * blockDim.z};
threadIdx.z + blockIdx.z * blockDim.z
};
assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) && src_idx.z < DCONST_INT(AC_nz_max)); assert(src_idx.x < DCONST_INT(AC_nx_max) && src_idx.y < DCONST_INT(AC_ny_max) &&
src_idx.z < DCONST_INT(AC_nz_max));
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz); assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz); assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]); dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
} }
template <ReduceFunc reduce> template <ReduceFunc reduce>
@@ -908,7 +885,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
extern __shared__ AcReal smem[]; extern __shared__ AcReal smem[];
if (idx < num_elems) { if (idx < num_elems) {
smem[threadIdx.x] = scratchpad[idx]; smem[threadIdx.x] = scratchpad[idx];
} else { }
else {
smem[threadIdx.x] = NAN; smem[threadIdx.x] = NAN;
} }
__syncthreads(); __syncthreads();
@@ -930,9 +908,8 @@ kernel_reduce(AcReal* scratchpad, const int num_elems)
template <ReduceFunc reduce> template <ReduceFunc reduce>
__global__ void __global__ void
kernel_reduce_block(const __restrict__ AcReal* scratchpad, kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
const int num_blocks, const int block_size, const int block_size, AcReal* result)
AcReal* result)
{ {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx != 0) { if (idx != 0) {
@@ -946,23 +923,19 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad,
*result = res; *result = res;
} }
AcReal AcReal
reduce_scal(const cudaStream_t stream, const ReductionType rtype, reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
const int3& start, const int3& end, const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{ {
const unsigned nx = end.x - start.x; const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y; const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z; const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz; const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1); const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter( const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(nx / AcReal(tpb_filter.x)), (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)), (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
(unsigned int)ceil(nz / AcReal(tpb_filter.z))
);
const int tpb_reduce = 128; const int tpb_reduce = 128;
const int bpg_reduce = num_elems / tpb_reduce; const int bpg_reduce = num_elems / tpb_reduce;
@@ -974,22 +947,38 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
ERRCHK(nx * ny * nz % 2 == 0); ERRCHK(nx * ny * nz % 2 == 0);
if (rtype == RTYPE_MAX) { if (rtype == RTYPE_MAX) {
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad); kernel_filter<dvalue>
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
} else if (rtype == RTYPE_MIN) { scratchpad, num_elems);
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad); kernel_reduce_block<dmax>
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); }
} else if (rtype == RTYPE_RMS) { else if (rtype == RTYPE_MIN) {
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad); kernel_filter<dvalue>
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
} else if (rtype == RTYPE_RMS_EXP) { scratchpad, num_elems);
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad); kernel_reduce_block<dmin>
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); }
} else { else if (rtype == RTYPE_RMS) {
kernel_filter<dsquared>
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
scratchpad, num_elems);
kernel_reduce_block<dsum>
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
}
else if (rtype == RTYPE_RMS_EXP) {
kernel_filter<dexp_squared>
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
scratchpad, num_elems);
kernel_reduce_block<dsum>
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
}
else {
ERROR("Unrecognized rtype"); ERROR("Unrecognized rtype");
} }
AcReal result; AcReal result;
@@ -998,22 +987,19 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype,
} }
AcReal AcReal
reduce_vec(const cudaStream_t stream, const ReductionType rtype, reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
const int3& start, const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* reduce_result)
AcReal* scratchpad, AcReal* reduce_result)
{ {
const unsigned nx = end.x - start.x; const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y; const unsigned ny = end.y - start.y;
const unsigned nz = end.z - start.z; const unsigned nz = end.z - start.z;
const unsigned num_elems = nx * ny * nz; const unsigned num_elems = nx * ny * nz;
const dim3 tpb_filter(32, 4, 1); const dim3 tpb_filter(32, 4, 1);
const dim3 bpg_filter( const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
(unsigned int)ceil(nx / AcReal(tpb_filter.x)), (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
(unsigned int)ceil(ny / AcReal(tpb_filter.y)), (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
(unsigned int)ceil(nz / AcReal(tpb_filter.z))
);
const int tpb_reduce = 128; const int tpb_reduce = 128;
const int bpg_reduce = num_elems / tpb_reduce; const int bpg_reduce = num_elems / tpb_reduce;
@@ -1025,22 +1011,38 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype,
ERRCHK(nx * ny * nz % 2 == 0); ERRCHK(nx * ny * nz % 2 == 0);
if (rtype == RTYPE_MAX) { if (rtype == RTYPE_MAX) {
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
} else if (rtype == RTYPE_MIN) { scratchpad, num_elems);
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); kernel_reduce_block<dmax>
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); }
} else if (rtype == RTYPE_RMS) { else if (rtype == RTYPE_MIN) {
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
} else if (rtype == RTYPE_RMS_EXP) { scratchpad, num_elems);
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad); kernel_reduce_block<dmin>
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems); <<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result); }
} else { else if (rtype == RTYPE_RMS) {
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
scratchpad, num_elems);
kernel_reduce_block<dsum>
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
}
else if (rtype == RTYPE_RMS_EXP) {
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
scratchpad, num_elems);
kernel_reduce_block<dsum>
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
}
else {
ERROR("Unrecognized rtype"); ERROR("Unrecognized rtype");
} }
AcReal result; AcReal result;

File diff suppressed because it is too large Load Diff

View File

@@ -35,10 +35,10 @@
#include "model/model_rk3.h" #include "model/model_rk3.h"
#include "timer_hires.h" #include "timer_hires.h"
#include <vector> #include "src/core/errchk.h"
#include <algorithm> #include <algorithm>
#include <math.h> #include <math.h>
#include "src/core/errchk.h" #include <vector>
static bool static bool
smaller_than(const double& a, const double& b) smaller_than(const double& a, const double& b)
@@ -47,7 +47,8 @@ smaller_than(const double& a, const double& b)
} }
static int static int
write_runningtimes(const char* path, const int n, const double min, const double max, const double median, const double perc) write_runningtimes(const char* path, const int n, const double min, const double max,
const double median, const double perc)
{ {
FILE* fp; FILE* fp;
fp = fopen(path, "a"); fp = fopen(path, "a");
@@ -80,7 +81,8 @@ int
run_benchmark(void) run_benchmark(void)
{ {
char runningtime_path[256]; char runningtime_path[256];
sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep"); sprintf(runningtime_path, "%s_%s_runningtimes.out", AC_DOUBLE_PRECISION ? "double" : "float",
GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
FILE* fp; FILE* fp;
fp = fopen(runningtime_path, "w"); fp = fopen(runningtime_path, "w");
@@ -88,13 +90,14 @@ run_benchmark(void)
if (fp != NULL) { if (fp != NULL) {
fprintf(fp, "n, min, max, median, perc\n"); fprintf(fp, "n, min, max, median, perc\n");
fclose(fp); fclose(fp);
} else { }
else {
return EXIT_FAILURE; return EXIT_FAILURE;
} }
#define N_STEP_SIZE (128) #define N_STEP_SIZE (128)
#define MAX_MESH_DIM (128) #define MAX_MESH_DIM (128)
#define NUM_ITERS (100) #define NUM_ITERS (100)
for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) { for (int n = N_STEP_SIZE; n <= MAX_MESH_DIM; n += N_STEP_SIZE) {
/* Parse configs */ /* Parse configs */
AcMeshInfo mesh_info; AcMeshInfo mesh_info;
@@ -113,7 +116,6 @@ run_benchmark(void)
std::vector<double> results; std::vector<double> results;
results.reserve(NUM_ITERS); results.reserve(NUM_ITERS);
// Warmup // Warmup
for (int i = 0; i < 10; ++i) { for (int i = 0; i < 10; ++i) {
acIntegrate(0); acIntegrate(0);
@@ -124,28 +126,35 @@ run_benchmark(void)
for (int i = 0; i < NUM_ITERS; ++i) { for (int i = 0; i < NUM_ITERS; ++i) {
timer_reset(&t); timer_reset(&t);
#if GEN_BENCHMARK_RK3 == 1 #if GEN_BENCHMARK_RK3 == 1
acIntegrateStep(2, FLT_EPSILON); acIntegrateStep(2, FLT_EPSILON);
#else // GEN_BENCHMARK_FULL #else // GEN_BENCHMARK_FULL
//const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); // const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
const AcReal dt = AcReal(1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info); const AcReal dt = AcReal(
1e-2); // TODO adaptive timestep //host_timestep(umax, mesh_info);
acIntegrate(dt); acIntegrate(dt);
#endif #endif
acSynchronize(); acSynchronize();
const double ms_elapsed = timer_diff_nsec(t) / 1e6; const double ms_elapsed = timer_diff_nsec(t) / 1e6;
results.push_back(ms_elapsed); results.push_back(ms_elapsed);
} }
#define NTH_PERCENTILE (0.95) #define NTH_PERCENTILE (0.95)
std::sort(results.begin(), results.end(), smaller_than); std::sort(results.begin(), results.end(), smaller_than);
write_runningtimes(runningtime_path, n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]); write_runningtimes(runningtime_path, n, results[0], results[results.size() - 1],
results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]);
char percentile_path[256]; char percentile_path[256];
sprintf(percentile_path, "%d_%s_%s_percentiles.out", n, AC_DOUBLE_PRECISION ? "double" : "float", GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep"); sprintf(percentile_path, "%d_%s_%s_percentiles.out", n,
AC_DOUBLE_PRECISION ? "double" : "float",
GEN_BENCHMARK_RK3 ? "rk3substep" : "fullstep");
write_percentiles(percentile_path, NUM_ITERS, results); write_percentiles(percentile_path, NUM_ITERS, results);
printf("%s running time %g ms, (%dth percentile, nx = %d) \n", GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep", double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100), mesh_info.int_params[AC_nx]); printf("%s running time %g ms, (%dth percentile, nx = %d) \n",
GEN_BENCHMARK_RK3 ? "RK3 step" : "Fullstep",
double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100),
mesh_info.int_params[AC_nx]);
acStore(mesh); acStore(mesh);
acQuit(); acQuit();
@@ -225,7 +234,8 @@ run_benchmark(void)
return 0; return 0;
} }
#else //////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA #else
//////////////////////////////////////////////////////////////////////////GENERATE_BENCHMARK_DATA
@@ -290,8 +300,8 @@ run_benchmark(void)
#define NTH_PERCENTILE (0.95) #define NTH_PERCENTILE (0.95)
std::sort(results.begin(), results.end(), smaller_than); std::sort(results.begin(), results.end(), smaller_than);
write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)], results[int(NTH_PERCENTILE * NUM_ITERS)]); write_result(n, results[0], results[results.size()-1], results[int(0.5 * NUM_ITERS)],
write_percentiles(n, NUM_ITERS, results); results[int(NTH_PERCENTILE * NUM_ITERS)]); write_percentiles(n, NUM_ITERS, results);
} }
return 0; return 0;

View File

@@ -79,8 +79,7 @@ parse_config(const char* path, AcMeshInfo* config)
int idx = -1; int idx = -1;
if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0) if ((idx = find_str(keyword, intparam_names, NUM_INT_PARAM_TYPES)) >= 0)
config->int_params[idx] = atoi(value); config->int_params[idx] = atoi(value);
else if ((idx = find_str(keyword, realparam_names, else if ((idx = find_str(keyword, realparam_names, NUM_REAL_PARAM_TYPES)) >= 0)
NUM_REAL_PARAM_TYPES)) >= 0)
config->real_params[idx] = AcReal(atof(value)); config->real_params[idx] = AcReal(atof(value));
} }
@@ -92,32 +91,30 @@ update_config(AcMeshInfo* config)
{ {
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER; config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
///////////// PAD TEST ///////////// PAD TEST
//config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
///////////// PAD TEST ///////////// PAD TEST
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
// Bounds for the computational domain, i.e. nx_min <= i < nx_max // Bounds for the computational domain, i.e. nx_min <= i < nx_max
config->int_params[AC_nx_min] = STENCIL_ORDER / 2; config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
config->int_params[AC_nx];
config->int_params[AC_ny_min] = STENCIL_ORDER / 2; config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
config->int_params[AC_ny_max] = config->int_params[AC_ny] + config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
STENCIL_ORDER / 2;
config->int_params[AC_nz_min] = STENCIL_ORDER / 2; config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
config->int_params[AC_nz_max] = config->int_params[AC_nz] + config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;
STENCIL_ORDER / 2;
// Spacing // Spacing
config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx]; config->real_params[AC_inv_dsx] = AcReal(1.) / config->real_params[AC_dsx];
config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy]; config->real_params[AC_inv_dsy] = AcReal(1.) / config->real_params[AC_dsy];
config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz]; config->real_params[AC_inv_dsz] = AcReal(1.) / config->real_params[AC_dsz];
config->real_params[AC_dsmin] = min(config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz])); config->real_params[AC_dsmin] = min(
config->real_params[AC_dsx], min(config->real_params[AC_dsy], config->real_params[AC_dsz]));
// Real grid coordanates (DEFINE FOR GRID WITH THE GHOST ZONES) // Real grid coordanates (DEFINE FOR GRID WITH THE GHOST ZONES)
config->real_params[AC_xlen] = config->real_params[AC_dsx]*config->int_params[AC_mx]; config->real_params[AC_xlen] = config->real_params[AC_dsx] * config->int_params[AC_mx];
config->real_params[AC_ylen] = config->real_params[AC_dsy]*config->int_params[AC_my]; config->real_params[AC_ylen] = config->real_params[AC_dsy] * config->int_params[AC_my];
config->real_params[AC_zlen] = config->real_params[AC_dsz]*config->int_params[AC_mz]; config->real_params[AC_zlen] = config->real_params[AC_dsz] * config->int_params[AC_mz];
config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen]; config->real_params[AC_xorig] = AcReal(.5) * config->real_params[AC_xlen];
config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen]; config->real_params[AC_yorig] = AcReal(.5) * config->real_params[AC_ylen];
@@ -125,35 +122,35 @@ update_config(AcMeshInfo* config)
/* Additional helper params */ /* Additional helper params */
// Int helpers // Int helpers
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
config->int_params[AC_my]; config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
config->int_params[AC_ny];
config->int_params[AC_nxyz] = config->int_params[AC_nxy] *
config->int_params[AC_nz];
// Real helpers // Real helpers
config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] * config->real_params[AC_cs2_sound] = config->real_params[AC_cs_sound] *
config->real_params[AC_cs_sound]; config->real_params[AC_cs_sound];
config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] / config->real_params[AC_gamma]; config->real_params[AC_cv_sound] = config->real_params[AC_cp_sound] /
config->real_params[AC_gamma];
AcReal G_CONST_CGS = AcReal(6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module AcReal G_CONST_CGS = AcReal(
AcReal M_sun = AcReal(1.989e33); // g solar mass 6.674e-8); // g/cm3/s GGS definition //TODO define in a separate module
AcReal M_sun = AcReal(1.989e33); // g solar mass
config->real_params[AC_M_star] = config->real_params[AC_M_star]*M_sun / config->real_params[AC_M_star] = config->real_params[AC_M_star] * M_sun /
( (config->real_params[AC_unit_length]* ((config->real_params[AC_unit_length] *
config->real_params[AC_unit_length]* config->real_params[AC_unit_length] *
config->real_params[AC_unit_length]) * config->real_params[AC_unit_length]) *
config->real_params[AC_unit_density] ) ; config->real_params[AC_unit_density]);
config->real_params[AC_G_CONST] = G_CONST_CGS / config->real_params[AC_G_CONST] = G_CONST_CGS / ((config->real_params[AC_unit_velocity] *
( (config->real_params[AC_unit_velocity]*config->real_params[AC_unit_velocity]) / config->real_params[AC_unit_velocity]) /
(config->real_params[AC_unit_density] *config->real_params[AC_unit_length]) ) ; (config->real_params[AC_unit_density] *
config->real_params[AC_unit_length]));
config->real_params[AC_GM_star] = config->real_params[AC_M_star]*config->real_params[AC_G_CONST];
config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2)*config->real_params[AC_GM_star]));
config->real_params[AC_GM_star] = config->real_params[AC_M_star] *
config->real_params[AC_G_CONST];
config->real_params[AC_sq2GM_star] = AcReal(sqrt(AcReal(2) * config->real_params[AC_GM_star]));
const bool print_config = true; const bool print_config = true;
if (print_config) { if (print_config) {

View File

@@ -50,12 +50,12 @@ static const int window_bpp = 32; // Bits per pixel
SDL_Surface* surfaces[NUM_VTXBUF_HANDLES]; SDL_Surface* surfaces[NUM_VTXBUF_HANDLES];
static int datasurface_width = -1; static int datasurface_width = -1;
static int datasurface_height = -1; static int datasurface_height = -1;
static int k_slice = 0; static int k_slice = 0;
static int k_slice_max = 0; static int k_slice_max = 0;
// Colors // Colors
static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255}; static SDL_Color color_bg = (SDL_Color){30, 30, 35, 255};
static const int num_tiles = NUM_VTXBUF_HANDLES + 1; static const int num_tiles = NUM_VTXBUF_HANDLES + 1;
static const int tiles_per_row = 3; static const int tiles_per_row = 3;
/* /*
@@ -82,10 +82,9 @@ static Camera camera = (Camera){(float2){.0f, .0f}, 1.f};
static inline vec4 static inline vec4
project_ortho(const float2& pos, const float2& bbox, const float2& wdims) project_ortho(const float2& pos, const float2& bbox, const float2& wdims)
{ {
const vec4 rect = (vec4){ const vec4 rect = (vec4){camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x,
camera.scale * (pos.x - camera.pos.x) + 0.5f * wdims.x, camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y,
camera.scale * (pos.y - camera.pos.y) + 0.5f * wdims.y, camera.scale * bbox.x, camera.scale * bbox.y};
camera.scale * bbox.x, camera.scale * bbox.y};
return rect; return rect;
} }
@@ -103,13 +102,12 @@ renderer_init(const int& mx, const int& my)
SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS); SDL_InitSubSystem(SDL_INIT_VIDEO | SDL_INIT_EVENTS);
// Setup window // Setup window
window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
SDL_WINDOWPOS_UNDEFINED, window_width, window_width, window_height, SDL_WINDOW_SHOWN);
window_height, SDL_WINDOW_SHOWN);
// Setup SDL renderer // Setup SDL renderer
renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED); renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
//SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP); // SDL_SetWindowFullscreen(window, SDL_WINDOW_FULLSCREEN_DESKTOP);
SDL_GetWindowSize(window, &window_width, &window_height); SDL_GetWindowSize(window, &window_width, &window_height);
SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1"); // Linear filtering
@@ -118,24 +116,24 @@ renderer_init(const int& mx, const int& my)
datasurface_height = my; datasurface_height = my;
// vec drawing uses the surface of the first component, no memory issues here // vec drawing uses the surface of the first component, no memory issues here
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
surfaces[i] = SDL_CreateRGBSurfaceWithFormat( surfaces[i] = SDL_CreateRGBSurfaceWithFormat(0, datasurface_width, datasurface_height,
0, datasurface_width, datasurface_height, window_bpp, window_bpp, SDL_PIXELFORMAT_RGBA8888);
SDL_PIXELFORMAT_RGBA8888);
camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width, camera.pos = (float2){.5f * tiles_per_row * datasurface_width - .5f * datasurface_width,
-.5f * (num_tiles / tiles_per_row) * datasurface_height + .5f * datasurface_height}; -.5f * (num_tiles / tiles_per_row) * datasurface_height +
.5f * datasurface_height};
camera.scale = min(window_width / float(datasurface_width * tiles_per_row), camera.scale = min(window_width / float(datasurface_width * tiles_per_row),
window_height / float(datasurface_height * (num_tiles/tiles_per_row))); window_height / float(datasurface_height * (num_tiles / tiles_per_row)));
SDL_RendererInfo renderer_info; SDL_RendererInfo renderer_info;
SDL_GetRendererInfo(renderer, &renderer_info); SDL_GetRendererInfo(renderer, &renderer_info);
printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width, renderer_info.max_texture_height); printf("SDL renderer max texture dims: (%d, %d)\n", renderer_info.max_texture_width,
renderer_info.max_texture_height);
return 0; return 0;
} }
static int static int
set_pixel(const int& i, const int& j, const uint32_t& color, set_pixel(const int& i, const int& j, const uint32_t& color, SDL_Surface* surface)
SDL_Surface* surface)
{ {
uint32_t* pixels = (uint32_t*)surface->pixels; uint32_t* pixels = (uint32_t*)surface->pixels;
pixels[i + j * surface->w] = color; pixels[i + j * surface->w] = color;
@@ -143,22 +141,21 @@ set_pixel(const int& i, const int& j, const uint32_t& color,
} }
static int static int
draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer, const int& tile)
const int& tile)
{ {
const float xoffset = (tile % tiles_per_row) * datasurface_width; const float xoffset = (tile % tiles_per_row) * datasurface_width;
const float yoffset = - (tile / tiles_per_row) * datasurface_height; const float yoffset = -(tile / tiles_per_row) * datasurface_height;
/* /*
const float max = float(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer)); const float max = float(model_reduce_scal(mesh, RTYPE_MAX, vertex_buffer));
const float min = float(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer)); const float min = float(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer));
*/ */
const float max = float(acReduceScal(RTYPE_MAX, vertex_buffer)); const float max = float(acReduceScal(RTYPE_MAX, vertex_buffer));
const float min = float(acReduceScal(RTYPE_MIN, vertex_buffer)); const float min = float(acReduceScal(RTYPE_MIN, vertex_buffer));
const float range = fabsf(max - min); const float range = fabsf(max - min);
const float mid = max - .5f * range; const float mid = max - .5f * range;
const int k = k_slice; //mesh.info.int_params[AC_mz] / 2; const int k = k_slice; // mesh.info.int_params[AC_mz] / 2;
for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) { for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) { for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
@@ -166,29 +163,23 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info); const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
const uint8_t shade = (uint8_t)( const uint8_t shade = (uint8_t)(
255.f * 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) / range);
(fabsf(float(mesh.vertex_buffer[vertex_buffer][idx]) - mid)) /
range);
uint8_t color[4] = {0, 0, 0, 255}; uint8_t color[4] = {0, 0, 0, 255};
color[tile % 3] = shade; color[tile % 3] = shade;
const uint32_t mapped_color = SDL_MapRGBA( const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer]->format, color[0],
surfaces[vertex_buffer]->format, color[0], color[1], color[2], color[1], color[2], color[3]);
color[3]);
set_pixel(i, j, mapped_color, surfaces[vertex_buffer]); set_pixel(i, j, mapped_color, surfaces[vertex_buffer]);
} }
} }
const float2 pos = (float2){xoffset, yoffset}; const float2 pos = (float2){xoffset, yoffset};
const float2 bbox = (float2){.5f * datasurface_width, const float2 bbox = (float2){.5f * datasurface_width, .5f * datasurface_height};
.5f * datasurface_height};
const float2 wsize = (float2){float(window_width), float(window_height)}; const float2 wsize = (float2){float(window_width), float(window_height)};
const vec4 rectf = project_ortho(pos, bbox, wsize); const vec4 rectf = project_ortho(pos, bbox, wsize);
SDL_Rect rect = (SDL_Rect){ SDL_Rect rect = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h), int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer]);
surfaces[vertex_buffer]);
SDL_RenderCopy(renderer, tex, NULL, &rect); SDL_RenderCopy(renderer, tex, NULL, &rect);
SDL_DestroyTexture(tex); SDL_DestroyTexture(tex);
@@ -196,14 +187,12 @@ draw_vertex_buffer(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer,
} }
static int static int
draw_vertex_buffer_vec(const AcMesh& mesh, draw_vertex_buffer_vec(const AcMesh& mesh, const VertexBufferHandle& vertex_buffer_a,
const VertexBufferHandle& vertex_buffer_a,
const VertexBufferHandle& vertex_buffer_b, const VertexBufferHandle& vertex_buffer_b,
const VertexBufferHandle& vertex_buffer_c, const VertexBufferHandle& vertex_buffer_c, const int& tile)
const int& tile)
{ {
const float xoffset = (tile % tiles_per_row) * datasurface_width; const float xoffset = (tile % tiles_per_row) * datasurface_width;
const float yoffset = - (tile / tiles_per_row) * datasurface_height; const float yoffset = -(tile / tiles_per_row) * datasurface_height;
/* /*
const float maxx = float( const float maxx = float(
@@ -215,52 +204,41 @@ draw_vertex_buffer_vec(const AcMesh& mesh,
min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b), min(model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_b),
model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c)))); model_reduce_scal(mesh, RTYPE_MIN, vertex_buffer_c))));
*/ */
const float maxx = float( const float maxx = float(max(
max(acReduceScal(RTYPE_MAX, vertex_buffer_a), acReduceScal(RTYPE_MAX, vertex_buffer_a),
max(acReduceScal(RTYPE_MAX, vertex_buffer_b), max(acReduceScal(RTYPE_MAX, vertex_buffer_b), acReduceScal(RTYPE_MAX, vertex_buffer_c))));
acReduceScal(RTYPE_MAX, vertex_buffer_c)))); const float minn = float(min(
const float minn = float( acReduceScal(RTYPE_MIN, vertex_buffer_a),
min(acReduceScal(RTYPE_MIN, vertex_buffer_a), min(acReduceScal(RTYPE_MIN, vertex_buffer_b), acReduceScal(RTYPE_MIN, vertex_buffer_c))));
min(acReduceScal(RTYPE_MIN, vertex_buffer_b),
acReduceScal(RTYPE_MIN, vertex_buffer_c))));
const float range = fabsf(maxx - minn); const float range = fabsf(maxx - minn);
const float mid = maxx - .5f * range; const float mid = maxx - .5f * range;
const int k = k_slice; //mesh.info.int_params[AC_mz] / 2; const int k = k_slice; // mesh.info.int_params[AC_mz] / 2;
for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) { for (int j = 0; j < mesh.info.int_params[AC_my]; ++j) {
for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) { for (int i = 0; i < mesh.info.int_params[AC_mx]; ++i) {
ERRCHK(i < datasurface_width && j < datasurface_height); ERRCHK(i < datasurface_width && j < datasurface_height);
const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info); const int idx = AC_VTXBUF_IDX(i, j, k, mesh.info);
const uint8_t r = (uint8_t)( const uint8_t r = (uint8_t)(
255.f * 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) / range);
(fabsf(float(mesh.vertex_buffer[vertex_buffer_a][idx]) - mid)) /
range);
const uint8_t g = (uint8_t)( const uint8_t g = (uint8_t)(
255.f * 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) / range);
(fabsf(float(mesh.vertex_buffer[vertex_buffer_b][idx]) - mid)) /
range);
const uint8_t b = (uint8_t)( const uint8_t b = (uint8_t)(
255.f * 255.f * (fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) / range);
(fabsf(float(mesh.vertex_buffer[vertex_buffer_c][idx]) - mid)) / const uint32_t mapped_color = SDL_MapRGBA(surfaces[vertex_buffer_a]->format, r, g, b,
range); 255);
const uint32_t mapped_color = SDL_MapRGBA(
surfaces[vertex_buffer_a]->format, r, g, b, 255);
set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]); set_pixel(i, j, mapped_color, surfaces[vertex_buffer_a]);
} }
} }
const float2 pos = (float2){xoffset, yoffset}; const float2 pos = (float2){xoffset, yoffset};
const float2 bbox = (float2){.5f * datasurface_width, const float2 bbox = (float2){.5f * datasurface_width, .5f * datasurface_height};
.5f * datasurface_height};
const float2 wsize = (float2){float(window_width), float(window_height)}; const float2 wsize = (float2){float(window_width), float(window_height)};
const vec4 rectf = project_ortho(pos, bbox, wsize); const vec4 rectf = project_ortho(pos, bbox, wsize);
SDL_Rect rect = (SDL_Rect){ SDL_Rect rect = (SDL_Rect){int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h),
int(rectf.x - rectf.w), int(wsize.y - rectf.y - rectf.h), int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
int(ceil(2.f * rectf.w)), int(ceil(2.f * rectf.h))};
SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, SDL_Texture* tex = SDL_CreateTextureFromSurface(renderer, surfaces[vertex_buffer_a]);
surfaces[vertex_buffer_a]);
SDL_RenderCopy(renderer, tex, NULL, &rect); SDL_RenderCopy(renderer, tex, NULL, &rect);
SDL_DestroyTexture(tex); SDL_DestroyTexture(tex);
@@ -272,13 +250,11 @@ renderer_draw(const AcMesh& mesh)
{ {
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
draw_vertex_buffer(mesh, VertexBufferHandle(i), i); draw_vertex_buffer(mesh, VertexBufferHandle(i), i);
draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, draw_vertex_buffer_vec(mesh, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, NUM_VTXBUF_HANDLES);
NUM_VTXBUF_HANDLES);
// Drawing done, present // Drawing done, present
SDL_RenderPresent(renderer); SDL_RenderPresent(renderer);
SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b, SDL_SetRenderDrawColor(renderer, color_bg.r, color_bg.g, color_bg.b, color_bg.a);
color_bg.a);
SDL_RenderClear(renderer); SDL_RenderClear(renderer);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
@@ -404,14 +380,14 @@ run_renderer(void)
/* Step the simulation */ /* Step the simulation */
#if 1 #if 1
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
VTXBUF_UUZ);
const AcReal dt = host_timestep(umax, mesh_info); const AcReal dt = host_timestep(umax, mesh_info);
acIntegrate(dt); acIntegrate(dt);
#else #else
ModelMesh* model_mesh = modelmesh_create(mesh->info); ModelMesh* model_mesh = modelmesh_create(mesh->info);
const AcReal umax = AcReal(model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ)); const AcReal umax = AcReal(
const AcReal dt = host_timestep(umax, mesh_info); model_reduce_vec(*model_mesh, RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ));
const AcReal dt = host_timestep(umax, mesh_info);
acmesh_to_modelmesh(*mesh, model_mesh); acmesh_to_modelmesh(*mesh, model_mesh);
model_rk3(dt, model_mesh); model_rk3(dt, model_mesh);
modelmesh_to_acmesh(*model_mesh, mesh); modelmesh_to_acmesh(*model_mesh, mesh);
@@ -425,7 +401,7 @@ run_renderer(void)
/* Render */ /* Render */
const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f; const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
if (timer_diff_sec >= desired_frame_time) { if (timer_diff_sec >= desired_frame_time) {
//acStore(mesh); // acStore(mesh);
const int num_vertices = mesh->info.int_params[AC_mxy]; const int num_vertices = mesh->info.int_params[AC_mxy];
const int3 dst = (int3){0, 0, k_slice}; const int3 dst = (int3){0, 0, k_slice};
acStoreWithOffset(dst, num_vertices, mesh); acStoreWithOffset(dst, num_vertices, mesh);

View File

@@ -60,23 +60,23 @@ print_diagnostics(const AcMesh& mesh, const int& step, const AcReal& dt)
} }
*/ */
//Write all setting info into a separate ascii file. This is done to guarantee // Write all setting info into a separate ascii file. This is done to guarantee
//that we have the data specifi information in the thing, even though in // that we have the data specifi information in the thing, even though in
//principle these things are in the astaroth.conf. // principle these things are in the astaroth.conf.
static inline static inline void
void write_mesh_info(const AcMeshInfo* config) write_mesh_info(const AcMeshInfo* config)
{ {
FILE* infotxt; FILE* infotxt;
infotxt = fopen("purge.sh","w"); infotxt = fopen("purge.sh", "w");
fprintf(infotxt, "#!/bin/bash\n"); fprintf(infotxt, "#!/bin/bash\n");
fprintf(infotxt, "rm *.list *.mesh *.ts purge.sh\n"); fprintf(infotxt, "rm *.list *.mesh *.ts purge.sh\n");
fclose(infotxt); fclose(infotxt);
infotxt = fopen("mesh_info.list","w"); infotxt = fopen("mesh_info.list", "w");
//Total grid dimensions // Total grid dimensions
fprintf(infotxt, "int AC_mx %i \n", config->int_params[AC_mx]); fprintf(infotxt, "int AC_mx %i \n", config->int_params[AC_mx]);
fprintf(infotxt, "int AC_my %i \n", config->int_params[AC_my]); fprintf(infotxt, "int AC_my %i \n", config->int_params[AC_my]);
fprintf(infotxt, "int AC_mz %i \n", config->int_params[AC_mz]); fprintf(infotxt, "int AC_mz %i \n", config->int_params[AC_mz]);
@@ -96,28 +96,26 @@ void write_mesh_info(const AcMeshInfo* config)
fprintf(infotxt, "real AC_inv_dsx %e \n", (double)config->real_params[AC_inv_dsx]); fprintf(infotxt, "real AC_inv_dsx %e \n", (double)config->real_params[AC_inv_dsx]);
fprintf(infotxt, "real AC_inv_dsy %e \n", (double)config->real_params[AC_inv_dsy]); fprintf(infotxt, "real AC_inv_dsy %e \n", (double)config->real_params[AC_inv_dsy]);
fprintf(infotxt, "real AC_inv_dsz %e \n", (double)config->real_params[AC_inv_dsz]); fprintf(infotxt, "real AC_inv_dsz %e \n", (double)config->real_params[AC_inv_dsz]);
fprintf(infotxt, "real AC_dsmin %e \n", (double)config->real_params[AC_dsmin ]); fprintf(infotxt, "real AC_dsmin %e \n", (double)config->real_params[AC_dsmin]);
/* Additional helper params */ /* Additional helper params */
// Int helpers // Int helpers
fprintf(infotxt, "int AC_mxy %i \n", config->int_params[AC_mxy ]); fprintf(infotxt, "int AC_mxy %i \n", config->int_params[AC_mxy]);
fprintf(infotxt, "int AC_nxy %i \n", config->int_params[AC_nxy ]); fprintf(infotxt, "int AC_nxy %i \n", config->int_params[AC_nxy]);
fprintf(infotxt, "int AC_nxyz %i \n", config->int_params[AC_nxyz]); fprintf(infotxt, "int AC_nxyz %i \n", config->int_params[AC_nxyz]);
// Real helpers // Real helpers
fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]); fprintf(infotxt, "real AC_cs2_sound %e \n", (double)config->real_params[AC_cs2_sound]);
fprintf(infotxt, "real AC_cv_sound %e \n", (double)config->real_params[AC_cv_sound ]); fprintf(infotxt, "real AC_cv_sound %e \n", (double)config->real_params[AC_cv_sound]);
fclose(infotxt); fclose(infotxt);
} }
// This funtion writes a run state into a set of C binaries. For the sake of
//This funtion writes a run state into a set of C binaries. For the sake of // accuracy, all floating point numbers are to be saved in long double precision
//accuracy, all floating point numbers are to be saved in long double precision // regardless of the choise of accuracy during runtime.
//regardless of the choise of accuracy during runtime.
static inline void static inline void
save_mesh(const AcMesh &save_mesh, const int step, save_mesh(const AcMesh& save_mesh, const int step, const AcReal t_step)
const AcReal t_step)
{ {
FILE* save_ptr; FILE* save_ptr;
@@ -128,7 +126,7 @@ save_mesh(const AcMesh &save_mesh, const int step,
char cstep[10]; char cstep[10];
char bin_filename[80] = "\0"; char bin_filename[80] = "\0";
//sprintf(bin_filename, ""); // sprintf(bin_filename, "");
sprintf(cstep, "%d", step); sprintf(cstep, "%d", step);
@@ -139,28 +137,25 @@ save_mesh(const AcMesh &save_mesh, const int step,
printf("Savefile %s \n", bin_filename); printf("Savefile %s \n", bin_filename);
save_ptr = fopen(bin_filename,"wb"); save_ptr = fopen(bin_filename, "wb");
//Start file with time stamp // Start file with time stamp
long double write_long_buf = (long double) t_step; long double write_long_buf = (long double)t_step;
fwrite(&write_long_buf, sizeof(long double), 1, save_ptr); fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
//Grid data // Grid data
for (size_t i = 0; i < n; ++i) { for (size_t i = 0; i < n; ++i) {
const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i]; const AcReal point_val = save_mesh.vertex_buffer[VertexBufferHandle(w)][i];
long double write_long_buf = (long double) point_val; long double write_long_buf = (long double)point_val;
fwrite(&write_long_buf, sizeof(long double), 1, save_ptr); fwrite(&write_long_buf, sizeof(long double), 1, save_ptr);
} }
fclose(save_ptr); fclose(save_ptr);
} }
} }
// This function prints out the diagnostic values to std.out and also saves and // This function prints out the diagnostic values to std.out and also saves and
// appends an ascii file to contain all the result. // appends an ascii file to contain all the result.
static inline void static inline void
print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *diag_file) print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE* diag_file)
{ {
AcReal buf_rms, buf_max, buf_min; AcReal buf_rms, buf_max, buf_min;
@@ -174,11 +169,10 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
// MV: The ordering in the earlier version was wrong in terms of variable // MV: The ordering in the earlier version was wrong in terms of variable
// MV: name and its diagnostics. // MV: name and its diagnostics.
printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt)); printf("Step %d, t_step %.3e, dt %e s\n", step, double(t_step), double(dt));
printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total", printf(" %*s: min %.3e,\trms %.3e,\tmax %.3e\n", max_name_width, "uu total", double(buf_min),
double(buf_min), double(buf_rms), double(buf_max)); double(buf_rms), double(buf_max));
fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), fprintf(diag_file, "%d %e %e %e %e %e ", step, double(t_step), double(dt), double(buf_min),
double(buf_min), double(buf_rms), double(buf_max)); double(buf_rms), double(buf_max));
// Calculate rms, min and max from the variables as scalars // Calculate rms, min and max from the variables as scalars
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
@@ -194,11 +188,11 @@ print_diagnostics(const int step, const AcReal dt, const AcReal t_step, FILE *di
fprintf(diag_file, "\n"); fprintf(diag_file, "\n");
} }
/* /*
MV NOTE: At the moment I have no clear idea how to calculate magnetic MV NOTE: At the moment I have no clear idea how to calculate magnetic
diagnostic variables from grid. Vector potential measures have a limited diagnostic variables from grid. Vector potential measures have a limited
value. TODO: Smart way to get brms, bmin and bmax. value. TODO: Smart way to get brms, bmin and bmax.
*/ */
int int
run_simulation(void) run_simulation(void)
@@ -213,8 +207,7 @@ run_simulation(void)
acInit(mesh_info); acInit(mesh_info);
acLoad(*mesh); acLoad(*mesh);
FILE* diag_file;
FILE *diag_file;
diag_file = fopen("timeseries.ts", "a"); diag_file = fopen("timeseries.ts", "a");
// TODO Get time from earlier state. // TODO Get time from earlier state.
AcReal t_step = 0.0; AcReal t_step = 0.0;
@@ -222,7 +215,8 @@ run_simulation(void)
// Generate the title row. // Generate the title row.
fprintf(diag_file, "step t_step dt uu_total_min uu_total_rms uu_total_max "); fprintf(diag_file, "step t_step dt uu_total_min uu_total_rms uu_total_max ");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
fprintf(diag_file, "%s_min %s_rms %s_max ", vtxbuf_names[i], vtxbuf_names[i], vtxbuf_names[i]); fprintf(diag_file, "%s_min %s_rms %s_max ", vtxbuf_names[i], vtxbuf_names[i],
vtxbuf_names[i]);
} }
fprintf(diag_file, "\n"); fprintf(diag_file, "\n");
@@ -234,17 +228,16 @@ run_simulation(void)
acStore(mesh); acStore(mesh);
save_mesh(*mesh, 0, t_step); save_mesh(*mesh, 0, t_step);
const int max_steps = mesh_info.int_params[AC_max_steps]; const int max_steps = mesh_info.int_params[AC_max_steps];
const int save_steps = mesh_info.int_params[AC_save_steps]; const int save_steps = mesh_info.int_params[AC_save_steps];
const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; //TODO Get from mesh_info const int bin_save_steps = mesh_info.int_params[AC_bin_steps]; // TODO Get from mesh_info
AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t]; AcReal bin_save_t = mesh_info.real_params[AC_bin_save_t];
AcReal bin_crit_t = bin_save_t; AcReal bin_crit_t = bin_save_t;
/* Step the simulation */ /* Step the simulation */
for (int i = 1; i < max_steps; ++i) { for (int i = 1; i < max_steps; ++i) {
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
VTXBUF_UUZ);
const AcReal dt = host_timestep(umax, mesh_info); const AcReal dt = host_timestep(umax, mesh_info);
acIntegrate(dt); acIntegrate(dt);
@@ -254,33 +247,32 @@ run_simulation(void)
if ((i % save_steps) == 0) { if ((i % save_steps) == 0) {
/* /*
print_diagnostics() writes out both std.out printout from the print_diagnostics() writes out both std.out printout from the
results and saves the diagnostics into a table for ascii file results and saves the diagnostics into a table for ascii file
timeseries.ts. timeseries.ts.
*/ */
print_diagnostics(i, dt, t_step, diag_file); print_diagnostics(i, dt, t_step, diag_file);
/* /*
We would also might want an XY-average calculating funtion, We would also might want an XY-average calculating funtion,
which can be very useful when observing behaviour of turbulent which can be very useful when observing behaviour of turbulent
simulations. (TODO) simulations. (TODO)
*/ */
} }
/* Save the simulation state and print diagnostics */ /* Save the simulation state and print diagnostics */
if ((i % bin_save_steps) == 0 || t_step >= bin_crit_t) { if ((i % bin_save_steps) == 0 || t_step >= bin_crit_t) {
/* /*
This loop saves the data into simple C binaries which can be This loop saves the data into simple C binaries which can be
used for analysing the data snapshots closely. used for analysing the data snapshots closely.
Saving simulation state should happen in a separate stage. We do Saving simulation state should happen in a separate stage. We do
not want to save it as often as diagnostics. The file format not want to save it as often as diagnostics. The file format
should IDEALLY be HDF5 which has become a well supported, portable and should IDEALLY be HDF5 which has become a well supported, portable and
reliable data format when it comes to HPC applications. reliable data format when it comes to HPC applications.
However, implementing it will have to for more simpler approach However, implementing it will have to for more simpler approach
to function. (TODO?) to function. (TODO?)
*/ */
@@ -300,9 +292,7 @@ run_simulation(void)
save_mesh(*mesh, i, t_step); save_mesh(*mesh, i, t_step);
bin_crit_t += bin_save_t; bin_crit_t += bin_save_t;
} }
} }
//////Save the final snapshot //////Save the final snapshot
@@ -318,25 +308,3 @@ run_simulation(void)
return 0; return 0;
} }

View File

@@ -52,8 +52,7 @@ timer_diff_nsec(const Timer start)
{ {
Timer end; Timer end;
timer_reset(&end); timer_reset(&end);
const long diff = (end.tv_sec - start.tv_sec) * 1000000000l + const long diff = (end.tv_sec - start.tv_sec) * 1000000000l + (end.tv_nsec - start.tv_nsec);
(end.tv_nsec - start.tv_nsec);
return diff; return diff;
} }