I have inspected the branch and it compiles and functions fine on my system. NOTE: I could not test MPI at the moment, but it should not prevent the merge.
Merged in mpi-to-master-merge-candidate-2020-06-01 (pull request #12) Astaroth 2.3
This commit is contained in:
@@ -1,5 +1,8 @@
## CMake settings
cmake_minimum_required(VERSION 3.9) # Required for first-class CUDA support
# V3.9 required for first-class CUDA support
# V3.17 required for the FindCUDAToolkit package
# V3.18 required for CMAKE_CUDA_ARCHITECTURES
cmake_minimum_required(VERSION 3.18)
find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH)
find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH)

@@ -14,12 +17,19 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 11)

find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_...
cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}")
set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}")

## CUDA
# GPU, compute capability
# K40,  3.5
# K80,  3.7
# P100, 6.0
# V100, 7.0
if (NOT CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES 60 70) # Default
else ()
    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES}) # User-specified
endif()
string(REPLACE " " "," CUDA_COMMON_FLAGS "${COMMON_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options=${CUDA_COMMON_FLAGS}")

## Build type
if(NOT CMAKE_BUILD_TYPE)

@@ -53,7 +63,8 @@ message(STATUS "AC module dir: ${DSL_MODULE_DIR}")
file(GLOB DSL_SOURCES ${DSL_MODULE_DIR}/*
                      ${CMAKE_SOURCE_DIR}/acc/stdlib/*)
set(DSL_HEADERS "${PROJECT_BINARY_DIR}/user_kernels.h"
                "${PROJECT_BINARY_DIR}/user_defines.h")
                "${PROJECT_BINARY_DIR}/user_defines.h"
                "${PROJECT_BINARY_DIR}/astaroth.f90")

add_custom_command (
    COMMENT "Building ACC objects ${DSL_MODULE_DIR}"

@@ -79,6 +90,11 @@ include_directories(src/common) # Common headers
include_directories(${CMAKE_BINARY_DIR}) # DSL headers
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # CUDA headers

if (MPI_ENABLED)
    find_package(MPI REQUIRED)
    include_directories(${MPI_CXX_INCLUDE_DIRS})
endif()

## Subdirectories
add_subdirectory(src/utils)
add_subdirectory(src/core/kernels)

@@ -89,6 +105,10 @@ if (BUILD_SAMPLES)
    add_subdirectory(samples/cpptest)
    add_subdirectory(samples/mpitest)
    add_subdirectory(samples/benchmark)
    add_subdirectory(samples/bwtest)
    add_subdirectory(samples/genbenchmarkscripts)
    add_subdirectory(samples/mpi_reduce_bench)
    add_subdirectory(samples/fortrantest)
endif()

if (BUILD_STANDALONE)
@@ -40,6 +40,7 @@ In the base directory, run

| Option | Description | Default |
|--------|-------------|---------|
| CMAKE_BUILD_TYPE | Selects the build type. Possible values: Debug, Release, RelWithDebInfo, MinSizeRel. See [CMake documentation](https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html) for more details. | Release |
| CUDA_ARCHITECTURES | Selects CUDA architecture support. Multiple architectures delimited by `;`. See [CMake documentation](https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html) for more details. | "60;70" |
| DOUBLE_PRECISION | Generates double precision code. | OFF |
| BUILD_SAMPLES | Builds projects in the samples subdirectory. | OFF |
| BUILD_STANDALONE | Builds a standalone library for testing, benchmarking and simulation. | ON |
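
For example, an MPI-enabled build with samples could be configured as follows (an illustrative invocation using the options above; adjust the flags to your setup):

```
mkdir -p build && cd build
cmake -DMPI_ENABLED=ON -DBUILD_SAMPLES=ON -DCUDA_ARCHITECTURES="60;70" ..
make -j
```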

@@ -67,6 +68,7 @@ See `analysis/python/` directory of existing data visualization and analysis scr

* `astaroth/include/astaroth.h`: Astaroth main header. Contains the interface for accessing single- and multi-GPU layers.
* `astaroth/include/astaroth_utils.h`: Utility library header. Provides functions for performing common tasks on host, such as allocating and verifying meshes.
* `<build directory>/astaroth.f90`: Fortran interface to Astaroth. Generated when building the library.

## FAQ
@@ -39,9 +39,11 @@ ASTNode* root = NULL;
// Output files
static FILE* DSLHEADER  = NULL;
static FILE* CUDAHEADER = NULL;
static FILE* FHEADER    = NULL;

static const char* dslheader_filename  = "user_defines.h";
static const char* cudaheader_filename = "user_kernels.h";
static const char* fheader_filename    = "astaroth.f90";

// Forward declaration of yyparse
int yyparse(void);

@@ -98,7 +100,8 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
    ['<'] = "<",
    ['>'] = ">",
    ['!'] = "!",
    ['.'] = "."};
    ['.'] = ".",
};

static const char*
translate(const int token)

@@ -261,9 +264,8 @@ traverse(const ASTNode* node)
        if (typequal->token == KERNEL) {
            fprintf(CUDAHEADER, "GEN_KERNEL_PARAM_BOILERPLATE");
            if (node->lhs != NULL) {
                fprintf(
                    stderr,
                    "Syntax error: function parameters for Kernel functions not allowed!\n");
                fprintf(stderr, "Syntax error: function parameters for Kernel functions not "
                                "allowed!\n");
            }
        }
        else if (typequal->token == PREPROCESSED) {

@@ -597,65 +599,173 @@ generate_preprocessed_structures(void)
}

static void
generate_header(void)
generate_headers(void)
{
    // Fortran interface
    const char* fortran_interface = R"(
! -*-f90-*- (for emacs) vim:set filetype=fortran: (for vim)

! Utils (see astaroth_fortran.cc for definitions)
external acupdatebuiltinparams
external acgetdevicecount

! Device interface (see astaroth_fortran.cc for definitions)
external acdevicecreate, acdevicedestroy
external acdeviceprintinfo
external acdeviceloadmeshinfo
external acdeviceloadmesh, acdevicestoremesh
external acdeviceintegratesubstep
external acdeviceperiodicboundconds
external acdeviceswapbuffers
external acdevicereducescal, acdevicereducevec
external acdevicesynchronizestream
)";
    fprintf(FHEADER, "%s\n", fortran_interface);

    fprintf(DSLHEADER, "#pragma once\n");

    // Int params
    fprintf(DSLHEADER, "#define AC_FOR_USER_INT_PARAM_TYPES(FUNC)");
    int enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == INT && symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_INT_PARAMS = %d\n\n", enumcounter);

    // Int3 params
    fprintf(DSLHEADER, "#define AC_FOR_USER_INT3_PARAM_TYPES(FUNC)");
    enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == INT3 && symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);

            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_INT3_PARAMS = %d\n\n", enumcounter);

    // Scalar params
    fprintf(DSLHEADER, "#define AC_FOR_USER_REAL_PARAM_TYPES(FUNC)");
    enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == SCALAR && symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);

            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_REAL_PARAMS = %d\n\n", enumcounter);

    // Vector params
    fprintf(DSLHEADER, "#define AC_FOR_USER_REAL3_PARAM_TYPES(FUNC)");
    enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == VECTOR && symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);

            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_REAL3_PARAMS = %d\n\n", enumcounter);

    // Scalar fields
    fprintf(DSLHEADER, "#define AC_FOR_VTXBUF_HANDLES(FUNC)");
    enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == SCALARFIELD &&
            symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);

            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_VTXBUF_HANDLES = %d\n\n", enumcounter);

    // Scalar arrays
    fprintf(DSLHEADER, "#define AC_FOR_SCALARARRAY_HANDLES(FUNC)");
    enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == SCALARARRAY &&
            symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);

            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_SCALARRAY_HANDLES = %d\n\n", enumcounter);

    // Streams
    const size_t nstreams = 20;
    for (size_t i = 0; i < nstreams; ++i) {
        fprintf(DSLHEADER, "#define STREAM_%lu (%lu)\n", i, i);
        fprintf(FHEADER, "integer(c_int), parameter :: STREAM_%lu = %lu\n", i, i);
    }
    fprintf(DSLHEADER, "#define NUM_STREAMS (%lu)\n", nstreams);
    fprintf(DSLHEADER, "#define STREAM_DEFAULT (STREAM_0)\n");
    fprintf(DSLHEADER, "#define STREAM_ALL (NUM_STREAMS)\n");
    fprintf(FHEADER, "integer(c_int), parameter :: NUM_STREAMS = %lu\n", nstreams);
    fprintf(FHEADER, "integer(c_int), parameter :: STREAM_DEFAULT = STREAM_0\n");
    fprintf(FHEADER, "integer(c_int), parameter :: STREAM_ALL = NUM_STREAMS\n");

    fprintf(DSLHEADER, "typedef int Stream;\n\n");

    // Reduction types
    size_t counter = 0;
    fprintf(DSLHEADER, "#define RTYPE_MAX (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MAX = %lu\n", counter);
    ++counter;

    fprintf(DSLHEADER, "#define RTYPE_MIN (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MIN = %lu\n", counter);
    ++counter;

    fprintf(DSLHEADER, "#define RTYPE_RMS (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_RMS = %lu\n", counter);
    ++counter;

    fprintf(DSLHEADER, "#define RTYPE_RMS_EXP (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_RMS_EXP = %lu\n", counter);
    ++counter;

    fprintf(DSLHEADER, "#define RTYPE_SUM (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_SUM = %lu\n", counter);
    ++counter;

    fprintf(DSLHEADER, "typedef int ReductionType;\n");
    fprintf(DSLHEADER, "#define NUM_REDUCTION_TYPES (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: NUM_REDUCTION_TYPES = %lu\n", counter);

    // Fortran structs
    const char* fortran_structs = R"(
type, bind(C) :: AcMeshInfo
integer(c_int), dimension(AC_NUM_INT_PARAMS) :: int_params
integer(c_int), dimension(AC_NUM_INT3_PARAMS, 3) :: int3_params
real, dimension(AC_NUM_REAL_PARAMS) :: real_params
real, dimension(AC_NUM_REAL3_PARAMS, 3) :: real3_params
end type AcMeshInfo
)";
    fprintf(FHEADER, "%s\n", fortran_structs);
}

static void

@@ -681,20 +791,21 @@ main(void)

    DSLHEADER  = fopen(dslheader_filename, "w+");
    CUDAHEADER = fopen(cudaheader_filename, "w+");
    FHEADER    = fopen(fheader_filename, "w+");
    assert(DSLHEADER);
    assert(CUDAHEADER);
    assert(FHEADER);

    // Add built-in param symbols
    for (size_t i = 0; i < ARRAY_SIZE(builtin_int_params); ++i) {
    for (size_t i = 0; i < ARRAY_SIZE(builtin_int_params); ++i)
        add_symbol(SYMBOLTYPE_OTHER, UNIFORM, INT, builtin_int_params[i]);
    }
    for (size_t i = 0; i < ARRAY_SIZE(builtin_int3_params); ++i) {

    for (size_t i = 0; i < ARRAY_SIZE(builtin_int3_params); ++i)
        add_symbol(SYMBOLTYPE_OTHER, UNIFORM, INT3, builtin_int3_params[i]);
    }

    // Generate
    traverse(root);
    generate_header();
    generate_headers();
    generate_preprocessed_structures();
    generate_library_hooks();

@@ -703,9 +814,12 @@ main(void)
    // Cleanup
    fclose(DSLHEADER);
    fclose(CUDAHEADER);
    fclose(FHEADER);
    astnode_destroy(root);

    fprintf(stdout, "-- Generated %s\n", dslheader_filename);
    fprintf(stdout, "-- Generated %s\n", cudaheader_filename);
    fprintf(stdout, "-- Generated %s\n", fheader_filename);

    return EXIT_SUCCESS;
}
@@ -12,6 +12,8 @@ image: nvidia/cuda
# ==> Updating the kernel drivers by ourselves probably requires creating our own docker image.
# ===> Which might not even work since I don't know what kind of hardware we're running on (lspci was not available)

options:
  max-time: 5 # Max time allowed for building (minutes)
pipelines:
  # default: # Default is run at every push but we have only 500 build minutes / month so that probably wouldn't work out
  custom: # Manual/scheduled building only

@@ -20,7 +22,11 @@ pipelines:
        script: # Modify the commands below to build your repository.
          - mkdir -p build && cd build
          - apt-get update
          - apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev
          - apt-get install -y apt-transport-https ca-certificates gnupg software-properties-common wget
          - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
          - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
          - apt-get update
          - apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev gfortran
          - cmake -DDSL_MODULE_DIR="acc/mhd_solver" -DBUILD_STANDALONE=ON -DBUILD_UTILS=ON -DBUILD_RT_VISUALIZATION=OFF -DBUILD_SAMPLES=ON -DDOUBLE_PRECISION=OFF -DMULTIGPU_ENABLED=ON -DMPI_ENABLED=OFF .. # Single precision
          - make -j
          - rm -rf *
@@ -79,7 +79,7 @@ typedef enum {
```

The API is divided into layers which differ in the level of control provided over the execution.
There are two primary layers:
There are three primary layers:

* Device layer
  * Functions start with acDevice.

@@ -92,7 +92,13 @@ There are two primary layers:
  * All functions are asynchronous and executed concurrently on all devices in the node.
  * Subsequent functions called in the same stream (see Section #Streams and synchronization) are guaranteed to be synchronous.

Finally, a third layer is provided for convenience and backwards compatibility.
* Grid layer
  * Functions start with acGrid.
  * Provides control over all devices on multiple nodes.
  * Requires MPI. `MPI_Init()` must be called before calling any acGrid functions.
  * Streams are used to control concurrency the same way as on the acDevice and acNode layers.

Finally, a fourth layer is provided for convenience and backwards compatibility.

* Astaroth layer (deprecated)
  * Functions start with `ac` only, f.ex. acInit().

@@ -126,6 +132,12 @@ AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* co
AcResult acNodeAutoOptimize(const Node node);
```

Grid layer.
```C
AcResult acGridInit(const AcMeshInfo info);
AcResult acGridQuit(void);
```
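
As a minimal sketch of the acGrid layer in use (based on the functions listed in this document; the dt value is arbitrary and error checking is omitted):

```C
#include <mpi.h>

#include "astaroth.h"
#include "astaroth_utils.h"

int
main(void)
{
    MPI_Init(NULL, NULL); // Must precede any acGrid call

    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info); // Utility, see astaroth_utils.h

    acGridInit(info);
    acGridIntegrate(STREAM_DEFAULT, (AcReal)1e-6); // One integration step
    acGridPeriodicBoundconds(STREAM_DEFAULT);
    acGridSynchronizeStream(STREAM_ALL);
    acGridQuit();

    MPI_Finalize();
    return 0;
}
```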

General helper functions.
```C
size_t acVertexBufferSize(const AcMeshInfo info);

@@ -159,6 +171,7 @@ AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream,
                                          const AcMesh host_mesh,
                                          const VertexBufferHandle vtxbuf_handle, const int3 src,
                                          const int3 dst, const int num_vertices);
AcResult acGridLoadMesh(const AcMesh host_mesh, const Stream stream);
```

Storing meshes and vertex buffers to host memory.

@@ -180,6 +193,7 @@ AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
                                           const VertexBufferHandle vtxbuf_handle, const int3 src,
                                           const int3 dst, const int num_vertices,
                                           AcMesh* host_mesh);
AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
```

Transferring data between devices

@@ -242,6 +256,13 @@ AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionT
AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                         const VertexBufferHandle vtxbuf2, AcReal* result);
AcResult acGridIntegrate(const Stream stream, const AcReal dt);
AcResult acGridPeriodicBoundconds(const Stream stream);
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                         const VertexBufferHandle vtxbuf2, AcReal* result);
```

Finally, there's a library function that is automatically generated for all user-specified `Kernel`

@@ -261,10 +282,13 @@ yet completed. Therefore special care must be taken in order to ensure proper sy

Synchronization is done using `Stream` primitives, defined as
```C
typedef enum { STREAM_DEFAULT, STREAM_0, ..., STREAM_16, NUM_STREAMS } Stream;
typedef enum {STREAM_0, ..., STREAM_15, NUM_STREAMS};
#define STREAM_DEFAULT (STREAM_0)
#define STREAM_ALL (NUM_STREAMS)
```

> **Note:** There are guaranteed to be at least 16 distinct streams.

Functions queued in the same stream will be executed sequentially. If two or more consecutive
functions are queued in different streams, then these functions may execute in parallel. For
additional control over streams, there is a barrier synchronization function which blocks execution

@@ -286,6 +310,7 @@ Astaroth API provides the following functions for barrier synchronization.
AcResult acSynchronize(void);
AcResult acNodeSynchronizeStream(const Node node, const Stream stream);
AcResult acDeviceSynchronizeStream(const Device device, const Stream stream);
AcResult acGridSynchronizeStream(const Stream stream);
```
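
For instance, two reductions queued in different streams may overlap, and a barrier then makes their results safe to read (a sketch; `VTXBUF_A` and `VTXBUF_B` are hypothetical user-declared field handles):

```C
AcReal max_a, max_b;
// Queued in different streams, so the two reductions may run concurrently
acNodeReduceScal(node, STREAM_0, RTYPE_MAX, VTXBUF_A, &max_a);
acNodeReduceScal(node, STREAM_1, RTYPE_MAX, VTXBUF_B, &max_b);
// Barrier: blocks until all work queued on the node has completed
acNodeSynchronizeStream(node, STREAM_ALL);
```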

## Data Synchronization

@@ -408,13 +433,16 @@ int mz = info.int_params[AC_mz];
after initialization.


### Decomposition
Grids and subgrids contain the dimensions of the mesh decomposed to multiple devices.
### Decomposition (`acNode` layer)

> **Note:** This section describes implementation details specific to the acNode layer. The acGrid layer is not related to the `GridDims` structure described here.

`GridDims` contains the dimensions of the mesh decomposed to multiple devices.
```C
typedef struct {
    int3 m; // Size of the simulation domain (includes the ghost zones)
    int3 n; // Size of the computational domain (without ghost zones)
} Grid;
} GridDims;
```

As briefly discussed in the section Data synchronization, a `Mesh` is distributed to multiple

@@ -427,8 +455,6 @@ Let *i* be the device id. The portion of the halos shared by neighboring devices
`acNodeSynchronizeVertexBuffer` and `acNodeSynchronizeMesh` communicate these shared areas among
the devices in the node.

> **Note:** The decomposition scheme is subject to change.

# Astaroth Domain-Specific Language

We designed the Astaroth Domain-specific Language (DSL) for expressing stencil computations in a

@@ -478,8 +504,8 @@ In addition to basic datatypes in C/C++/CUDA, such as int and int3, we provide t
| Complex | A tuple of two 32- or 64-bit floating-point numbers. The real part is stored in member .x, while the imaginary component is in .y. Basic operations, such as multiplication, are defined as built-in functions. | std::complex<float> or std::complex<double> |
| Matrix | A tuple of three Vectors. Is stored in column-major order, f.ex. Matrix[i][j] is the component on row i, column j. (TODO recheck specs.) | float3[3] or double3[3] |
| ScalarArray | A one-dimensional array of Scalars stored in device memory. Given mesh dimensions (mx, my, mz), consists of max(mx, max(my, mz)) elements. | float[] or double[] |
| ScalarField | An abstraction of a three-dimensional scalar field stored in device memory. Is implemented as a handle to a one-dimensional Scalar array consisting of input and output segments. The data is stored linearly in order i + j * mx + k * mx * my, given some vertex index (i, j, k) and a mesh consisting of (mx, my, mz) vertices. | float[2][] or double[2][] |
| VectorField | An abstraction of a three-dimensional vector field stored in device memory. Is implemented as a tuple of three ScalarField handles. | Three distinct float[2][] or double[2][] arrays for each component. Stored as a structure of arrays. |
| ScalarField | A three-dimensional scalar field stored in row-wise scan order where coordinates `(i, j, k)` correspond to a one-dimensional index `i + j * mx + k * mx * my`. Consists of two buffers, one used for input and another one for output. | Two float[] or double[] arrays |
| VectorField | A three-dimensional vector field. Consists of three `ScalarFields`. | Three `ScalarFields` stored contiguously in memory as a structure of arrays |
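
As an illustration of the row-wise scan order described above, the vertex-to-buffer mapping can be written as follows (a hypothetical helper, not part of the API):

```C
// Maps vertex (i, j, k) of a mesh with (mx, my, mz) vertices to its
// position in a ScalarField buffer: i + j * mx + k * mx * my.
static inline size_t
vertex_idx(const int i, const int j, const int k, const int mx, const int my)
{
    return (size_t)i + (size_t)j * (size_t)mx + (size_t)k * (size_t)mx * (size_t)my;
}
```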

## Precision

doxyfile (2 changed lines)
@@ -38,7 +38,7 @@ PROJECT_NAME = "Astaroth"
# could be handy for archiving the generated documentation or if some version
# control system is used.

PROJECT_NUMBER = 2.2
PROJECT_NUMBER = 2.3

# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@@ -50,6 +50,8 @@ typedef struct {

typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;

/*
// Deprecated, defined during code generation
typedef enum {
    RTYPE_MAX,
    RTYPE_MIN,

@@ -58,29 +60,7 @@ typedef enum {
    RTYPE_SUM,
    NUM_REDUCTION_TYPES
} ReductionType;

typedef enum {
    STREAM_DEFAULT,
    STREAM_0,
    STREAM_1,
    STREAM_2,
    STREAM_3,
    STREAM_4,
    STREAM_5,
    STREAM_6,
    STREAM_7,
    STREAM_8,
    STREAM_9,
    STREAM_10,
    STREAM_11,
    STREAM_12,
    STREAM_13,
    STREAM_14,
    STREAM_15,
    STREAM_16,
    NUM_STREAMS
} Stream;
#define STREAM_ALL (NUM_STREAMS)
*/

#define AC_GEN_ID(X) X,
typedef enum {

@@ -320,6 +300,15 @@ AcResult acGridIntegrate(const Stream stream, const AcReal dt);

/** */
AcResult acGridPeriodicBoundconds(const Stream stream);

/** TODO */
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
                          const VertexBufferHandle vtxbuf_handle, AcReal* result);

/** TODO */
AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                         const VertexBufferHandle vtxbuf2, AcReal* result);
#endif // AC_MPI_ENABLED

/*

@@ -573,6 +562,20 @@ AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const
/** */
AcResult acDeviceRunMPITest(void);

/*
 * =============================================================================
 * Helper functions
 * =============================================================================
 */
/** Updates the built-in parameters based on nx, ny and nz */
AcResult acUpdateBuiltinParams(AcMeshInfo* config);

/** Creates a mesh stored in host memory */
AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);

/** Destroys a mesh stored in host memory */
AcResult acMeshDestroy(AcMesh* mesh);

#ifdef __cplusplus
} // extern "C"
#endif
@@ -29,17 +29,57 @@
extern "C" {
#endif

#include <stdbool.h>

#define ERROR_LABEL_LENGTH 30

typedef struct {
    char label[ERROR_LABEL_LENGTH];
    VertexBufferHandle handle;
    AcReal model;
    AcReal candidate;
    long double abs_error;
    long double ulp_error;
    long double rel_error;
    AcReal maximum_magnitude;
    AcReal minimum_magnitude;
} Error;

typedef struct {
    char label[ERROR_LABEL_LENGTH];
    VertexBufferHandle vtxbuf;
    ReductionType rtype;
    AcReal candidate;
} AcScalReductionTestCase;

typedef struct {
    char label[ERROR_LABEL_LENGTH];
    VertexBufferHandle a;
    VertexBufferHandle b;
    VertexBufferHandle c;
    ReductionType rtype;
    AcReal candidate;
} AcVecReductionTestCase;

/** TODO comment */
Error acGetError(AcReal model, AcReal candidate);

/** TODO comment */
bool printErrorToScreen(const Error error);

/** Loads data from the config file */
AcResult acLoadConfig(const char* config_path, AcMeshInfo* config);

/** Updates the built-in parameters based on nx, ny and nz */
AcResult acUpdateBuiltinParams(AcMeshInfo* config);
/** */
AcScalReductionTestCase acCreateScalReductionTestCase(const char* label,
                                                      const VertexBufferHandle vtxbuf,
                                                      const ReductionType rtype);

/** */
AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);

/** */
AcResult acMeshDestroy(AcMesh* mesh);
AcVecReductionTestCase acCreateVecReductionTestCase(const char* label, const VertexBufferHandle a,
                                                    const VertexBufferHandle b,
                                                    const VertexBufferHandle c,
                                                    const ReductionType rtype);

/** */
AcResult acMeshSet(const AcReal value, AcMesh* mesh);

@@ -56,9 +96,24 @@ AcResult acMeshClear(AcMesh* mesh);
/** */
AcResult acModelIntegrateStep(AcMesh mesh, const AcReal dt);

/** TODO */
AcReal acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a);

/** TODO */
AcReal acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a,
                        const VertexBufferHandle b, const VertexBufferHandle c);

/** */
AcResult acVerifyMesh(const AcMesh model, const AcMesh candidate);

/** */
AcResult acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases,
                                const size_t numCases);

/** */
AcResult acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases,
                               const size_t numCases);

#ifdef __cplusplus
} // extern "C"
#endif
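The new reduction test cases might be driven as follows (a sketch inferred from the declarations above; `VTXBUF_A` is a hypothetical field handle, and the candidate values would come from the corresponding device-side reductions):

```C
AcScalReductionTestCase cases[] = {
    acCreateScalReductionTestCase("Scal MAX", VTXBUF_A, RTYPE_MAX),
    acCreateScalReductionTestCase("Scal MIN", VTXBUF_A, RTYPE_MIN),
};
const size_t num_cases = sizeof(cases) / sizeof(cases[0]);

// ... run the device/grid reductions and store each result in
// cases[i].candidate before verifying ...

acVerifyScalReductions(model, cases, num_cases); // model: host-side reference mesh
```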
@@ -39,8 +39,47 @@ typedef enum {
    NUM_TESTS,
} TestType;

#include <stdint.h>

typedef struct {
    uint64_t x, y, z;
} uint3_64;

static uint3_64
operator+(const uint3_64& a, const uint3_64& b)
{
    return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
}

static uint3_64
morton3D(const uint64_t pid)
{
    uint64_t i, j, k;
    i = j = k = 0;

    for (int bit = 0; bit <= 21; ++bit) {
        const uint64_t mask = 0x1l << 3 * bit;
        k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
        i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
    }

    return (uint3_64){i, j, k};
}

static uint3_64
decompose(const uint64_t target)
{
    // This is just so beautifully elegant. Complex and efficient decomposition
    // in just one line of code.
    uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};

    ERRCHK_ALWAYS(p.x * p.y * p.z == target);
    return p;
}
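// Illustration (editor's note, not in the original source): morton3D
// deinterleaves the bits of pid, so e.g. morton3D(8 - 1) = (1, 1, 1) and
// decompose(8) = (2, 2, 2), a 2 x 2 x 2 process grid. Power-of-two process
// counts decompose evenly; other counts fail the ERRCHK_ALWAYS above.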

int
main(void)
main(int argc, char** argv)
{
    MPI_Init(NULL, NULL);
    int nprocs, pid;

@@ -51,9 +90,30 @@ main(void)
    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info);

    if (argc > 1) {
        if (argc == 4) {
            const int nx = atoi(argv[1]);
            const int ny = atoi(argv[2]);
            const int nz = atoi(argv[3]);
            info.int_params[AC_nx] = nx;
            info.int_params[AC_ny] = ny;
            info.int_params[AC_nz] = nz;
            acUpdateBuiltinParams(&info);
            printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz);
        }
        else {
            fprintf(stderr, "Could not parse arguments. Usage: ./benchmark <nx> <ny> <nz>.\n");
            exit(EXIT_FAILURE);
        }
    }

    const TestType test = TEST_STRONG_SCALING;
    if (test == TEST_WEAK_SCALING)
        info.int_params[AC_nz] *= nprocs;
    if (test == TEST_WEAK_SCALING) {
        uint3_64 decomp = decompose(nprocs);
        info.int_params[AC_nx] *= decomp.x;
        info.int_params[AC_ny] *= decomp.y;
        info.int_params[AC_nz] *= decomp.z;
    }

    /*
    AcMesh model, candidate;

@@ -89,19 +149,45 @@ main(void)
        }
    }*/

    /*
    // Basic
    const size_t num_iters = 100;

    // Warmup
    for (size_t i = 0; i < 10; ++i)
    for (size_t i = 0; i < num_iters / 10; ++i)
        acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);

    // Benchmark
    Timer t;
    const AcReal dt = FLT_EPSILON;
    const size_t num_iters = 100;
    const double nth_percentile = 0.90;
    const AcReal dt = FLT_EPSILON;

    acGridSynchronizeStream(STREAM_ALL);
    timer_reset(&t);
    acGridSynchronizeStream(STREAM_ALL);

    for (size_t i = 0; i < num_iters; ++i)
        acGridIntegrate(STREAM_DEFAULT, dt);

    acGridSynchronizeStream(STREAM_ALL);
    if (!pid)
        timer_diff_print(t);
    acGridSynchronizeStream(STREAM_ALL);
    */

    // Percentiles
    const size_t num_iters = 1000;
    const double nth_percentile = 0.90;
    std::vector<double> results; // ms
    results.reserve(num_iters);

    // Warmup
    for (size_t i = 0; i < num_iters / 10; ++i)
        acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);

    // Benchmark
    Timer t;
    const AcReal dt = FLT_EPSILON;

    for (size_t i = 0; i < num_iters; ++i) {
        acGridSynchronizeStream(STREAM_ALL);
        timer_reset(&t);

@@ -109,9 +195,9 @@ main(void)
        acGridIntegrate(STREAM_DEFAULT, dt);
        acGridSynchronizeStream(STREAM_ALL);
        results.push_back(timer_diff_nsec(t) / 1e6);
        acGridSynchronizeStream(STREAM_ALL);
    }

    // Write benchmark to file
    if (!pid) {
        std::sort(results.begin(), results.end(),
                  [](const double& a, const double& b) { return a < b; });

@@ -121,22 +207,58 @@ main(void)
                results[nth_percentile * num_iters], 100 * nth_percentile);

        char path[4096] = "";
        if (test == TEST_STRONG_SCALING)
            strncpy(path, "strong_scaling.csv", sizeof(path));
        else if (test == TEST_WEAK_SCALING)
            strncpy(path, "weak_scaling.csv", sizeof(path));
        else
            ERROR("Invalid test type");
        sprintf(path, "%s_%d.csv", test == TEST_STRONG_SCALING ? "strong" : "weak", nprocs);

        FILE* fp = fopen(path, "a");
        ERRCHK_ALWAYS(fp);
        // Format
        // nprocs, measured (ms)
        fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);

        // nprocs, min, 50th perc, 90th perc, max
        fprintf(fp, "%d, %g, %g, %g, %g\n", nprocs, results[0], results[0.5 * num_iters],
                results[nth_percentile * num_iters], results[num_iters - 1]);
        fclose(fp);
    }

    /*
    const size_t num_iters = 1000;
    const double nth_percentile = 0.90;

    std::vector<double> results; // ms
    results.reserve(num_iters);

    for (size_t i = 0; i < num_iters; ++i) {
        acGridSynchronizeStream(STREAM_ALL);
        timer_reset(&t);
        acGridSynchronizeStream(STREAM_ALL);
        acGridIntegrate(STREAM_DEFAULT, dt);
        acGridSynchronizeStream(STREAM_ALL);
        results.push_back(timer_diff_nsec(t) / 1e6);
    }

    // Write benchmark to file
    if (!pid) {
        std::sort(results.begin(), results.end(),
                  [](const double& a, const double& b) { return a < b; });
        fprintf(stdout,
                "Integration step time %g ms (%gth "
                "percentile)--------------------------------------\n",
                results[nth_percentile * num_iters], 100 * nth_percentile);

        char path[4096] = "";
        if (test == TEST_STRONG_SCALING)
            strncpy(path, "strong_scaling.csv", sizeof(path));
        else if (test == TEST_WEAK_SCALING)
            strncpy(path, "weak_scaling.csv", sizeof(path));
        else
            ERROR("Invalid test type");

        FILE* fp = fopen(path, "a");
        ERRCHK_ALWAYS(fp);
        // Format
        // nprocs, measured (ms)
        fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);

        fclose(fp);
    }*/

    acGridQuit();
    MPI_Finalize();
    return EXIT_SUCCESS;
samples/bwtest/CMakeLists.txt (new file, 9 lines)
@@ -0,0 +1,9 @@
cmake_minimum_required(VERSION 3.17) # Required for modern CUDA::cudart linking

find_package(MPI)
find_package(OpenMP)
find_package(CUDAToolkit)

add_executable(bwtest main.c)
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver)
target_compile_options(bwtest PRIVATE -O3)

samples/bwtest/main.c (new file, 525 lines)
@@ -0,0 +1,525 @@
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <mpi.h>

#include <cuda.h> // CUDA driver API
#include <cuda_runtime_api.h>

#include "timer_hires.h" // From src/common

//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)

#define errchk(x) \
    { \
        if (!(x)) { \
            fprintf(stderr, "errchk(%s) failed", #x); \
            assert(x); \
        } \
    }

/*
Findings:
    - MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly
    - Need to use cudaMalloc for intranode comm for P2P to trigger with MPI
    - For internode one should use pinned memory (RDMA is staged through pinned, gives full
      network speed iff pinned)
    - Both the sending and receiving arrays must be pinned to see performance improvement
      in internode comm
*/

static uint8_t*
allocHost(const size_t bytes)
{
    uint8_t* arr = malloc(bytes);
    errchk(arr);
    return arr;
}

static void
freeHost(uint8_t* arr)
{
    free(arr);
}

static uint8_t*
allocDevice(const size_t bytes)
{
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
    // const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    errchk(retval == cudaSuccess);
    return arr;
}

static uint8_t*
allocDevicePinned(const size_t bytes)
{
#define USE_CUDA_DRIVER_PINNING (1)
#if USE_CUDA_DRIVER_PINNING
    uint8_t* arr = allocDevice(bytes);

    unsigned int flag = 1;
    CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                            (CUdeviceptr)arr);

    errchk(retval == CUDA_SUCCESS);
    return arr;

#else
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    errchk(retval == cudaSuccess);
    return arr;
#endif
}

/*
static uint8_t*
allocDevicePinned(const size_t bytes)
{
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    errchk(retval == cudaSuccess);
    return arr;
}*/

static void
freeDevice(uint8_t* arr)
{
    cudaFree(arr);
}

static void
sendrecv_blocking(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    int nfront = (pid + 1) % nprocs;
    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;

    if (!pid) {
        MPI_Status status;
        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
    }
    else {
        MPI_Status status;
        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
    }
}

static void
sendrecv_nonblocking(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    int nfront = (pid + 1) % nprocs;
    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;

    MPI_Request recv_request, send_request;
    MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &recv_request);
    MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD, &send_request);

    MPI_Status status;
    MPI_Wait(&recv_request, &status);
    MPI_Wait(&send_request, &status);
}

static void
sendrecv_twoway(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    int nfront = (pid + 1) % nprocs;
    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;

    MPI_Status status;
    MPI_Sendrecv(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, dst, BLOCK_SIZE, MPI_BYTE, nback, nback,
                 MPI_COMM_WORLD, &status);
}

static void
sendrecv_nonblocking_multiple(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    MPI_Request recv_requests[nprocs], send_requests[nprocs];
    for (int i = 1; i < nprocs; ++i) {
        int nfront = (pid + i) % nprocs;
        int nback  = (((pid - i) % nprocs) + nprocs) % nprocs;

        MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);
        MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
    }

    for (int i = 1; i < nprocs; ++i) {
        MPI_Status status;
        MPI_Wait(&recv_requests[i], &status);
        MPI_Wait(&send_requests[i], &status);
    }
}

/*
static void
sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    MPI_Request send_requests[nprocs];
    for (int i = 1; i < nprocs; ++i) {
        int nfront = (pid + i) % nprocs;
        MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
    }

    static bool error_shown = false;
    if (!pid && !error_shown) {
        fprintf(stderr, "\tWARNING: make sure you init MPI_Init_thread for OpenMP support (not "
                        "supported on puhti atm "
                        "2020-04-05)\n");
        error_shown = true;
    }
#pragma omp parallel for
    for (int i = 1; i < nprocs; ++i) {
        int nback = (((pid - i) % nprocs) + nprocs) % nprocs;

        MPI_Status status;
        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &status);
    }

    for (int i = 1; i < nprocs; ++i) {
        MPI_Status status;
        MPI_Wait(&send_requests[i], &status);
    }
}
*/

static void
sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    static uint8_t* src_pinned = NULL;
    static uint8_t* dst_pinned = NULL;
    if (!src_pinned)
        src_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed
    if (!dst_pinned)
        dst_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed

    int devices_per_node = -1;
    cudaGetDeviceCount(&devices_per_node);

    MPI_Request recv_requests[nprocs], send_requests[nprocs];
    for (int i = 1; i < nprocs; ++i) {
        int nfront = (pid + i) % nprocs;
        int nback  = (((pid - i) % nprocs) + nprocs) % nprocs;

        if (nback / devices_per_node != pid / devices_per_node) // Not on the same node
            MPI_Irecv(dst_pinned, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD,
                      &recv_requests[i]);
        else
            MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);

        if (nfront / devices_per_node != pid / devices_per_node) // Not on the same node
            MPI_Isend(src_pinned, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD,
                      &send_requests[i]);
        else
            MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
    }

    for (int i = 1; i < nprocs; ++i) {
        MPI_Status status;
        MPI_Wait(&recv_requests[i], &status);
        MPI_Wait(&send_requests[i], &status);
    }
}

static void
send_d2h(uint8_t* src, uint8_t* dst)
{
    cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost);
}

static void
send_h2d(uint8_t* src, uint8_t* dst)
{
    cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
}

static void
sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
{
    cudaStream_t d2h, h2d;
    cudaStreamCreate(&d2h);
    cudaStreamCreate(&h2d);

    cudaMemcpyAsync(hdst, dsrc, BLOCK_SIZE, cudaMemcpyDeviceToHost, d2h);
    cudaMemcpyAsync(ddst, hsrc, BLOCK_SIZE, cudaMemcpyHostToDevice, h2d);

    cudaStreamSynchronize(d2h);
    cudaStreamSynchronize(h2d);

    cudaStreamDestroy(d2h);
    cudaStreamDestroy(h2d);
}

#define PRINT \
    if (!pid) \
    printf

static void
measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*), uint8_t* src,
          uint8_t* dst)
{
    const size_t num_samples = 100;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    PRINT("%s\n", msg);
    MPI_Barrier(MPI_COMM_WORLD);

    PRINT("\tWarming up... ");
    for (size_t i = 0; i < num_samples / 10; ++i)
        sendrecv(src, dst);

    MPI_Barrier(MPI_COMM_WORLD);
    PRINT("Done\n");

    PRINT("\tBandwidth... ");
    fflush(stdout);

    Timer t;
    MPI_Barrier(MPI_COMM_WORLD);
    timer_reset(&t);
    MPI_Barrier(MPI_COMM_WORLD);

    for (size_t i = 0; i < num_samples; ++i)
        sendrecv(src, dst);

    MPI_Barrier(MPI_COMM_WORLD);
    const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
    PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
    MPI_Barrier(MPI_COMM_WORLD);
}

static void
measurebw2(const char* msg, const size_t bytes,
           void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
           uint8_t* hsrc, uint8_t* ddst)
{
    const size_t num_samples = 100;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    PRINT("%s\n", msg);
    MPI_Barrier(MPI_COMM_WORLD);

    PRINT("\tWarming up... ");
    for (size_t i = 0; i < num_samples / 10; ++i)
        sendrecv(dsrc, hdst, hsrc, ddst);

    MPI_Barrier(MPI_COMM_WORLD);
    PRINT("Done\n");

    PRINT("\tBandwidth... ");
    fflush(stdout);

    Timer t;
    MPI_Barrier(MPI_COMM_WORLD);
    timer_reset(&t);
    MPI_Barrier(MPI_COMM_WORLD);

    for (size_t i = 0; i < num_samples; ++i)
        sendrecv(dsrc, hdst, hsrc, ddst);

    MPI_Barrier(MPI_COMM_WORLD);
    const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
    PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
    MPI_Barrier(MPI_COMM_WORLD);
}

int
main(void)
{
    MPI_Init(NULL, NULL);
    // int provided;
    // MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
    // errchk(provided >= MPI_THREAD_MULTIPLE);

    // Disable stdout buffering
    setbuf(stdout, NULL);

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    errchk(nprocs >= 2); // Require at least one neighbor

    MPI_Barrier(MPI_COMM_WORLD);
    if (!pid) {
        printf("Do we have threads? The following should not be ordered (unless very lucky)\n");
#pragma omp parallel for
        for (int i = 0; i < 10; ++i)
            printf("%d, ", i);
        printf("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

    int devices_per_node = -1;
    cudaGetDeviceCount(&devices_per_node);
    const int device_id = pid % devices_per_node;
    cudaSetDevice(device_id);

    printf("Process %d of %d running.\n", pid, nprocs);
    MPI_Barrier(MPI_COMM_WORLD);

    PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));

#if 1
    {
        uint8_t* src = allocHost(BLOCK_SIZE);
        uint8_t* dst = allocHost(BLOCK_SIZE);

        measurebw("Unidirectional bandwidth, blocking (Host)", //
                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
        measurebw("Bidirectional bandwidth, async (Host)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
        measurebw("Bidirectional bandwidth, twoway (Host)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Host)", //
                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
        // measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
        //           2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);

        freeHost(src);
        freeHost(dst);
    }
    PRINT("\n------------------------\n");

    {
        uint8_t* src = allocDevice(BLOCK_SIZE);
        uint8_t* dst = allocDevice(BLOCK_SIZE);

        measurebw("Unidirectional bandwidth, blocking (Device)", //
                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
        measurebw("Bidirectional bandwidth, async (Device)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
        measurebw("Bidirectional bandwidth, twoway (Device)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device)", //
                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
        // measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
        //           2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
                  dst);

        freeDevice(src);
        freeDevice(dst);
    }
    PRINT("\n------------------------\n");

    {
        uint8_t* src = allocDevicePinned(BLOCK_SIZE);
        uint8_t* dst = allocDevicePinned(BLOCK_SIZE);

        measurebw("Unidirectional bandwidth, blocking (Device, pinned)", //
                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
        measurebw("Bidirectional bandwidth, async (Device, pinned)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
        measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);

        freeDevice(src);
        freeDevice(dst);
    }
    PRINT("\n------------------------\n");

    {
        uint8_t* hsrc = allocHost(BLOCK_SIZE);
        uint8_t* hdst = allocHost(BLOCK_SIZE);
        uint8_t* dsrc = allocDevice(BLOCK_SIZE);
        uint8_t* ddst = allocDevice(BLOCK_SIZE);

        measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
        measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);

        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
                   ddst);

        freeDevice(dsrc);
        freeDevice(ddst);
        freeHost(hsrc);
        freeHost(hdst);
    }
    PRINT("\n------------------------\n");

    {
        uint8_t* hsrc = allocHost(BLOCK_SIZE);
        uint8_t* hdst = allocHost(BLOCK_SIZE);
        uint8_t* dsrc = allocDevicePinned(BLOCK_SIZE);
        uint8_t* ddst = allocDevicePinned(BLOCK_SIZE);

        measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
        measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);

        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
                   hsrc, ddst);

        freeDevice(dsrc);
        freeDevice(ddst);
        freeHost(hsrc);
        freeHost(hdst);
    }
    PRINT("\n------------------------\n");
#else
    { // Final run for easy identification with the profiler
        uint8_t* src = allocDevice(BLOCK_SIZE);
        uint8_t* dst = allocDevice(BLOCK_SIZE);

        measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);

        freeDevice(src);
        freeDevice(dst);
    }
#endif

    MPI_Finalize();
    return EXIT_SUCCESS;
}
samples/fortrantest/CMakeLists.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
enable_language(Fortran)

add_executable(fortrantest main.f90)
target_link_libraries(fortrantest astaroth_core)
samples/fortrantest/main.f90 (new file, 23 lines)
@@ -0,0 +1,23 @@
program pc
  use, intrinsic :: iso_c_binding
  implicit none

  include "astaroth.f90"

  type(AcMeshInfo) :: info
  type(c_ptr)      :: device

  print *, "Num int params"
  print *, AC_NUM_INT_PARAMS

  ! Setup config
  info%int_params(AC_nx + 1) = 128
  info%int_params(AC_ny + 1) = 128
  info%int_params(AC_nz + 1) = 128
  call acupdatebuiltinparams(info)

  call acdevicecreate(0, info, device)
  call acdeviceprintinfo(device)
  call acdevicedestroy(device)

end program
samples/genbenchmarkscripts/CMakeLists.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
add_executable(genbenchmarkscripts main.c)

add_custom_command(
    TARGET genbenchmarkscripts POST_BUILD
    COMMAND genbenchmarkscripts
    WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
    COMMENT "Generating benchmark scripts"
)
62
samples/genbenchmarkscripts/main.c
Normal file
62
samples/genbenchmarkscripts/main.c
Normal file
@@ -0,0 +1,62 @@
|
||||
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    const int max_nprocs = 128;
    for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
        char filename[4096];
        sprintf(filename, "benchmark_%d.sh", nprocs);

        FILE* fp = fopen(filename, "w");
        assert(fp);

        // Boilerplate
        fprintf(fp, "#!/bin/bash\n");
        fprintf(fp, "#SBATCH --job-name=astaroth\n");
        fprintf(fp, "#SBATCH --account=project_2000403\n");
        fprintf(fp, "#SBATCH --time=00:14:59\n");
        fprintf(fp, "#SBATCH --mem=32000\n");
        fprintf(fp, "#SBATCH --partition=gpu\n");
        fprintf(fp, "#SBATCH --cpus-per-task=10\n");

        // nprocs, nodes, gpus
        const int max_gpus_per_node = 4;
        const int gpus_per_node     = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
        const int nodes             = (int)ceil((double)nprocs / max_gpus_per_node);
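        // e.g. nprocs = 8 -> gpus_per_node = 4, nodes = 2;
        //      nprocs = 2 -> gpus_per_node = 2, nodes = 1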
        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
        fprintf(fp, "#SBATCH -n %d\n", nprocs);
        fprintf(fp, "#SBATCH -N %d\n", nodes);
        //fprintf(fp, "#SBATCH --exclusive\n");
        if (nprocs > 4)
            fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");

        // Modules
        // OpenMPI
        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi nccl\n");
        // HPCX
        //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
        fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");

        // Profile and run
        //fprintf(fp, "mkdir -p profile_%d\n", nprocs);

        const int nx = 256; // max size 1792
        const int ny = nx;
        const int nz = nx;
        /*
        fprintf(fp,
                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
                //"%d\n",
                "srun ./benchmark %d %d %d\n", nx, ny, nz);
        */
        fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);

        fclose(fp);
    }

    return EXIT_SUCCESS;
}

samples/mpi_reduce_bench/CMakeLists.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
## benchmark
add_executable(mpi_reduce_bench main.cc)
target_link_libraries(mpi_reduce_bench astaroth_core astaroth_utils)

samples/mpi_reduce_bench/main.cc (new file, 169 lines)
@@ -0,0 +1,169 @@
/*
    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
    Running: mpirun -np <num processes> <executable>
*/
#include "astaroth.h"
#include "astaroth_utils.h"

#include "errchk.h"
#include "timer_hires.h"

#if AC_MPI_ENABLED

#include <mpi.h>

#include <algorithm>
#include <string.h>

#include <string>
#include <vector>
#include <chrono>
#include <ctime>

int
main(int argc, char** argv)
{
    MPI_Init(NULL, NULL);
    int nprocs, pid;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);

    // CPU alloc
    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info);

    char* benchmark_label;

    if (argc > 1) {
        benchmark_label = argv[1];
    } else {
        auto timestamp  = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
        benchmark_label = std::ctime(&timestamp);
        benchmark_label[strcspn(benchmark_label, "\n")] = '\0';
    }

    // clang-format off
    std::vector<AcScalReductionTestCase> scalarReductionTests {
        acCreateScalReductionTestCase("Scalar MAX"    , VTXBUF_UUX, RTYPE_MAX),
        acCreateScalReductionTestCase("Scalar MIN"    , VTXBUF_UUX, RTYPE_MIN),
        acCreateScalReductionTestCase("Scalar RMS"    , VTXBUF_UUX, RTYPE_RMS),
        acCreateScalReductionTestCase("Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP),
        acCreateScalReductionTestCase("Scalar SUM"    , VTXBUF_UUX, RTYPE_SUM)
    };

    std::vector<AcVecReductionTestCase> vectorReductionTests {
        acCreateVecReductionTestCase("Vector MAX"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
        acCreateVecReductionTestCase("Vector MIN"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
        acCreateVecReductionTestCase("Vector RMS"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
        acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS_EXP),
        acCreateVecReductionTestCase("Vector SUM"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM)
    };
    // clang-format on

    // GPU alloc & compute
    acGridInit(info);

    const size_t num_iters      = 100;
    const double nth_percentile = 0.90;
    std::vector<double> results; // ms
    Timer t;
    // Scalar benchmarks
    for (auto& testCase : scalarReductionTests) {
        results.clear();
        results.reserve(num_iters);

        for (size_t i = 0; i < num_iters; ++i) {
            acGridSynchronizeStream(STREAM_ALL);
            timer_reset(&t);
            acGridSynchronizeStream(STREAM_ALL);
            acGridReduceScal(STREAM_DEFAULT, testCase.rtype, testCase.vtxbuf, &testCase.candidate);
            acGridSynchronizeStream(STREAM_ALL);
            results.push_back(timer_diff_nsec(t) / 1e6);
            acGridSynchronizeStream(STREAM_ALL);
        }

        if (!pid) {
            std::sort(results.begin(), results.end(),
                      [](const double& a, const double& b) { return a < b; });
            fprintf(stdout,
                    "Reduction time %g ms (%gth "
                    "percentile)--------------------------------------\n",
                    results[nth_percentile * num_iters], 100 * nth_percentile);

            char path[4096] = "mpi_reduction_benchmark.csv";

            FILE* fp = fopen(path, "a");
            ERRCHK_ALWAYS(fp);

            // Format
            // benchmark label, test label, nprocs, measured (ms)
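            // e.g. a resulting row (illustrative values): "ab12cd3","Scalar MAX", 8, 0.42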
            fprintf(fp, "\"%s\",\"%s\", %d, %g\n", benchmark_label, testCase.label, nprocs,
                    results[nth_percentile * num_iters]);
            fclose(fp);
        }
    }

    // Vector benchmarks
    for (auto& testCase : vectorReductionTests) {
        results.clear();
        results.reserve(num_iters);

        for (size_t i = 0; i < num_iters; ++i) {
            acGridSynchronizeStream(STREAM_ALL);
            timer_reset(&t);
            acGridSynchronizeStream(STREAM_ALL);
            acGridReduceVec(STREAM_DEFAULT, testCase.rtype, testCase.a, testCase.b, testCase.c,
                            &testCase.candidate);
            acGridSynchronizeStream(STREAM_ALL);
            results.push_back(timer_diff_nsec(t) / 1e6);
            acGridSynchronizeStream(STREAM_ALL);
        }

        if (!pid) {
            std::sort(results.begin(), results.end(),
                      [](const double& a, const double& b) { return a < b; });
            fprintf(stdout,
                    "Reduction time %g ms (%gth "
                    "percentile)--------------------------------------\n",
                    results[nth_percentile * num_iters], 100 * nth_percentile);

            char path[4096] = "mpi_reduction_benchmark.csv";

            FILE* fp = fopen(path, "a");
            ERRCHK_ALWAYS(fp);

            // Format
            // benchmark label, test label, nprocs, measured (ms)
            fprintf(fp, "\"%s\",\"%s\", %d, %g\n", benchmark_label, testCase.label, nprocs,
                    results[nth_percentile * num_iters]);
            fclose(fp);
        }
    }
    acGridQuit();
    MPI_Finalize();
    return EXIT_SUCCESS;
}

#else
int
main(void)
{
    printf("The library was built without MPI support, cannot run mpi_reduce_bench. Rebuild "
           "Astaroth with cmake -DMPI_ENABLED=ON .. to enable.\n");
    return EXIT_FAILURE;
}
#endif // AC_MPI_ENABLED

samples/mpi_reduce_bench/mpibench.sh (new executable file, 84 lines)
@@ -0,0 +1,84 @@
#!/bin/bash

# Defaults
default_num_procs=8
default_num_nodes=2
default_partition=gpu

num_procs=$default_num_procs
num_nodes=$default_num_nodes

partition=$default_partition

script_name=$0

print_usage(){
    echo "Usage: $script_name [Options]"
    echo " Runs ./mpi_reduce_bench, which will write benchmark results to a csv file"
    echo " Remember to run this script from your build directory"
    echo " The benchmarks are submitted with sbatch, unless the -i option is passed"
    echo "Options:"
    echo " -n <num_procs>"
    echo "    number of tasks for slurm, default=$default_num_procs"
    echo " -p <partition>"
    echo "    which partition to use for slurm, default=$default_partition"
    echo " -t <tag>"
    echo "    A benchmark tag that will be added to the mpi_reduction_benchmark.csv file"
    echo "    By default the current git HEAD short hash will be used as a tag"
    echo " -i"
    echo "    Run the benchmark interactively with srun instead of sbatch"
    echo " -h"
    echo "    Print this message"
}
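
# Example invocation (assumed workflow, run from the build directory):
#   ./mpibench.sh -n 16 -p gpu -t my-feature-branch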

while getopts :n:t:p:ih opt
do
    case "$opt" in
        n)
            num_procs=$OPTARG
            num_nodes=$(( 1 + ($num_procs - 1)/4 ))
            ;;
        t)
            benchmark_label=$OPTARG
            ;;
        i)
            interactively=1
            ;;
        p)
            partition=$OPTARG
            ;;
        h)
            print_usage
            exit 0
            ;;
        ?)
            print_usage
            exit 1
    esac
done

if [ -z "$benchmark_label" ]
then
    benchmark_label=$(git rev-parse --short HEAD)
fi
set -x

if [ -z "$interactively" ]
then
    sbatch <<EOF
#!/bin/sh
#SBATCH --job-name=astaroth
#SBATCH --account=project_2000403
#SBATCH --time=00:14:59
#SBATCH --mem=48000
#SBATCH --partition=${partition}
#SBATCH --gres=gpu:v100:4
#SBATCH -n ${num_procs}
#SBATCH -N ${num_nodes}
srun ./mpi_reduce_bench ${benchmark_label}
EOF
else
    srun --account=project_2000403 --gres=gpu:v100:4 --mem=48000 -t 00:14:59 -p ${partition} -n ${num_procs} -N ${num_nodes} ./mpi_reduce_bench ${benchmark_label}
fi

@@ -25,6 +25,7 @@
#if AC_MPI_ENABLED

#include <mpi.h>
#include <vector>

int
main(void)
@@ -48,24 +49,65 @@ main(void)
    // GPU alloc & compute
    acGridInit(info);
    acGridLoadMesh(model, STREAM_DEFAULT);

    // INTEGRATION TESTS START ---------------------------------------------------------------------
    acGridLoadMesh(model, STREAM_DEFAULT);
    acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
    acGridPeriodicBoundconds(STREAM_DEFAULT);

    acGridStoreMesh(STREAM_DEFAULT, &candidate);
    acGridQuit();

    // Verify
    if (pid == 0) {
        acModelIntegrateStep(model, FLT_EPSILON);
        acMeshApplyPeriodicBounds(&model);

        acVerifyMesh(model, candidate);
    }
    // INTEGRATION TESTS END -----------------------------------------------------------------------

    // REDUCTION TESTS START -----------------------------------------------------------------------
    acGridLoadMesh(model, STREAM_DEFAULT);

    std::vector<AcScalReductionTestCase> scalarReductionTests{
        acCreateScalReductionTestCase("Scalar MAX", VTXBUF_UUX, RTYPE_MAX),
        acCreateScalReductionTestCase("Scalar MIN", VTXBUF_UUX, RTYPE_MIN),
        /*
        acCreateScalReductionTestCase("Scalar RMS", VTXBUF_UUX, RTYPE_RMS),
        acCreateScalReductionTestCase("Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP),
        acCreateScalReductionTestCase("Scalar SUM", VTXBUF_UUX, RTYPE_SUM),
        */
    };
    std::vector<AcVecReductionTestCase> vectorReductionTests{
        acCreateVecReductionTestCase("Vector MAX", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
        acCreateVecReductionTestCase("Vector MIN", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
        /*
        acCreateVecReductionTestCase("Vector RMS", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
        acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ,
                                     RTYPE_RMS_EXP),
        acCreateVecReductionTestCase("Vector SUM", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM),
        */
    };
    // False positives due to too strict error bounds, skip the tests until we can determine a
    // proper error bound
    fprintf(stderr, "WARNING: RTYPE_RMS, RTYPE_RMS_EXP, and RTYPE_SUM tests skipped\n");

    for (auto& testCase : scalarReductionTests) {
        acGridReduceScal(STREAM_DEFAULT, testCase.rtype, testCase.vtxbuf, &testCase.candidate);
    }
    for (auto& testCase : vectorReductionTests) {
        acGridReduceVec(STREAM_DEFAULT, testCase.rtype, testCase.a, testCase.b, testCase.c,
                        &testCase.candidate);
    }
    if (pid == 0) {
        acVerifyScalReductions(model, scalarReductionTests.data(), scalarReductionTests.size());
        acVerifyVecReductions(model, vectorReductionTests.data(), vectorReductionTests.size());
    }
    // REDUCTION TESTS END -------------------------------------------------------------------------

    if (pid == 0) {
        acMeshDestroy(&model);
        acMeshDestroy(&candidate);
    }

    acGridQuit();
    MPI_Finalize();
    return EXIT_SUCCESS;
}
@@ -74,7 +116,8 @@ main(void)
int
main(void)
{
    printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with cmake -DMPI_ENABLED=ON .. to enable.\n");
    printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with "
           "cmake -DMPI_ENABLED=ON .. to enable.\n");
    return EXIT_FAILURE;
}
#endif // AC_MPI_ENABLED

@@ -105,7 +105,8 @@ operator+(const int3& a, const int3& b)
    return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
}

static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
static HOST_DEVICE_INLINE int3
operator*(const int3& a, const int3& b)
{
    return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
}
@@ -144,12 +145,20 @@ operator-=(AcReal3& lhs, const AcReal3& rhs)
    lhs.z -= rhs.z;
}

static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
static HOST_DEVICE_INLINE int3
operator*(const int& a, const int3& b)
{
    return (int3){a * b.x, a * b.y, a * b.z};
}

static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal& a, const AcReal3& b)
{
    return (AcReal3){a * b.x, a * b.y, a * b.z};
}

static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal3& b, const AcReal& a)
{
    return (AcReal3){a * b.x, a * b.y, a * b.z};
}

@@ -1,10 +1,14 @@
find_package(CUDAToolkit)

## Astaroth Core
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc astaroth_fortran.cc)
target_link_libraries(astaroth_core astaroth_kernels CUDA::cudart CUDA::cuda_driver)

## Options
if (MPI_ENABLED)
    find_package(MPI)
    #find_package(MPI REQUIRED)
    #find_package(OpenMP)
    #target_link_libraries(astaroth_core MPI::MPI_CXX OpenMP::OpenMP_CXX)
    target_link_libraries(astaroth_core MPI::MPI_CXX)
endif()

@@ -158,3 +158,65 @@ acGetNode(void)
    ERRCHK_ALWAYS(num_nodes > 0);
    return nodes[0];
}

AcResult
acUpdateBuiltinParams(AcMeshInfo* config)
{
    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
    ///////////// PAD TEST
    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
    ///////////// PAD TEST
    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;

    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
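    // e.g. nx = 128 with STENCIL_ORDER = 6 gives mx = 134, nx_min = 3, nx_max = 131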
    config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
    config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
    config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
    config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
    config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;

    // These do not have to be defined by empty projects any more.
    // These should be set only if stdderiv.h is included
#ifdef AC_dsx
    config->real_params[AC_inv_dsx] = (AcReal)(1.) / config->real_params[AC_dsx];
#endif
#ifdef AC_dsy
    config->real_params[AC_inv_dsy] = (AcReal)(1.) / config->real_params[AC_dsy];
#endif
#ifdef AC_dsz
    config->real_params[AC_inv_dsz] = (AcReal)(1.) / config->real_params[AC_dsz];
#endif

    /* Additional helper params */
    // Int helpers
    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];

    return AC_SUCCESS;
}

AcResult
acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
{
    mesh->info = info;

    const size_t bytes = acVertexBufferSizeBytes(mesh->info);
    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
        mesh->vertex_buffer[w] = (AcReal*)malloc(bytes);
        ERRCHK_ALWAYS(mesh->vertex_buffer[w]);
    }

    return AC_SUCCESS;
}

AcResult
acMeshDestroy(AcMesh* mesh)
{
    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
        free(mesh->vertex_buffer[w]);

    return AC_SUCCESS;
}

src/core/astaroth_fortran.cc (new file, 119 lines)
@@ -0,0 +1,119 @@
#include "astaroth_fortran.h"
|
||||
|
||||
#include "astaroth.h"
|
||||
#include "errchk.h"
|
||||
|
||||
/**
|
||||
* Utils
|
||||
*/
|
||||
void
|
||||
acupdatebuiltinparams_(AcMeshInfo* info)
|
||||
{
|
||||
acUpdateBuiltinParams(info);
|
||||
}
|
||||
|
||||
void
|
||||
acgetdevicecount_(int* count)
|
||||
{
|
||||
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(count));
|
||||
}
|
||||
|
||||
/**
|
||||
* Device
|
||||
*/
|
||||
void
|
||||
acdevicecreate_(const int* id, const AcMeshInfo* info, Device* handle)
|
||||
{
|
||||
acDeviceCreate(*id, *info, handle);
|
||||
}
|
||||
|
||||
void
|
||||
acdevicedestroy_(Device* device)
|
||||
{
|
||||
acDeviceDestroy(*device);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceprintinfo_(const Device* device)
|
||||
{
|
||||
acDevicePrintInfo(*device);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info)
|
||||
{
|
||||
acDeviceLoadMeshInfo(*device, *info);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
|
||||
const int* num_farrays, AcReal* farray)
|
||||
{
|
||||
ERRCHK_ALWAYS(*num_farrays >= NUM_VTXBUF_HANDLES);
|
||||
WARNCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
|
||||
const size_t mxyz = info->int_params[AC_mx] * info->int_params[AC_mx] * info->int_params[AC_mx];
|
||||
|
||||
AcMesh mesh;
|
||||
mesh.info = *info;
|
||||
for (int i = 0; i < *num_farrays; ++i)
|
||||
mesh.vertex_buffer[i] = &farray[i * mxyz];
|
||||
|
||||
acDeviceLoadMesh(*device, *stream, mesh);
|
||||
}
|
||||
|
||||
void
|
||||
acdevicestoremesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
|
||||
const int* num_farrays, AcReal* farray)
|
||||
{
|
||||
ERRCHK_ALWAYS(*num_farrays >= NUM_VTXBUF_HANDLES);
|
||||
WARNCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
|
||||
AcMesh mesh;
|
||||
mesh.info = *info;
|
||||
|
||||
const size_t mxyz = info->int_params[AC_mx] * info->int_params[AC_mx] * info->int_params[AC_mx];
|
||||
for (int i = 0; i < *num_farrays; ++i)
|
||||
mesh.vertex_buffer[i] = &farray[i * mxyz];
|
||||
|
||||
acDeviceStoreMesh(*device, *stream, &mesh);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceintegratesubstep_(const Device* device, const Stream* stream, const int* step_number,
|
||||
const int3* start, const int3* end, const AcReal* dt)
|
||||
{
|
||||
acDeviceIntegrateSubstep(*device, *stream, *step_number, *start, *end, *dt);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceperiodicboundconds_(const Device* device, const Stream* stream, const int3* start,
|
||||
const int3* end)
|
||||
{
|
||||
acDevicePeriodicBoundconds(*device, *stream, *start, *end);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceswapbuffers_(const Device* device)
|
||||
{
|
||||
acDeviceSwapBuffers(*device);
|
||||
}
|
||||
|
||||
void
|
||||
acdevicereducescal_(const Device* device, const Stream* stream, const ReductionType* rtype,
|
||||
const VertexBufferHandle* vtxbuf_handle, AcReal* result)
|
||||
{
|
||||
acDeviceReduceScal(*device, *stream, *rtype, *vtxbuf_handle, result);
|
||||
}
|
||||
|
||||
void
|
||||
acdevicereducevec_(const Device* device, const Stream* stream, const ReductionType* rtype,
|
||||
const VertexBufferHandle* vtxbuf0, const VertexBufferHandle* vtxbuf1,
|
||||
const VertexBufferHandle* vtxbuf2, AcReal* result)
|
||||
{
|
||||
acDeviceReduceVec(*device, *stream, *rtype, *vtxbuf0, *vtxbuf1, *vtxbuf2, result);
|
||||
}
|
||||
|
||||
void
|
||||
acdevicesynchronizestream_(const Device* device, const Stream* stream)
|
||||
{
|
||||
acDeviceSynchronizeStream(*device, *stream);
|
||||
}
|
src/core/astaroth_fortran.h (new file, 50 lines)
@@ -0,0 +1,50 @@
#pragma once
#include "astaroth.h"

#ifdef __cplusplus
extern "C" {
#endif
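
// The symbols below follow the traditional Fortran-to-C convention (assumed
// here to match the Fortran compiler's default name mangling, e.g. gfortran's):
// lowercase names with a trailing underscore, all arguments passed by reference.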

/**
 * Utils
 */
void acupdatebuiltinparams_(AcMeshInfo* info);

void acgetdevicecount_(int* count);

/**
 * Device
 */
void acdevicecreate_(const int* id, const AcMeshInfo* info, Device* handle);

void acdevicedestroy_(Device* device);

void acdeviceprintinfo_(const Device* device);

void acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info);

void acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
                       const int* num_farrays, AcReal* farray);

void acdevicestoremesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
                        const int* num_farrays, AcReal* farray);

void acdeviceintegratesubstep_(const Device* device, const Stream* stream, const int* step_number,
                               const int3* start, const int3* end, const AcReal* dt);
void acdeviceperiodicboundconds_(const Device* device, const Stream* stream, const int3* start,
                                 const int3* end);

void acdeviceswapbuffers_(const Device* device);

void acdevicereducescal_(const Device* device, const Stream* stream, const ReductionType* rtype,
                         const VertexBufferHandle* vtxbuf_handle, AcReal* result);

void acdevicereducevec_(const Device* device, const Stream* stream, const ReductionType* rtype,
                        const VertexBufferHandle* vtxbuf0, const VertexBufferHandle* vtxbuf1,
                        const VertexBufferHandle* vtxbuf2, AcReal* result);

void acdevicesynchronizestream_(const Device* device, const Stream* stream);

#ifdef __cplusplus
} // extern "C"
#endif

src/core/device.cc (1119 lines changed; diff suppressed because it is too large)
@@ -44,7 +44,8 @@ rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
    return (AcReal3){
        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt),
    };
}

#define rk3(state_previous, state_current, rate_of_change, dt) \
@@ -192,9 +193,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
        }
    }
#if VERBOSE_PRINTING
    printf(
        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
    printf("Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f "
           "ms\n",
           best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
    /*
    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
@@ -204,6 +205,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
    */

    rk3_tpb = best_dims;

    // Failed to find valid thread block dimensions
    ERRCHK_ALWAYS(rk3_tpb.x * rk3_tpb.y * rk3_tpb.z > 0);
    return AC_SUCCESS;
}

@@ -211,6 +215,8 @@ AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
                         const int3 end, VertexBufferArray vba)
{
    ERRCHK_ALWAYS(step_number >= 0);
    ERRCHK_ALWAYS(step_number < 3);
    const dim3 tpb = rk3_tpb;

    const int3 n = end - start;

@@ -75,17 +75,20 @@ exp(const acComplex& val)
{
    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
static __device__ inline acComplex
operator*(const AcReal& a, const acComplex& b)
{
    return (acComplex){a * b.x, a * b.y};
}

static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
static __device__ inline acComplex
operator*(const acComplex& b, const AcReal& a)
{
    return (acComplex){a * b.x, a * b.y};
}

static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
static __device__ inline acComplex
operator*(const acComplex& a, const acComplex& b)
{
    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
@@ -116,7 +119,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal

    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}

@@ -141,7 +144,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal

    const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}

@@ -165,7 +168,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara

    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}

@@ -179,10 +182,10 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
    }

    if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) {
        fprintf(
            stderr,
            "WARNING: Passed an invalid value (%d, %d, %d) to device constant %s. Skipping.\n",
            value.x, value.y, value.z, int3param_names[param]);
        fprintf(stderr,
                "WARNING: Passed an invalid value (%d, %d, %d) to device constant %s. "
                "Skipping.\n",
                value.x, value.y, value.z, int3param_names[param]);
        return AC_FAILURE;
    }

@@ -229,7 +232,7 @@ acDeviceLoadDefaultUniforms(const Device device)
{
    cudaSetDevice(device->id);

    // clang-format off
    // clang-format off
    // Scalar
#define LOAD_DEFAULT_UNIFORM(X) acDeviceLoadScalarUniform(device, STREAM_DEFAULT, X, X##_DEFAULT_VALUE);
    AC_FOR_USER_REAL_PARAM_TYPES(LOAD_DEFAULT_UNIFORM)

@@ -1,9 +1,19 @@
#pragma once
#include "astaroth.h"

#if AC_MPI_ENABLED
#include <mpi.h>
#include <stdbool.h>

#define MPI_GPUDIRECT_DISABLED (0)
#endif // AC_MPI_ENABLED

typedef struct {
    int3 dims;
    AcReal* data;

    AcReal* data_pinned;
    bool pinned = false; // Set if data was received to pinned memory
} PackedData;

typedef struct {

@@ -92,8 +92,9 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr
    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);

    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)],
                                                                   src1[IDX(src_idx)],
                                                                   src2[IDX(src_idx)]);
}

template <ReduceFunc reduce>

@@ -1,3 +1,3 @@
## Astaroth Utils
add_library(astaroth_utils STATIC config_loader.c memory.c verification.c modelsolver.c)
add_library(astaroth_utils STATIC config_loader.c memory.c verification.c modelsolver.c modelreduce.c)
add_dependencies(astaroth_utils dsl_headers)

@@ -74,42 +74,6 @@ parse_config(const char* path, AcMeshInfo* config)
    fclose(fp);
}

AcResult
acUpdateBuiltinParams(AcMeshInfo* config)
{
    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
    ///////////// PAD TEST
    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
    ///////////// PAD TEST
    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;

    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
    config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
    config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
    config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
    config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
    config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;

    /*
    // DEPRECATED: Spacing TODO
    // These do not have to be defined by empty projects any more.
    // These should be set only if stdderiv.h is included
    config->real_params[AC_inv_dsx] = (AcReal)(1.) / config->real_params[AC_dsx];
    config->real_params[AC_inv_dsy] = (AcReal)(1.) / config->real_params[AC_dsy];
    config->real_params[AC_inv_dsz] = (AcReal)(1.) / config->real_params[AC_dsz];
    */

    /* Additional helper params */
    // Int helpers
    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];

    return AC_SUCCESS;
}

/**
\brief Loads data from astaroth.conf into a config struct.
\return AC_SUCCESS on success, AC_FAILURE if there are potentially uninitialized values.

@@ -20,29 +20,6 @@

#include "errchk.h"

AcResult
acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
{
    mesh->info = info;

    const size_t bytes = acVertexBufferSizeBytes(mesh->info);
    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
        mesh->vertex_buffer[w] = malloc(bytes);
        ERRCHK_ALWAYS(mesh->vertex_buffer[w]);
    }

    return AC_SUCCESS;
}

AcResult
acMeshDestroy(AcMesh* mesh)
{
    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
        free(mesh->vertex_buffer[w]);

    return AC_SUCCESS;
}

AcResult
acMeshSet(const AcReal value, AcMesh* mesh)
{

src/utils/modelreduce.c (new file, 209 lines)
@@ -0,0 +1,209 @@
/*
    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Model (CPU) implementations of the reductions, used for verification.
 */
#include "astaroth.h"

#include <math.h>

#include "errchk.h"

#if AC_DOUBLE_PRECISION == 0 // HACK TODO fix, make cleaner (purkkaratkaisu)
#define fabs fabsf
#define exp expf
#define sqrt sqrtf
#endif

// Function pointer definitions
typedef AcReal (*ReduceFunc)(const AcReal, const AcReal);
typedef AcReal (*ReduceInitialScalFunc)(const AcReal);
typedef AcReal (*ReduceInitialVecFunc)(const AcReal, const AcReal, const AcReal);

// clang-format off
/* Comparison funcs */
static inline AcReal
max(const AcReal a, const AcReal b) { return a > b ? a : b; }

static inline AcReal
min(const AcReal a, const AcReal b) { return a < b ? a : b; }

static inline AcReal
sum(const AcReal a, const AcReal b) { return a + b; }

/* Function used to determine the values used during reduction */
static inline AcReal
length_scal(const AcReal a) { return (AcReal)(a); }

static inline AcReal
length_vec(const AcReal a, const AcReal b, const AcReal c) { return sqrt(a*a + b*b + c*c); }

static inline AcReal
squared_scal(const AcReal a) { return (AcReal)(a*a); }

static inline AcReal
squared_vec(const AcReal a, const AcReal b, const AcReal c) { return squared_scal(a) + squared_scal(b) + squared_scal(c); }

static inline AcReal
exp_squared_scal(const AcReal a) { return exp(a)*exp(a); }

static inline AcReal
exp_squared_vec(const AcReal a, const AcReal b, const AcReal c) { return exp_squared_scal(a) + exp_squared_scal(b) + exp_squared_scal(c); }
// clang-format on
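
// The model reductions compose the helpers above: res = reduce(res,
// reduce_initial(x)) over the computational domain, then sqrt(res / nxyz) if
// solve_mean is set. E.g. RTYPE_RMS pairs squared_scal with sum, yielding
// sqrt(sum(x^2) / n), the root mean square.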

AcReal
acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a)
{
    ReduceInitialScalFunc reduce_initial;
    ReduceFunc reduce;

    bool solve_mean = false;

    switch (rtype) {
    case RTYPE_MAX:
        reduce_initial = length_scal;
        reduce         = max;
        break;
    case RTYPE_MIN:
        reduce_initial = length_scal;
        reduce         = min;
        break;
    case RTYPE_RMS:
        reduce_initial = squared_scal;
        reduce         = sum;
        solve_mean     = true;
        break;
    case RTYPE_RMS_EXP:
        reduce_initial = exp_squared_scal;
        reduce         = sum;
        solve_mean     = true;
        break;
    case RTYPE_SUM:
        reduce_initial = length_scal;
        reduce         = sum;
        break;
    default:
        ERROR("Unrecognized RTYPE");
    }

    const int initial_idx = acVertexBufferIdx(mesh.info.int_params[AC_nx_min],
                                              mesh.info.int_params[AC_ny_min],
                                              mesh.info.int_params[AC_nz_min], mesh.info);

    AcReal res;
    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
        res = reduce_initial(mesh.vertex_buffer[a][initial_idx]);
    else
        res = 0;

    for (int k = mesh.info.int_params[AC_nz_min]; k < mesh.info.int_params[AC_nz_max]; ++k) {
        for (int j = mesh.info.int_params[AC_ny_min]; j < mesh.info.int_params[AC_ny_max]; ++j) {
            for (int i = mesh.info.int_params[AC_nx_min]; i < mesh.info.int_params[AC_nx_max];
                 ++i) {
                const int idx         = acVertexBufferIdx(i, j, k, mesh.info);
                const AcReal curr_val = reduce_initial(mesh.vertex_buffer[a][idx]);
                res                   = reduce(res, curr_val);
            }
        }
    }

    if (solve_mean) {
        const AcReal inv_n = (AcReal)1.0 / mesh.info.int_params[AC_nxyz];
        return sqrt(inv_n * res);
    }
    else {
        return res;
    }
}

AcReal
acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a,
                 const VertexBufferHandle b, const VertexBufferHandle c)
{
    // AcReal (*reduce_initial)(AcReal, AcReal, AcReal);
    ReduceInitialVecFunc reduce_initial;
    ReduceFunc reduce;

    bool solve_mean = false;

    switch (rtype) {
    case RTYPE_MAX:
        reduce_initial = length_vec;
        reduce         = max;
        break;
    case RTYPE_MIN:
        reduce_initial = length_vec;
        reduce         = min;
        break;
    case RTYPE_RMS:
        reduce_initial = squared_vec;
        reduce         = sum;
        solve_mean     = true;
        break;
    case RTYPE_RMS_EXP:
        reduce_initial = exp_squared_vec;
        reduce         = sum;
        solve_mean     = true;
        break;
    case RTYPE_SUM:
        reduce_initial = length_vec;
        reduce         = sum;
        break;
    default:
        ERROR("Unrecognized RTYPE");
    }

    const int initial_idx = acVertexBufferIdx(mesh.info.int_params[AC_nx_min],
                                              mesh.info.int_params[AC_ny_min],
                                              mesh.info.int_params[AC_nz_min], mesh.info);

    AcReal res;
    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
        res = reduce_initial(mesh.vertex_buffer[a][initial_idx], mesh.vertex_buffer[b][initial_idx],
                             mesh.vertex_buffer[c][initial_idx]);
    else
        res = 0;

    for (int k = mesh.info.int_params[AC_nz_min]; k < mesh.info.int_params[AC_nz_max]; k++) {
        for (int j = mesh.info.int_params[AC_ny_min]; j < mesh.info.int_params[AC_ny_max]; j++) {
            for (int i = mesh.info.int_params[AC_nx_min]; i < mesh.info.int_params[AC_nx_max];
                 i++) {
                const int idx         = acVertexBufferIdx(i, j, k, mesh.info);
                const AcReal curr_val = reduce_initial(mesh.vertex_buffer[a][idx],
                                                       mesh.vertex_buffer[b][idx],
                                                       mesh.vertex_buffer[c][idx]);
                res                   = reduce(res, curr_val);
            }
        }
    }

    if (solve_mean) {
        const AcReal inv_n = (AcReal)1.0 / mesh.info.int_params[AC_nxyz];
        return sqrt(inv_n * res);
    }
    else {
        return res;
    }
}

@@ -38,9 +38,10 @@
#define LMAGNETIC (1)
#define LENTROPY (1)
#define LTEMPERATURE (0)
#define LFORCING (0)
#define LFORCING (1)
#define LUPWD (1)
#define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter
#define R_PI ((Scalar)M_PI)

typedef AcReal Scalar;
// typedef AcReal3 Vector;
@@ -54,6 +55,8 @@ typedef float Vector __attribute__((vector_size(4 * sizeof(float))));
#define fabs fabsf
#define exp expf
#define sqrt sqrtf
#define cos cosf
#define sin sinf
#endif

typedef struct {
@@ -103,11 +106,16 @@ first_derivative(const Scalar* pencil, const Scalar inv_ds)
#elif STENCIL_ORDER == 4
    const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)};
#elif STENCIL_ORDER == 6
    const Scalar coefficients[] = {0, (Scalar)(3.0 / 4.0), (Scalar)(-3.0 / 20.0),
                                   (Scalar)(1.0 / 60.0)};
    const Scalar coefficients[] = {
        0,
        (Scalar)(3.0 / 4.0),
        (Scalar)(-3.0 / 20.0),
        (Scalar)(1.0 / 60.0),
    };
#elif STENCIL_ORDER == 8
    const Scalar coefficients[] = {0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0),
                                   (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0)};
    const Scalar coefficients[] = {
        0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0),
    };
#endif

#define MID (STENCIL_ORDER / 2)
@@ -126,15 +134,23 @@ second_derivative(const Scalar* pencil, const Scalar inv_ds)
#if STENCIL_ORDER == 2
    const Scalar coefficients[] = {-2, 1};
#elif STENCIL_ORDER == 4
    const Scalar coefficients[] = {(Scalar)(-5.0 / 2.0), (Scalar)(4.0 / 3.0),
                                   (Scalar)(-1.0 / 12.0)};
    const Scalar coefficients[] = {
        (Scalar)(-5.0 / 2.0),
        (Scalar)(4.0 / 3.0),
        (Scalar)(-1.0 / 12.0),
    };
#elif STENCIL_ORDER == 6
    const Scalar coefficients[] = {(Scalar)(-49.0 / 18.0), (Scalar)(3.0 / 2.0),
                                   (Scalar)(-3.0 / 20.0), (Scalar)(1.0 / 90.0)};
    const Scalar coefficients[] = {
        (Scalar)(-49.0 / 18.0),
        (Scalar)(3.0 / 2.0),
        (Scalar)(-3.0 / 20.0),
        (Scalar)(1.0 / 90.0),
    };
#elif STENCIL_ORDER == 8
    const Scalar coefficients[] = {(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0),
                                   (Scalar)(-1.0 / 5.0), (Scalar)(8.0 / 315.0),
                                   (Scalar)(-1.0 / 560.0)};
    const Scalar coefficients[] = {
        (Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0), (Scalar)(-1.0 / 5.0),
        (Scalar)(8.0 / 315.0),   (Scalar)(-1.0 / 560.0),
    };
#endif

#define MID (STENCIL_ORDER / 2)
@@ -156,16 +172,27 @@ cross_derivative(const Scalar* pencil_a, const Scalar* pencil_b, const Scalar in
    const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)};
#elif STENCIL_ORDER == 4
    const Scalar coefficients[] = {
        0, (Scalar)(1.0 / 32.0),
        (Scalar)(1.0 / 64.0)}; // TODO correct coefficients, these are just placeholders
        0,
        (Scalar)(1.0 / 32.0),
        (Scalar)(1.0 / 64.0),
    }; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6
    const Scalar fac            = ((Scalar)(1. / 720.));
    const Scalar coefficients[] = {0 * fac, (Scalar)(270.0) * fac, (Scalar)(-27.0) * fac,
                                   (Scalar)(2.0) * fac};
    const Scalar coefficients[] = {
        0 * fac,
        (Scalar)(270.0) * fac,
        (Scalar)(-27.0) * fac,
        (Scalar)(2.0) * fac,
    };
#elif STENCIL_ORDER == 8
    const Scalar fac            = ((Scalar)(1. / 20160.));
    const Scalar coefficients[] = {0 * fac, (Scalar)(8064.) * fac, (Scalar)(-1008.) * fac,
                                   (Scalar)(128.) * fac, (Scalar)(-9.) * fac};
    const Scalar coefficients[] = {
        0 * fac,
        (Scalar)(8064.) * fac,
        (Scalar)(-1008.) * fac,
        (Scalar)(128.) * fac,
        (Scalar)(-9.) * fac,
    };
#endif

#define MID (STENCIL_ORDER / 2)
@@ -207,14 +234,14 @@ derxy(const int i, const int j, const int k, const Scalar* arr)
    Scalar pencil_a[STENCIL_ORDER + 1];
    //#pragma unroll
    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
        pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + offset - STENCIL_ORDER / 2,
                                   k)];
        pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
                                   j + offset - STENCIL_ORDER / 2, k)];

    Scalar pencil_b[STENCIL_ORDER + 1];
    //#pragma unroll
    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
        pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + STENCIL_ORDER / 2 - offset,
                                   k)];
        pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
                                   j + STENCIL_ORDER / 2 - offset, k)];

    return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy));
}
@@ -539,7 +566,8 @@ gradient_of_divergence(const VectorData vec)
    return (Vector){
        hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2],
        hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2],
        hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2]};
        hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2],
    };
}

// Takes uu gradients and returns S
@@ -777,9 +805,9 @@ Vector
helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vector ff_im, Scalar phi)
{
    (void)magnitude; // WARNING: unused
    xx[0] = xx[0] * (2.0 * M_PI / (getReal(AC_dsx) * getInt(AC_nx)));
    xx[1] = xx[1] * (2.0 * M_PI / (getReal(AC_dsy) * getInt(AC_ny)));
    xx[2] = xx[2] * (2.0 * M_PI / (getReal(AC_dsz) * getInt(AC_nz)));
    xx[0] = xx[0] * ((Scalar)2.0 * R_PI / (getReal(AC_dsx) * getInt(AC_nx)));
    xx[1] = xx[1] * ((Scalar)2.0 * R_PI / (getReal(AC_dsy) * getInt(AC_ny)));
    xx[2] = xx[2] * ((Scalar)2.0 * R_PI / (getReal(AC_dsz) * getInt(AC_nz)));

    Scalar cos_phi = cos(phi);
    Scalar sin_phi = sin(phi);
@@ -805,10 +833,11 @@ forcing(int3 globalVertexIdx, Scalar dt)
                             getInt(AC_ny) * getReal(AC_dsy),
                             getInt(AC_nz) * getReal(AC_dsz)}; // source (origin)
    (void)a; // WARNING: not used
    Vector xx = (Vector){(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
                         (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
                         (globalVertexIdx.z - getInt(AC_nz_min)) *
                             getReal(AC_dsz)}; // sink (current index)
    Vector xx = (Vector){
        (globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
        (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
        (globalVertexIdx.z - getInt(AC_nz_min)) * getReal(AC_dsz),
    }; // sink (current index)
    const Scalar cs2 = getReal(AC_cs2_sound);
    const Scalar cs  = sqrt(cs2);

@@ -2,6 +2,7 @@

#include <math.h>
#include <stdbool.h>
#include <string.h>

#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))
@@ -18,25 +19,14 @@
#define WHT "\x1B[37m"
#define RESET "\x1B[0m"

typedef struct {
    VertexBufferHandle handle;
    AcReal model;
    AcReal candidate;
    long double abs_error;
    long double ulp_error;
    long double rel_error;
    AcReal maximum_magnitude;
    AcReal minimum_magnitude;
} Error;

static inline bool
is_valid(const AcReal a)
{
    return !isnan(a) && !isinf(a);
}

static Error
get_error(AcReal model, AcReal candidate)
Error
acGetError(AcReal model, AcReal candidate)
{
    Error error;
    error.abs_error = 0;
@@ -109,13 +99,15 @@ get_max_abs_error(const VertexBufferHandle vtxbuf_handle, const AcMesh model_mes

    for (size_t i = 0; i < acVertexBufferSize(model_mesh.info); ++i) {

        Error curr_error = get_error(model_vtxbuf[i], candidate_vtxbuf[i]);
        Error curr_error = acGetError(model_vtxbuf[i], candidate_vtxbuf[i]);

        if (curr_error.abs_error > error.abs_error)
            error = curr_error;
    }

    error.handle = vtxbuf_handle;
    error.handle = vtxbuf_handle;
    strncpy(error.label, vtxbuf_names[vtxbuf_handle], ERROR_LABEL_LENGTH - 1);
    error.label[ERROR_LABEL_LENGTH - 1] = '\0';
    error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
    error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);

@@ -147,12 +139,12 @@ is_acceptable(const Error error)
    return false;
}

static bool
print_error_to_screen(const Error error)
bool
printErrorToScreen(const Error error)
{
    bool errors_found = false;

    printf("\t%-15s... ", vtxbuf_names[error.handle]);
    printf("\t%-15s... ", error.label);
    if (is_acceptable(error)) {
        printf(GRN "OK! " RESET);
    }
@@ -177,14 +169,87 @@ acVerifyMesh(const AcMesh model, const AcMesh candidate)
    bool errors_found = false;
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        Error field_error = get_max_abs_error(i, model, candidate);
        errors_found |= print_error_to_screen(field_error);
        errors_found |= printErrorToScreen(field_error);
    }

    printf("%s\n", errors_found ? "Failure. Found errors in one or more vertex buffers"
                                : "Success. No errors found.");

    if (errors_found)
        return AC_FAILURE;
    else
        return AC_SUCCESS;
    return errors_found ? AC_FAILURE : AC_SUCCESS;
}

/** Verification function for scalar reductions */
AcResult
acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases,
                       const size_t numCases)
{
    printf("\nTesting scalar reductions:\n");

    bool errors_found = false;
    for (size_t i = 0; i < numCases; i++) {
        AcReal model_reduction = acModelReduceScal(model, testCases[i].rtype, testCases[i].vtxbuf);
        Error error            = acGetError(model_reduction, testCases[i].candidate);
        strncpy(error.label, testCases[i].label, ERROR_LABEL_LENGTH - 1);
        error.label[ERROR_LABEL_LENGTH - 1] = '\0';
        errors_found |= printErrorToScreen(error);
    }
    printf("%s\n", errors_found ? "Failure. Found errors in one or more scalar reductions"
                                : "Success. No errors found.");

    return errors_found ? AC_FAILURE : AC_SUCCESS;
}

/** Verification function for vector reductions */
AcResult
acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases,
                      const size_t numCases)
{
    printf("\nTesting vector reductions:\n");

    bool errors_found = false;
    for (size_t i = 0; i < numCases; i++) {
        AcReal model_reduction = acModelReduceVec(model, testCases[i].rtype, testCases[i].a,
                                                  testCases[i].b, testCases[i].c);
        Error error            = acGetError(model_reduction, testCases[i].candidate);
        strncpy(error.label, testCases[i].label, ERROR_LABEL_LENGTH - 1);
        error.label[ERROR_LABEL_LENGTH - 1] = '\0';
        errors_found |= printErrorToScreen(error);
    }
    printf("%s\n", errors_found ? "Failure. Found errors in one or more vector reductions"
                                : "Success. No errors found.");

    return errors_found ? AC_FAILURE : AC_SUCCESS;
}

/** Constructor for a scalar reduction test case */
AcScalReductionTestCase
acCreateScalReductionTestCase(const char* label, const VertexBufferHandle vtxbuf,
                              const ReductionType rtype)
{
    AcScalReductionTestCase testCase;

    strncpy(testCase.label, label, ERROR_LABEL_LENGTH - 1);
    testCase.label[ERROR_LABEL_LENGTH - 1] = '\0';
    testCase.vtxbuf    = vtxbuf;
    testCase.rtype     = rtype;
    testCase.candidate = 0;

    return testCase;
}

/** Constructor for a vector reduction test case */
AcVecReductionTestCase
acCreateVecReductionTestCase(const char* label, const VertexBufferHandle a,
                             const VertexBufferHandle b, const VertexBufferHandle c,
                             const ReductionType rtype)
{
    AcVecReductionTestCase testCase;

    strncpy(testCase.label, label, ERROR_LABEL_LENGTH - 1);
    testCase.label[ERROR_LABEL_LENGTH - 1] = '\0';
    testCase.a         = a;
    testCase.b         = b;
    testCase.c         = c;
    testCase.rtype     = rtype;
    testCase.candidate = 0;

    return testCase;
}