Merge branch 'master' into sink_20190723

Hopefully the merge issues were resolved.
2019-09-02 11:58:48 +08:00
parent 083ff59ed1 c0ba08133d
commit 6eeb225924
29 changed files with 1364 additions and 592 deletions
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -36,6 +36,21 @@ const char* vtxbuf_names[]     = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
 static const int num_nodes = 1;
 static Node nodes[num_nodes];

+void
+acPrintMeshInfo(const AcMeshInfo config)
+{
+    for (int i = 0; i < NUM_INT_PARAMS; ++i)
+        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
+    for (int i = 0; i < NUM_INT3_PARAMS; ++i)
+        printf("[%s]: (%d, %d, %d)\n", int3param_names[i], config.int3_params[i].x,
+               config.int3_params[i].y, config.int3_params[i].z);
+    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
+        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
+    for (int i = 0; i < NUM_REAL3_PARAMS; ++i)
+        printf("[%s]: (%g, %g, %g)\n", real3param_names[i], double(config.real3_params[i].x),
+               double(config.real3_params[i].y), double(config.real3_params[i].z));
+}
+
 AcResult
 acInit(const AcMeshInfo mesh_info)
 {
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -39,35 +39,54 @@ typedef struct {
    AcReal* out[NUM_VTXBUF_HANDLES];
 } VertexBufferArray;

+struct device_s {
+    int id;
+    AcMeshInfo local_config;
+
+    // Concurrency
+    cudaStream_t streams[NUM_STREAM_TYPES];
+
+    // Memory
+    VertexBufferArray vba;
+    AcReal* reduce_scratchpad;
+    AcReal* reduce_result;
+
+#if PACKED_DATA_TRANSFERS
+// Declare memory for buffers needed for packed data transfers here
+// AcReal* data_packing_buffer;
+#endif
+};
+
 __constant__ AcMeshInfo d_mesh_info;
-static inline int __device__
+static int __device__ __forceinline__
 DCONST(const AcIntParam param)
 {
    return d_mesh_info.int_params[param];
 }
-static inline int3 __device__
+static int3 __device__ __forceinline__
 DCONST(const AcInt3Param param)
 {
    return d_mesh_info.int3_params[param];
 }
-static inline AcReal __device__
+static AcReal __device__ __forceinline__
 DCONST(const AcRealParam param)
 {
    return d_mesh_info.real_params[param];
 }
-static inline AcReal3 __device__
+static AcReal3 __device__ __forceinline__
 DCONST(const AcReal3Param param)
 {
    return d_mesh_info.real3_params[param];
 }
+constexpr VertexBufferHandle
+DCONST(const VertexBufferHandle handle)
+{
+    return handle;
+}
 #define DCONST_INT(x) DCONST(x)
 #define DCONST_INT3(x) DCONST(x)
 #define DCONST_REAL(x) DCONST(x)
 #define DCONST_REAL3(x) DCONST(x)
-//#define DCONST_INT(X) (d_mesh_info.int_params[X])
-//#define DCONST_INT3(X) (d_mesh_info.int3_params[X])
-//#define DCONST_REAL(X) (d_mesh_info.real_params[X])
-//#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
 #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
 #define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
@@ -88,26 +107,8 @@ static dim3 rk3_tpb(32, 1, 4);
 // #include "kernels/pack_unpack.cuh"
 #endif

-struct device_s {
-    int id;
-    AcMeshInfo local_config;
-
-    // Concurrency
-    cudaStream_t streams[NUM_STREAM_TYPES];
-
-    // Memory
-    VertexBufferArray vba;
-    AcReal* reduce_scratchpad;
-    AcReal* reduce_result;
-
-#if PACKED_DATA_TRANSFERS
-// Declare memory for buffers needed for packed data transfers here
-// AcReal* data_packing_buffer;
-#endif
-};
-
 // clang-format off
-static __global__ void dummy_kernel(void) {}
+static __global__ void dummy_kernel(void) { DCONST((AcIntParam)0); DCONST((AcInt3Param)0); DCONST((AcRealParam)0); DCONST((AcReal3Param)0); }
 // clang-format on

 AcResult
@@ -153,8 +154,7 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
 #endif

    // Device constants
-    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
-                                          cudaMemcpyHostToDevice));
+    acDeviceLoadMeshInfo(device, STREAM_DEFAULT, device_config);

    printf("Created device %d (%p)\n", device->id, device);
    *device_handle = device;
@@ -303,8 +303,9 @@ acDeviceAutoOptimize(const Device device)

                cudaEventRecord(tstart); // ---------------------------------------- Timing start

+                acDeviceLoadScalarConstant(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON);
                for (int i = 0; i < num_iterations; ++i)
-                    solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
+                    solve<2><<<bpg, tpb>>>(start, end, device->vba);

                cudaEventRecord(tstop); // ----------------------------------------- Timing end
                cudaEventSynchronize(tstop);
@@ -361,8 +362,8 @@ acDeviceSwapBuffers(const Device device)
 }

 AcResult
-acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
-                     const AcReal value)
+acDeviceLoadScalarConstant(const Device device, const Stream stream, const AcRealParam param,
+                           const AcReal value)
 {
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
@@ -371,6 +372,55 @@ acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam
    return AC_SUCCESS;
 }

+AcResult
+acDeviceLoadVectorConstant(const Device device, const Stream stream, const AcReal3Param param,
+                           const AcReal3 value)
+{
+    cudaSetDevice(device->id);
+    const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadIntConstant(const Device device, const Stream stream, const AcIntParam param,
+                        const int value)
+{
+    cudaSetDevice(device->id);
+    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadInt3Constant(const Device device, const Stream stream, const AcInt3Param param,
+                         const int3 value)
+{
+    cudaSetDevice(device->id);
+    const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadMeshInfo(const Device device, const Stream stream, const AcMeshInfo device_config)
+{
+    cudaSetDevice(device->id);
+
+    ERRCHK_ALWAYS(device_config.int_params[AC_nx] == device->local_config.int_params[AC_nx]);
+    ERRCHK_ALWAYS(device_config.int_params[AC_ny] == device->local_config.int_params[AC_ny]);
+    ERRCHK_ALWAYS(device_config.int_params[AC_nz] == device->local_config.int_params[AC_nz]);
+    ERRCHK_ALWAYS(device_config.int_params[AC_multigpu_offset] ==
+                  device->local_config.int_params[AC_multigpu_offset]);
+
+    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &device_config, sizeof(device_config),
+                                               0, cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
 AcResult
 acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
                                   const VertexBufferHandle vtxbuf_handle, const int3 src,
@@ -551,12 +601,13 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste
                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                   (unsigned int)ceil(n.z / AcReal(tpb.z)));

+    acDeviceLoadScalarConstant(device, stream, AC_dt, dt);
    if (step_number == 0)
-        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
    else if (step_number == 1)
-        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
    else
-        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
+        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);

    ERRCHK_CUDA_KERNEL();

--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -26,6 +26,8 @@
 */
 #pragma once

+#include "src/core/math_utils.h"
+
 #include <assert.h>

 static __device__ __forceinline__ int
@@ -321,65 +323,6 @@ read_data(const int i, const int j, const int k,
 * =============================================================================
 */

-static __host__ __device__ __forceinline__ AcReal3
-operator-(const AcReal3& a, const AcReal3& b)
-{
-    return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-operator+(const AcReal3& a, const AcReal3& b)
-{
-    return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-operator-(const AcReal3& a)
-{
-    return (AcReal3){-a.x, -a.y, -a.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal3 operator*(const AcReal a, const AcReal3& b)
-{
-    return (AcReal3){a * b.x, a * b.y, a * b.z};
-}
-
-static __host__ __device__ __forceinline__ AcReal
-dot(const AcReal3& a, const AcReal3& b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-mul(const AcMatrix& aa, const AcReal3& x)
-{
-    return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
-}
-
-static __host__ __device__ __forceinline__ AcReal3
-cross(const AcReal3& a, const AcReal3& b)
-{
-    AcReal3 c;
-
-    c.x = a.y * b.z - a.z * b.y;
-    c.y = a.z * b.x - a.x * b.z;
-    c.z = a.x * b.y - a.y * b.x;
-
-    return c;
-}
-
-static __host__ __device__ __forceinline__ bool
-is_valid(const AcReal& a)
-{
-    return !isnan(a) && !isinf(a);
-}
-
-static __host__ __device__ __forceinline__ bool
-is_valid(const AcReal3& a)
-{
-    return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
-}
-
 /*
 * =============================================================================
 * Level 1 (Stencil Processing Stage)
@@ -642,4 +585,54 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
                                                                                                   \
    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);

+// clang-format off
+#define GEN_DEVICE_FUNC_HOOK(identifier)                                                           \
+    template <int step_number>                                                                     \
+    AcResult acDeviceKernel_##identifier(const Device device, const Stream stream,                 \
+                                         const int3 start, const int3 end)                         \
+    {                                                                                              \
+        cudaSetDevice(device->id);                                                                 \
+                                                                                                   \
+        const dim3 tpb(32, 1, 4);                                                                  \
+                                                                                                   \
+        const int3 n = end - start;                                                                \
+        const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)),                                    \
+                       (unsigned int)ceil(n.y / AcReal(tpb.y)),                                    \
+                       (unsigned int)ceil(n.z / AcReal(tpb.z)));                                   \
+                                                                                                   \
+        identifier<step_number>                                                                    \
+            <<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);                   \
+        ERRCHK_CUDA_KERNEL();                                                                      \
+                                                                                                   \
+        return AC_SUCCESS;                                                                         \
+    }
+
+/*
+#define GEN_NODE_FUNC_HOOK(identifier)                                                             \
+    template <int step_number>                                                                     \
+    AcResult acNodeKernel_##identifier(const Node node, const Stream stream, const int3 start,     \
+                                       const int3 end)                                             \
+    {                                                                                              \
+        acNodeSynchronizeStream(node, stream);                                                     \
+                                                                                                   \
+        for (int i = 0; i < node->num_devices; ++i) {                                              \
+                                                                                                   \
+            const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z};                \
+            const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z};  \
+                                                                                                   \
+            const int3 da = max(start, d0);                                                        \
+            const int3 db = min(end, d1);                                                          \
+                                                                                                   \
+            if (db.z >= da.z) {                                                                    \
+                const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z};                    \
+                const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z};                    \
+                acDeviceKernel_ #identifier(node->devices[i], stream, isubstep, da_local,          \
+                                            db_local, dt);                                         \
+            }                                                                                      \
+        }                                                                                          \
+        return AC_SUCCESS;                                                                         \
+    }
+    */
+// clang-format on
+
 #include "stencil_process.cuh"
--- a/src/core/math_utils.h
+++ b/src/core/math_utils.h
@@ -25,10 +25,10 @@
 *
 */
 #pragma once
-#include <cmath>
-using namespace std; // Potentially bad practice to declare namespace std here
-// #include <math.h>   // isnan, isinf // Overloads incorrect/bugged with GCC <= 6.0
-// #include <tgmath.h> // Even this does not work
+//#include <cmath>
+// using namespace std; // Potentially bad practice to declare namespace std here
+#include <math.h> // isnan, isinf // Overloads incorrect/bugged with GCC <= 6.0
+//#include <tgmath.h> // Even this does not work
 #include <stdlib.h> // rand

 template <class T>
@@ -64,16 +64,6 @@ sum(const T& a, const T& b)
    return a + b;
 }

-template <class T>
-static inline const T
-is_valid(const T& val)
-{
-    if (isnan(val) || isinf(val))
-        return false;
-    else
-        return true;
-}
-
 template <class T>
 static inline const T
 clamp(const T& val, const T& min, const T& max)
@@ -87,20 +77,85 @@ randr()
    return AcReal(rand()) / AcReal(RAND_MAX);
 }

-static inline int3
-operator+(const int3& a, const int3& b)
-{
-    return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
-}
-
-static inline int3
-operator-(const int3& a, const int3& b)
-{
-    return (int3){a.x - b.x, a.y - b.y, a.z - b.z};
-}
-
 static inline bool
 is_power_of_two(const unsigned val)
 {
    return val && !(val & (val - 1));
 }
+
+#ifdef __CUDACC__
+#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+#else
+#define HOST_DEVICE_INLINE inline
+#endif // __CUDACC__
+
+static HOST_DEVICE_INLINE AcReal3
+operator+(const AcReal3& a, const AcReal3& b)
+{
+    return (AcReal3){a.x + b.x, a.y + b.y, a.z + b.z};
+}
+
+static HOST_DEVICE_INLINE int3
+operator+(const int3& a, const int3& b)
+{
+    return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
+}
+
+static HOST_DEVICE_INLINE AcReal3
+operator-(const AcReal3& a, const AcReal3& b)
+{
+    return (AcReal3){a.x - b.x, a.y - b.y, a.z - b.z};
+}
+
+static HOST_DEVICE_INLINE int3
+operator-(const int3& a, const int3& b)
+{
+    return (int3){a.x - b.x, a.y - b.y, a.z - b.z};
+}
+
+static HOST_DEVICE_INLINE AcReal3
+operator-(const AcReal3& a)
+{
+    return (AcReal3){-a.x, -a.y, -a.z};
+}
+
+static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
+{
+    return (AcReal3){a * b.x, a * b.y, a * b.z};
+}
+
+static HOST_DEVICE_INLINE AcReal
+dot(const AcReal3& a, const AcReal3& b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+static HOST_DEVICE_INLINE AcReal3
+mul(const AcMatrix& aa, const AcReal3& x)
+{
+    return (AcReal3){dot(aa.row[0], x), dot(aa.row[1], x), dot(aa.row[2], x)};
+}
+
+static HOST_DEVICE_INLINE AcReal3
+cross(const AcReal3& a, const AcReal3& b)
+{
+    AcReal3 c;
+
+    c.x = a.y * b.z - a.z * b.y;
+    c.y = a.z * b.x - a.x * b.z;
+    c.z = a.x * b.y - a.y * b.x;
+
+    return c;
+}
+
+static HOST_DEVICE_INLINE bool
+is_valid(const AcReal a)
+{
+    return !isnan(a) && !isinf(a);
+}
+
+static HOST_DEVICE_INLINE bool
+is_valid(const AcReal3& a)
+{
+    return is_valid(a.x) && is_valid(a.y) && is_valid(a.z);
+}
--- a/src/core/node.cu
+++ b/src/core/node.cu
@@ -429,7 +429,7 @@ acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param
    acNodeSynchronizeStream(node, stream);
    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
-        acDeviceLoadConstant(node->devices[i], stream, param, value);
+        acDeviceLoadScalarConstant(node->devices[i], stream, param, value);
    }
    return AC_SUCCESS;
 }
--- a/src/mpitest/CMakeLists.txt
+++ b/src/mpitest/CMakeLists.txt
@@ -8,5 +8,5 @@ set(CMAKE_C_STANDARD_REQUIRED ON)
 find_package(MPI REQUIRED)

 add_executable(mpitest main.c)
-target_include_directories(mpitest PRIVATE ${MPI_C_INCLUDE_PATH})
-target_link_libraries(mpitest PRIVATE ${MPI_C_LIBRARIES} astaroth_core)
+target_include_directories(mpitest PRIVATE ${CMAKE_SOURCE_DIR}/src/standalone ${MPI_C_INCLUDE_PATH})
+target_link_libraries(mpitest astaroth_core astaroth_standalone ${MPI_C_LIBRARIES})
--- a/src/mpitest/main.c
+++ b/src/mpitest/main.c
@@ -16,13 +16,120 @@
    You should have received a copy of the GNU General Public License
    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
 */
+/**
+    Running: mpirun -np <num processes> <executable>
+*/
+#undef NDEBUG // Assert always
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>

 #include "astaroth.h"
+#include "autotest.h"

 #include <mpi.h>

+// From Astaroth Standalone
+#include "config_loader.h"
+#include "model/host_memory.h"
+
+static void
+distribute_mesh(const AcMesh* src, AcMesh* dst)
+{
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int process_id, num_processes;
+    MPI_Comm_rank(MPI_COMM_WORLD, &process_id);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
+
+    const size_t count = acVertexBufferSize(dst->info);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+
+        // Communicate to self
+        if (process_id == 0) {
+            assert(src);
+            assert(dst);
+            memcpy(&dst->vertex_buffer[i][0], //
+                   &src->vertex_buffer[i][0], //
+                   count * sizeof(src->vertex_buffer[i][0]));
+        }
+        // Communicate to others
+        for (int j = 1; j < num_processes; ++j) {
+            if (process_id == 0) {
+                assert(src);
+
+                // Send
+                // TODO RECHECK THESE j INDICES
+                const size_t src_idx = j * dst->info.int_params[AC_mx] *
+                                       dst->info.int_params[AC_my] * src->info.int_params[AC_nz] /
+                                       num_processes;
+
+                MPI_Send(&src->vertex_buffer[i][src_idx], count, datatype, j, 0, MPI_COMM_WORLD);
+            }
+            else {
+                assert(dst);
+
+                // Recv
+                const size_t dst_idx = 0;
+                MPI_Status status;
+                MPI_Recv(&dst->vertex_buffer[i][dst_idx], count, datatype, 0, 0, MPI_COMM_WORLD,
+                         &status);
+            }
+        }
+    }
+}
+
+static void
+gather_mesh(const AcMesh* src, AcMesh* dst)
+{
+    MPI_Datatype datatype = MPI_FLOAT;
+    if (sizeof(AcReal) == 8)
+        datatype = MPI_DOUBLE;
+
+    int process_id, num_processes;
+    MPI_Comm_rank(MPI_COMM_WORLD, &process_id);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
+
+    size_t count = acVertexBufferSize(src->info);
+
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        // Communicate to self
+        if (process_id == 0) {
+            assert(src);
+            assert(dst);
+            memcpy(&dst->vertex_buffer[i][0], //
+                   &src->vertex_buffer[i][0], //
+                   count * sizeof(AcReal));
+        }
+
+        // Communicate to others
+        for (int j = 1; j < num_processes; ++j) {
+            if (process_id == 0) {
+                // Recv
+                // const size_t dst_idx = j * acVertexBufferCompdomainSize(dst->info);
+                const size_t dst_idx = j * dst->info.int_params[AC_mx] *
+                                       dst->info.int_params[AC_my] * dst->info.int_params[AC_nz] /
+                                       num_processes;
+
+                assert(dst_idx + count <= acVertexBufferSize(dst->info));
+                MPI_Status status;
+                MPI_Recv(&dst->vertex_buffer[i][dst_idx], count, datatype, j, 0, MPI_COMM_WORLD,
+                         &status);
+            }
+            else {
+                // Send
+                const size_t src_idx = 0;
+
+                assert(src_idx + count <= acVertexBufferSize(src->info));
+                MPI_Send(&src->vertex_buffer[i][src_idx], count, datatype, 0, 0, MPI_COMM_WORLD);
+            }
+        }
+    }
+}
+
 int
 main(void)
 {
@@ -37,14 +144,39 @@ main(void)
    MPI_Get_processor_name(processor_name, &name_len);
    printf("Processor %s. Process %d of %d.\n", processor_name, process_id, num_processes);

-    AcMeshInfo info = {
-        .int_params[AC_nx] = 128,
-        .int_params[AC_ny] = 64,
-        .int_params[AC_nz] = 32,
-    };
-    acInit(info);
-    acIntegrate(0.1f);
-    acQuit();
+    AcMeshInfo mesh_info;
+    load_config(&mesh_info);
+    update_config(&mesh_info);
+
+    AcMesh* main_mesh     = NULL;
+    ModelMesh* model_mesh = NULL;
+    if (process_id == 0) {
+        main_mesh = acmesh_create(mesh_info);
+        acmesh_init_to(INIT_TYPE_RANDOM, main_mesh);
+        model_mesh = modelmesh_create(mesh_info);
+        acmesh_to_modelmesh(*main_mesh, model_mesh);
+    }
+
+    AcMeshInfo submesh_info = mesh_info;
+    submesh_info.int_params[AC_nz] /= num_processes;
+    update_config(&submesh_info);
+
+    AcMesh* submesh = acmesh_create(submesh_info);
+
+    /////////////////////
+    distribute_mesh(main_mesh, submesh);
+    gather_mesh(submesh, main_mesh);
+    /////////////////////////
+    // Autotest
+    bool is_acceptable = verify_meshes(*model_mesh, *main_mesh);
+    /////
+
+    acmesh_destroy(submesh);
+
+    if (process_id == 0) {
+        modelmesh_destroy(model_mesh);
+        acmesh_destroy(main_mesh);
+    }

    MPI_Finalize();
    return EXIT_SUCCESS;
--- a/src/standalone/CMakeLists.txt
+++ b/src/standalone/CMakeLists.txt
@@ -25,10 +25,11 @@ add_compile_options(-pipe ${OpenMP_CXX_FLAGS})
 add_compile_options(-Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion)# -Wshadow)

 ## Compile and link
-add_library(astaroth_standalone ${SOURCES})
+add_library(astaroth_standalone STATIC ${SOURCES})
+target_link_libraries(astaroth_standalone PRIVATE astaroth_core "${OpenMP_CXX_FLAGS}" ${SDL2_LIBRARY})

 add_executable(ac_run main.cc)
-target_link_libraries(ac_run PRIVATE astaroth_standalone astaroth_core "${OpenMP_CXX_FLAGS}" ${SDL2_LIBRARY})
+target_link_libraries(ac_run PRIVATE astaroth_standalone)

 # Define the config directory
 if (ALTER_CONF)
--- a/src/standalone/autotest.cc
+++ b/src/standalone/autotest.cc
@@ -75,6 +75,12 @@ static const InitType test_cases[] = {INIT_TYPE_RANDOM, INIT_TYPE_XWAVE,
                                      INIT_TYPE_GAUSSIAN_RADIAL_EXPL, INIT_TYPE_ABC_FLOW};
 // #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))

+static inline bool
+is_valid(const ModelScalar a)
+{
+    return !isnan(a) && !isinf(a);
+}
+
 #if TEST_TYPE ==                                                                                   \
    QUICK_TEST // REGULAR TEST START HERE
               // --------------------------------------------------------------------------------------------------------------
--- a/src/standalone/config_loader.cc
+++ b/src/standalone/config_loader.cc
@@ -34,15 +34,6 @@
 #include "src/core/errchk.h"
 #include "src/core/math_utils.h"

-static inline void
-print(const AcMeshInfo& config)
-{
-    for (int i = 0; i < NUM_INT_PARAMS; ++i)
-        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
-    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
-        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
-}
-
 /**
 \brief Find the index of the keyword in names
 \return Index in range 0...n if the keyword is in names. -1 if the keyword was
@@ -163,7 +154,7 @@ update_config(AcMeshInfo* config)
 #if VERBOSE_PRINTING // Defined in astaroth.h
    printf("###############################################################\n");
    printf("Config dimensions recalculated:\n");
-    print(*config);
+    acPrintMeshInfo(*config);
    printf("###############################################################\n");
 #endif
 }
--- a/src/standalone/model/host_forcing.cc
+++ b/src/standalone/model/host_forcing.cc
@@ -26,7 +26,9 @@
 */
 #include "host_forcing.h"

-#include "src/core/math_utils.h"
+// #include "src/core/math_utils.h"
+#include <cmath>
+using namespace std;

 // The is a wrapper for genering random numbers with a chosen system.
 AcReal
@@ -36,7 +38,7 @@ get_random_number_01()
    return AcReal(rand()) / AcReal(RAND_MAX);
 }

-AcReal3
+static AcReal3
 cross(const AcReal3& a, const AcReal3& b)
 {
    AcReal3 c;
@@ -48,13 +50,13 @@ cross(const AcReal3& a, const AcReal3& b)
    return c;
 }

-AcReal
+static AcReal
 dot(const AcReal3& a, const AcReal3& b)
 {
    return a.x * b.x + a.y * b.y + a.z * b.z;
 }

-AcReal3
+static AcReal3
 vec_norm(const AcReal3& a)
 {
    AcReal3 c;
@@ -67,7 +69,7 @@ vec_norm(const AcReal3& a)
    return c;
 }

-AcReal3
+static AcReal3
 vec_multi_scal(const AcReal scal, const AcReal3& a)
 {
    AcReal3 c;
--- a/src/standalone/model/host_forcing.h
+++ b/src/standalone/model/host_forcing.h
@@ -32,14 +32,6 @@

 AcReal get_random_number_01();

-AcReal3 cross(const AcReal3& a, const AcReal3& b);
-
-AcReal dot(const AcReal3& a, const AcReal3& b);
-
-AcReal3 vec_norm(const AcReal3& a);
-
-AcReal3 vec_multi_scal(const AcReal scal, const AcReal3& a);
-
 AcReal3 helical_forcing_k_generator(const AcReal kmax, const AcReal kmin);

 void helical_forcing_e_generator(AcReal3* e_force, const AcReal3 k_force);
--- a/src/standalone/model/model_rk3.cc
+++ b/src/standalone/model/model_rk3.cc
@@ -31,6 +31,16 @@
 #include "host_memory.h"
 #include "model_boundconds.h"

+// Standalone flags
+#define LDENSITY (1)
+#define LHYDRO (1)
+#define LMAGNETIC (1)
+#define LENTROPY (1)
+#define LTEMPERATURE (0)
+#define LFORCING (1)
+#define LUPWD (1)
+#define AC_THERMAL_CONDUCTIVITY (AcReal(0.001)) // TODO: make an actual config parameter
+
 typedef struct {
    ModelScalar x, y, z;
 } ModelVector;
--- a/src/standalone/model/modelmesh.h
+++ b/src/standalone/model/modelmesh.h
@@ -27,6 +27,7 @@
 #pragma once
 #include "astaroth.h"

+#include "math.h"

 typedef long double ModelScalar;