From c1f2a6c3401912698c638f62f4bb677f373cb74a Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 28 Oct 2020 12:55:32 +0200
Subject: [PATCH 01/12] Setup for benchmarks

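---
Review note (this notes section is not part of the commit message):
the generated job scripts emit "#BATCH --job-name=astaroth"; Slurm only
parses directives spelled "#SBATCH", so the job name is never applied.
scripts/buildtestcases.sh also calls cmake through the user-specific
absolute path /users/pekkila/cmake/build/bin/cmake. Both are left as-is
in the hunks below because later patches in this series (05, 06) revert
or delete these lines verbatim and need them byte-identical.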
 CMakeLists.txt                                |  10 +-
 samples/benchmark/main.cc                     |   2 +-
 .../genbenchmarkscripts/CMakeLists.txt        |   8 ++
 .../genbenchmarkscripts/main.c                | 120 ++++++++++++++++++
 samples/genbenchmarkscripts/main.c            |  72 +++++++----
 scripts/buildtestcases.sh                     |  56 ++++++++
 scripts/postprocess_benchmarks.sh             |  41 ++++++
 7 files changed, 279 insertions(+), 30 deletions(-)
 create mode 100644 samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt
 create mode 100644 samples/genbenchmarkscripts/genbenchmarkscripts/main.c
 create mode 100755 scripts/buildtestcases.sh
 create mode 100755 scripts/postprocess_benchmarks.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66b8001..8bfc25e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,7 +11,7 @@ project(astaroth C CXX CUDA)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
 
 ## Project-wide compilation flags
-set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface
+set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_C_STANDARD 11)
@@ -38,10 +38,10 @@ endif()
 message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
 
 ## Options
-option(DOUBLE_PRECISION "Generates double precision code."                    OFF)
+option(DOUBLE_PRECISION "Generates double precision code."                    ON)
 option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            ON)
-option(MPI_ENABLED      "Enables additional functions for MPI communciation." OFF)
-option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON)
+option(MPI_ENABLED      "Enables additional functions for MPI communciation." ON)
+option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF)
 option(VERBOSE          "Enables various status and warning messages"         OFF)
 
 ## Options (DEPRECATED)
@@ -110,7 +110,7 @@ if (BUILD_SAMPLES)
     add_subdirectory(samples/cpptest)
     add_subdirectory(samples/mpitest)
     add_subdirectory(samples/benchmark)
-    #add_subdirectory(samples/genbenchmarkscripts)
+    add_subdirectory(samples/genbenchmarkscripts)
     #add_subdirectory(samples/mpi_reduce_bench)
     add_subdirectory(samples/fortrantest)
 
diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index f205b04..3c5e504 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -154,7 +154,7 @@ main(int argc, char** argv)
     }*/
 
     // Percentiles
-    const size_t num_iters      = 1000;
+    const size_t num_iters      = 100;
     const double nth_percentile = 0.90;
     std::vector<double> results; // ms
     results.reserve(num_iters);
diff --git a/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt b/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt
new file mode 100644
index 0000000..6115fde
--- /dev/null
+++ b/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(genbenchmarkscripts main.c)
+
+add_custom_command(
+  TARGET genbenchmarkscripts POST_BUILD
+  COMMAND genbenchmarkscripts
+  WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+  COMMENT "Generating benchmark scripts"
+)
diff --git a/samples/genbenchmarkscripts/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/genbenchmarkscripts/main.c
new file mode 100644
index 0000000..d7b953b
--- /dev/null
+++ b/samples/genbenchmarkscripts/genbenchmarkscripts/main.c
@@ -0,0 +1,120 @@
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int
+main(void)
+{
+    const int max_nprocs = 64;
+    for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
+        char filename[4096];
+        sprintf(filename, "benchmark_%d.sh", nprocs);
+
+        FILE* fp = fopen(filename, "w");
+        assert(fp);
+
+        // Boilerplate
+        fprintf(fp, "#!/bin/bash\n");
+        fprintf(fp, "#BATCH --job-name=astaroth\n");        // OK
+        fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK
+        fprintf(fp, "#SBATCH --time=04:00:00\n");           // OK
+        fprintf(fp, "#SBATCH --mem=0\n");                   // OK
+        fprintf(fp, "#SBATCH --partition=gpu\n");           // OK
+        fprintf(fp, "#SBATCH --exclusive\n");               // OK
+        fprintf(fp, "#SBATCH --cpus-per-task=10\n");        // OK
+        fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs);
+        // HACK: exclude misconfigured nodes on Puhti
+        fprintf(fp, "#SBATCH -x "
+                    "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n");
+        // fprintf(fp, "#SBATCH --cpus-per-task=10\n");
+
+        // nprocs, nodes, gpus
+        const int max_gpus_per_node = 4;
+        const int gpus_per_node     = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
+        const int nodes             = (int)ceil((double)nprocs / max_gpus_per_node);
+        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK
+        fprintf(fp, "#SBATCH -n %d\n", nprocs);                     // OK
+        fprintf(fp, "#SBATCH -N %d\n", nodes);                      // OK
+        // fprintf(fp, "#SBATCH --exclusive\n");
+        // if (nprocs >= 4)
+        //    fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
+
+        // Modules
+        // OpenMPI
+        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n");
+        // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); //
+        // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
+        // if (nprocs >= 32)
+        //    fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); //
+        //    https://www.open-mpi.org/fa
+
+        // HPCX
+        // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
+        // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0
+
+        // Profile and run
+        // fprintf(fp, "mkdir -p profile_%d\n", nprocs);
+
+        /*
+        const int nx = 256; // max size 2048;
+        const int ny = nx;
+        const int nz = nx;
+
+        fprintf(fp,
+                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
+                //"%d\n",
+                "srun ./benchmark %d %d %d\n", nx, ny, nz);
+        */
+        // fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);
+
+        const char* files[] = {
+            "benchmark_decomp_1D",       "benchmark_decomp_2D",      "benchmark_decomp_3D",
+            "benchmark_decomp_1D_comm",  "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm",
+            "benchmark_meshsize_256",    "benchmark_meshsize_512",   "benchmark_meshsize_1024",
+            "benchmark_meshsize_2048",   "benchmark_stencilord_2",   "benchmark_stencilord_4",
+            "benchmark_stencilord_6",    "benchmark_stencilord_8",   "benchmark_timings_control",
+            "benchmark_timings_comp",    "benchmark_timings_comm",   "benchmark_timings_default",
+            "benchmark_timings_corners", "benchmark_weak_128",       "benchmark_weak_256",
+            "benchmark_weak_512",
+        };
+        for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) {
+            int nn = 256;
+            if (strcmp(files[i], "benchmark_meshsize_512") == 0)
+                nn = 512;
+            else if (strcmp(files[i], "benchmark_meshsize_1024") == 0)
+                nn = 1024;
+            else if (strcmp(files[i], "benchmark_meshsize_2048") == 0)
+                nn = 2048;
+            else if (strcmp(files[i], "benchmark_weak_128") == 0)
+                nn = 128;
+            else if (strcmp(files[i], "benchmark_weak_512") == 0)
+                nn = 512;
+
+            // W/ Fredriks tunings
+            // (may cause Assertion `status == UCS_OK' failed errors)
+            // fprintf(fp,
+            //        "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
+            //        "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n",
+            //        files[i], nn, nn, nn);
+            if (nodes >= 2) {
+                fprintf(fp,
+                        "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
+                        "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm "
+                        "-f core.* && cd ..)\n",
+                        files[i], nn, nn, nn);
+            }
+            else {
+                fprintf(fp,
+                        "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* "
+                        "&& cd ..)\n",
+                        files[i], nn, nn, nn);
+            }
+        }
+
+        fclose(fp);
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index 7be0872..d7b953b 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -17,42 +17,48 @@ main(void)
 
         // Boilerplate
         fprintf(fp, "#!/bin/bash\n");
-        fprintf(fp, "#BATCH --job-name=astaroth\n");
-        fprintf(fp, "#SBATCH --account=project_2000403\n");
-        fprintf(fp, "#SBATCH --time=03:00:00\n");
-        fprintf(fp, "#SBATCH --mem=32000\n");
-        fprintf(fp, "#SBATCH --partition=gpu\n");
+        fprintf(fp, "#BATCH --job-name=astaroth\n");        // OK
+        fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK
+        fprintf(fp, "#SBATCH --time=04:00:00\n");           // OK
+        fprintf(fp, "#SBATCH --mem=0\n");                   // OK
+        fprintf(fp, "#SBATCH --partition=gpu\n");           // OK
+        fprintf(fp, "#SBATCH --exclusive\n");               // OK
+        fprintf(fp, "#SBATCH --cpus-per-task=10\n");        // OK
         fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs);
+        // HACK: exclude misconfigured nodes on Puhti
+        fprintf(fp, "#SBATCH -x "
+                    "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n");
         // fprintf(fp, "#SBATCH --cpus-per-task=10\n");
 
         // nprocs, nodes, gpus
         const int max_gpus_per_node = 4;
         const int gpus_per_node     = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
         const int nodes             = (int)ceil((double)nprocs / max_gpus_per_node);
-        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
-        fprintf(fp, "#SBATCH -n %d\n", nprocs);
-        fprintf(fp, "#SBATCH -N %d\n", nodes);
+        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK
+        fprintf(fp, "#SBATCH -n %d\n", nprocs);                     // OK
+        fprintf(fp, "#SBATCH -N %d\n", nodes);                      // OK
         // fprintf(fp, "#SBATCH --exclusive\n");
-        if (nprocs >= 4)
-            fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
+        // if (nprocs >= 4)
+        //    fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
 
         // Modules
         // OpenMPI
         fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n");
-        //fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
-        //fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
-        //if (nprocs >= 32)
-        //    fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
+        // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); //
+        // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
+        // if (nprocs >= 32)
+        //    fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); //
+        //    https://www.open-mpi.org/fa
 
         // HPCX
-        //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
-        //fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0
+        // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
+        // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0
 
         // Profile and run
         // fprintf(fp, "mkdir -p profile_%d\n", nprocs);
 
         /*
-        const int nx = 256; // max size 1792;
+        const int nx = 256; // max size 2048;
         const int ny = nx;
         const int nz = nx;
 
@@ -67,11 +73,11 @@ main(void)
             "benchmark_decomp_1D",       "benchmark_decomp_2D",      "benchmark_decomp_3D",
             "benchmark_decomp_1D_comm",  "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm",
             "benchmark_meshsize_256",    "benchmark_meshsize_512",   "benchmark_meshsize_1024",
-            "benchmark_meshsize_1792",   "benchmark_stencilord_2",   "benchmark_stencilord_4",
+            "benchmark_meshsize_2048",   "benchmark_stencilord_2",   "benchmark_stencilord_4",
             "benchmark_stencilord_6",    "benchmark_stencilord_8",   "benchmark_timings_control",
             "benchmark_timings_comp",    "benchmark_timings_comm",   "benchmark_timings_default",
             "benchmark_timings_corners", "benchmark_weak_128",       "benchmark_weak_256",
-            "benchmark_weak_448",
+            "benchmark_weak_512",
         };
         for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) {
             int nn = 256;
@@ -79,14 +85,32 @@ main(void)
                 nn = 512;
             else if (strcmp(files[i], "benchmark_meshsize_1024") == 0)
                 nn = 1024;
-            else if (strcmp(files[i], "benchmark_meshsize_1792") == 0)
-                nn = 1792;
+            else if (strcmp(files[i], "benchmark_meshsize_2048") == 0)
+                nn = 2048;
             else if (strcmp(files[i], "benchmark_weak_128") == 0)
                 nn = 128;
-            else if (strcmp(files[i], "benchmark_weak_448") == 0)
-                nn = 448;
+            else if (strcmp(files[i], "benchmark_weak_512") == 0)
+                nn = 512;
 
-            fprintf(fp, "$(cd %s && srun ./benchmark %d %d %d && cd ..)\n", files[i], nn, nn, nn);
+            // W/ Fredriks tunings
+            // (may cause Assertion `status == UCS_OK' failed errors)
+            // fprintf(fp,
+            //        "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
+            //        "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n",
+            //        files[i], nn, nn, nn);
+            if (nodes >= 2) {
+                fprintf(fp,
+                        "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
+                        "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm "
+                        "-f core.* && cd ..)\n",
+                        files[i], nn, nn, nn);
+            }
+            else {
+                fprintf(fp,
+                        "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* "
+                        "&& cd ..)\n",
+                        files[i], nn, nn, nn);
+            }
         }
 
         fclose(fp);
diff --git a/scripts/buildtestcases.sh b/scripts/buildtestcases.sh
new file mode 100755
index 0000000..7157656
--- /dev/null
+++ b/scripts/buildtestcases.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# Modules (!!!)
+module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl
+#module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl
+#export UCX_MEMTYPE_CACHE=n #  Workaround for bug in hpcx-mpi/2.5.0
+
+load_default_case() {
+  # Pinned or RDMA
+  sed -i 's/#define MPI_USE_PINNED ([0-9]*)/#define MPI_USE_PINNED (0)/' src/core/device.cc
+
+  # Stencil order
+  sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' acc/stdlib/stdderiv.h
+  sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' include/astaroth.h
+
+  # Timings
+  sed -i 's/MPI_COMPUTE_ENABLED (.)/MPI_COMPUTE_ENABLED (1)/' src/core/device.cc
+  sed -i 's/MPI_COMM_ENABLED (.)/MPI_COMM_ENABLED (1)/' src/core/device.cc
+  sed -i 's/MPI_INCL_CORNERS (.)/MPI_INCL_CORNERS (0)/' src/core/device.cc
+
+  # Decomposition
+  sed -i 's/MPI_DECOMPOSITION_AXES (.)/MPI_DECOMPOSITION_AXES (3)/' src/core/device.cc
+
+  # Strong/Weak
+  sed -i 's/const TestType test = .*;/const TestType test = TEST_STRONG_SCALING;/' samples/benchmark/main.cc
+
+  # Num iters
+  sed -i 's/const size_t num_iters      = .*;/const size_t num_iters      = 1000;/' samples/benchmark/main.cc
+}
+
+# $1 test name
+# $2 grid size
+create_case() {
+  DIR="benchmark_$1"
+  mkdir -p $DIR
+  cd $DIR
+  /users/pekkila/cmake/build/bin/cmake .. && make -j
+  cd ..
+}
+
+# Mesh size
+load_default_case
+create_case "meshsize_256"
+sed -i 's/const size_t num_iters      = .*;/const size_t num_iters      = 100;/' samples/benchmark/main.cc
+create_case "meshsize_512"
+create_case "meshsize_1024"
+create_case "meshsize_2048"
+
+# Run batch jobs
+sbatch benchmark_meshsize_256/benchmark_1.sh
+sbatch benchmark_meshsize_256/benchmark_2.sh
+sbatch benchmark_meshsize_256/benchmark_4.sh
+sbatch benchmark_meshsize_256/benchmark_8.sh
+sbatch benchmark_meshsize_256/benchmark_16.sh
+sbatch benchmark_meshsize_256/benchmark_32.sh
+sbatch benchmark_meshsize_256/benchmark_64.sh
diff --git a/scripts/postprocess_benchmarks.sh b/scripts/postprocess_benchmarks.sh
new file mode 100755
index 0000000..7a60884
--- /dev/null
+++ b/scripts/postprocess_benchmarks.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+OUTPUT=results.csv
+rm -i $OUTPUT
+
+# $1 input dir
+process_input() {
+    echo $1
+    #cat $1/*.csv | sort -n
+    cat $1/*.csv | sort -k1n -k3n | awk '!a[$1]++'
+    echo ""
+} >> $OUTPUT
+
+process_input "benchmark_decomp_1D"
+process_input "benchmark_decomp_2D"
+process_input "benchmark_decomp_3D"
+process_input "benchmark_decomp_1D_comm"
+process_input "benchmark_decomp_2D_comm"
+process_input "benchmark_decomp_3D_comm"
+
+process_input "benchmark_meshsize_256"
+process_input "benchmark_meshsize_512"
+process_input "benchmark_meshsize_1024"
+process_input "benchmark_meshsize_2048"
+
+process_input "benchmark_stencilord_2"
+process_input "benchmark_stencilord_4"
+process_input "benchmark_stencilord_6"
+process_input "benchmark_stencilord_8"
+
+process_input "benchmark_timings_control"
+process_input "benchmark_timings_comp"
+process_input "benchmark_timings_comm"
+process_input "benchmark_timings_default"
+process_input "benchmark_timings_corners"
+
+process_input "benchmark_weak_128"
+process_input "benchmark_weak_256"
+process_input "benchmark_weak_512"
+
+cat $OUTPUT

From 0a2827593c3479f1b117476663da70f6e6263972 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 28 Oct 2020 12:56:34 +0200
Subject: [PATCH 02/12] Added very experimental implementation for mixed
 precision. Communication is done in f32 and computation in f64.

---
 src/core/device.cc         | 11 ++++++++---
 src/core/kernels/kernels.h |  6 ++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/core/device.cc b/src/core/device.cc
index a6ec793..0ee987f 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -14,7 +14,7 @@
 #define MPI_DECOMPOSITION_AXES (3)
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
-#define MPI_INCL_CORNERS (1)
+#define MPI_INCL_CORNERS (0)
 #define MPI_USE_PINNED (0)              // Do inter-node comm with pinned memory
 #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
 
@@ -721,7 +721,7 @@ acCreatePackedDataHost(const int3 dims)
     data.dims = dims;
 
     const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
-    data.data          = (AcReal*)malloc(bytes);
+    data.data          = (AcRealPacked*)malloc(bytes);
     ERRCHK_ALWAYS(data.data);
 
     return data;
@@ -1132,8 +1132,13 @@ acTransferCommData(const Device device, //
     cudaSetDevice(device->id);
 
     MPI_Datatype datatype = MPI_FLOAT;
-    if (sizeof(AcReal) == 8)
+    if (sizeof(data->srcs[0].data[0]) == 2) {
+        datatype = MPI_SHORT; // TODO CONFIRM THAT IS CORRECTLY CAST TO HALF
+    } else if (sizeof(data->srcs[0].data[0]) == 4) {
+        datatype = MPI_FLOAT;
+    } else {
         datatype = MPI_DOUBLE;
+    }
 
     int nprocs, pid;
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index fc9c745..513c5e4 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -8,11 +8,13 @@
 #define MPI_GPUDIRECT_DISABLED (0)
 #endif // AC_MPI_ENABLED
 
+typedef float AcRealPacked;
+
 typedef struct {
     int3 dims;
-    AcReal* data;
+    AcRealPacked* data;
 
-    AcReal* data_pinned;
+    AcRealPacked* data_pinned;
     bool pinned = false; // Set if data was received to pinned memory
 } PackedData;
 

From ae0d4de23c3ddc01d0439f419a0150204550f1c3 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 29 Oct 2020 16:33:41 +0200
Subject: [PATCH 03/12] The root host mesh is no longer allocated during
 benchmarking as this caused out-of-memory errors in weak scaling tests

---
 include/astaroth.h        |  3 +++
 samples/benchmark/main.cc |  7 ++++++-
 src/core/CMakeLists.txt   |  2 +-
 src/core/device.cc        | 16 ++++++++++++++++
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/include/astaroth.h b/include/astaroth.h
index d32b367..039ab82 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -284,6 +284,9 @@ Resets all devices on the current grid.
  */
 AcResult acGridQuit(void);
 
+/** Randomizes the local mesh */
+AcResult acGridRandomize(void);
+
 /** */
 AcResult acGridSynchronizeStream(const Stream stream);
 
diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index 3c5e504..76223bb 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -107,7 +107,7 @@ main(int argc, char** argv)
         }
     }
 
-    const TestType test = TEST_STRONG_SCALING;
+    const TestType test = TEST_WEAK_SCALING;
     if (test == TEST_WEAK_SCALING) {
         uint3_64 decomp = decompose(nprocs);
         info.int_params[AC_nx] *= decomp.x;
@@ -126,10 +126,15 @@ main(int argc, char** argv)
 
     // GPU alloc & compute
     acGridInit(info);
+    acGridRandomize();
+
+    /*
     AcMesh model;
     acMeshCreate(info, &model);
     acMeshRandomize(&model);
     acGridLoadMesh(STREAM_DEFAULT, model);
+    */
+
     /*
     acGridLoadMesh(STREAM_DEFAULT, model);
 
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index ca96366..15cd4fa 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -2,7 +2,7 @@ find_package(CUDAToolkit)
 
 ## Astaroth Core
 add_library(astaroth_core STATIC device.cc node.cc astaroth.cc astaroth_fortran.cc)
-target_link_libraries(astaroth_core astaroth_kernels CUDA::cudart CUDA::cuda_driver)
+target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver)
 
 ## Options
 if (MPI_ENABLED)
diff --git a/src/core/device.cc b/src/core/device.cc
index 0ee987f..4af17fc 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1242,6 +1242,22 @@ acGridSynchronizeStream(const Stream stream)
     return AC_SUCCESS;
 }
 
+
+#include "astaroth_utils.h" // HACK TO RANDOMIZE
+AcResult
+acGridRandomize(void)
+{
+    ERRCHK(grid.initialized);
+
+    AcMesh host;
+    acMeshCreate(grid.submesh.info, &host);
+    acMeshRandomize(&host);
+    acDeviceLoadMesh(grid.device, STREAM_DEFAULT, host);
+    acMeshDestroy(&host);
+
+    return AC_SUCCESS;
+}
+
 AcResult
 acGridInit(const AcMeshInfo info)
 {

From bf7eb83084f3e13b590de1da344ac4fa238d7350 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Thu, 29 Oct 2020 16:34:48 +0200
Subject: [PATCH 04/12] Updated benchmark script

---
 scripts/buildtestcases.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/buildtestcases.sh b/scripts/buildtestcases.sh
index 7157656..89a6b02 100755
--- a/scripts/buildtestcases.sh
+++ b/scripts/buildtestcases.sh
@@ -46,6 +46,14 @@ create_case "meshsize_512"
 create_case "meshsize_1024"
 create_case "meshsize_2048"
 
+# Weak scaling
+load_default_case
+sed -i 's/const TestType test = .*;/const TestType test = TEST_WEAK_SCALING;/' samples/benchmark/main.cc
+create_case "weak_128"
+create_case "weak_256"
+sed -i 's/const size_t num_iters      = .*;/const size_t num_iters      = 100;/' samples/benchmark/main.cc
+create_case "weak_512"
+
 # Run batch jobs
 sbatch benchmark_meshsize_256/benchmark_1.sh
 sbatch benchmark_meshsize_256/benchmark_2.sh

From 00b7b537cea6f3c1a3f8c9c433f352156a37daac Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Mon, 2 Nov 2020 10:58:18 +0200
Subject: [PATCH 05/12] Modifications for master merge: reverted CMakeLists.txt
 to the original, disabled mixed precision by default

---
 CMakeLists.txt             | 10 +++++-----
 src/core/kernels/kernels.h |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8bfc25e..66b8001 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,7 +11,7 @@ project(astaroth C CXX CUDA)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
 
 ## Project-wide compilation flags
-set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface
+set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
 set(CMAKE_C_STANDARD 11)
@@ -38,10 +38,10 @@ endif()
 message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
 
 ## Options
-option(DOUBLE_PRECISION "Generates double precision code."                    ON)
+option(DOUBLE_PRECISION "Generates double precision code."                    OFF)
 option(BUILD_SAMPLES    "Builds projects in samples subdirectory."            ON)
-option(MPI_ENABLED      "Enables additional functions for MPI communciation." ON)
-option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF)
+option(MPI_ENABLED      "Enables additional functions for MPI communciation." OFF)
+option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON)
 option(VERBOSE          "Enables various status and warning messages"         OFF)
 
 ## Options (DEPRECATED)
@@ -110,7 +110,7 @@ if (BUILD_SAMPLES)
     add_subdirectory(samples/cpptest)
     add_subdirectory(samples/mpitest)
     add_subdirectory(samples/benchmark)
-    add_subdirectory(samples/genbenchmarkscripts)
+    #add_subdirectory(samples/genbenchmarkscripts)
     #add_subdirectory(samples/mpi_reduce_bench)
     add_subdirectory(samples/fortrantest)
 
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index 513c5e4..282f1c4 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -8,7 +8,7 @@
 #define MPI_GPUDIRECT_DISABLED (0)
 #endif // AC_MPI_ENABLED
 
-typedef float AcRealPacked;
+typedef AcReal AcRealPacked;
 
 typedef struct {
     int3 dims;

From d48a478254c1917f17841055aca8cc5a91157fb1 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Mon, 2 Nov 2020 16:39:01 +0200
Subject: [PATCH 06/12] Removed duplicate genbenchmarkscripts

---
 .../genbenchmarkscripts/CMakeLists.txt        |   8 --
 .../genbenchmarkscripts/main.c                | 120 ------------------
 2 files changed, 128 deletions(-)
 delete mode 100644 samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt
 delete mode 100644 samples/genbenchmarkscripts/genbenchmarkscripts/main.c

diff --git a/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt b/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt
deleted file mode 100644
index 6115fde..0000000
--- a/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-add_executable(genbenchmarkscripts main.c)
-
-add_custom_command(
-  TARGET genbenchmarkscripts POST_BUILD
-  COMMAND genbenchmarkscripts
-  WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
-  COMMENT "Generating benchmark scripts"
-)
diff --git a/samples/genbenchmarkscripts/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/genbenchmarkscripts/main.c
deleted file mode 100644
index d7b953b..0000000
--- a/samples/genbenchmarkscripts/genbenchmarkscripts/main.c
+++ /dev/null
@@ -1,120 +0,0 @@
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-int
-main(void)
-{
-    const int max_nprocs = 64;
-    for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
-        char filename[4096];
-        sprintf(filename, "benchmark_%d.sh", nprocs);
-
-        FILE* fp = fopen(filename, "w");
-        assert(fp);
-
-        // Boilerplate
-        fprintf(fp, "#!/bin/bash\n");
-        fprintf(fp, "#BATCH --job-name=astaroth\n");        // OK
-        fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK
-        fprintf(fp, "#SBATCH --time=04:00:00\n");           // OK
-        fprintf(fp, "#SBATCH --mem=0\n");                   // OK
-        fprintf(fp, "#SBATCH --partition=gpu\n");           // OK
-        fprintf(fp, "#SBATCH --exclusive\n");               // OK
-        fprintf(fp, "#SBATCH --cpus-per-task=10\n");        // OK
-        fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs);
-        // HACK: exclude misconfigured nodes on Puhti
-        fprintf(fp, "#SBATCH -x "
-                    "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n");
-        // fprintf(fp, "#SBATCH --cpus-per-task=10\n");
-
-        // nprocs, nodes, gpus
-        const int max_gpus_per_node = 4;
-        const int gpus_per_node     = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
-        const int nodes             = (int)ceil((double)nprocs / max_gpus_per_node);
-        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK
-        fprintf(fp, "#SBATCH -n %d\n", nprocs);                     // OK
-        fprintf(fp, "#SBATCH -N %d\n", nodes);                      // OK
-        // fprintf(fp, "#SBATCH --exclusive\n");
-        // if (nprocs >= 4)
-        //    fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
-
-        // Modules
-        // OpenMPI
-        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n");
-        // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); //
-        // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
-        // if (nprocs >= 32)
-        //    fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); //
-        //    https://www.open-mpi.org/fa
-
-        // HPCX
-        // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
-        // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0
-
-        // Profile and run
-        // fprintf(fp, "mkdir -p profile_%d\n", nprocs);
-
-        /*
-        const int nx = 256; // max size 2048;
-        const int ny = nx;
-        const int nz = nx;
-
-        fprintf(fp,
-                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
-                //"%d\n",
-                "srun ./benchmark %d %d %d\n", nx, ny, nz);
-        */
-        // fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);
-
-        const char* files[] = {
-            "benchmark_decomp_1D",       "benchmark_decomp_2D",      "benchmark_decomp_3D",
-            "benchmark_decomp_1D_comm",  "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm",
-            "benchmark_meshsize_256",    "benchmark_meshsize_512",   "benchmark_meshsize_1024",
-            "benchmark_meshsize_2048",   "benchmark_stencilord_2",   "benchmark_stencilord_4",
-            "benchmark_stencilord_6",    "benchmark_stencilord_8",   "benchmark_timings_control",
-            "benchmark_timings_comp",    "benchmark_timings_comm",   "benchmark_timings_default",
-            "benchmark_timings_corners", "benchmark_weak_128",       "benchmark_weak_256",
-            "benchmark_weak_512",
-        };
-        for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) {
-            int nn = 256;
-            if (strcmp(files[i], "benchmark_meshsize_512") == 0)
-                nn = 512;
-            else if (strcmp(files[i], "benchmark_meshsize_1024") == 0)
-                nn = 1024;
-            else if (strcmp(files[i], "benchmark_meshsize_2048") == 0)
-                nn = 2048;
-            else if (strcmp(files[i], "benchmark_weak_128") == 0)
-                nn = 128;
-            else if (strcmp(files[i], "benchmark_weak_512") == 0)
-                nn = 512;
-
-            // W/ Fredriks tunings
-            // (may cause Assertion `status == UCS_OK' failed errors)
-            // fprintf(fp,
-            //        "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
-            //        "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n",
-            //        files[i], nn, nn, nn);
-            if (nodes >= 2) {
-                fprintf(fp,
-                        "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
-                        "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm "
-                        "-f core.* && cd ..)\n",
-                        files[i], nn, nn, nn);
-            }
-            else {
-                fprintf(fp,
-                        "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* "
-                        "&& cd ..)\n",
-                        files[i], nn, nn, nn);
-            }
-        }
-
-        fclose(fp);
-    }
-
-    return EXIT_SUCCESS;
-}

From 349093768d8ab0af52d38341e7ab08ba8bf0f937 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Mon, 2 Nov 2020 17:14:26 +0200
Subject: [PATCH 07/12] Added acMeshRandomize to astaroth.h to keep core and
 utils separate

---
 include/astaroth.h       |  3 +++
 include/astaroth_utils.h |  3 ---
 src/core/CMakeLists.txt  |  2 +-
 src/core/astaroth.cc     | 17 +++++++++++++++++
 src/core/device.cc       |  2 --
 src/utils/memory.c       | 17 -----------------
 6 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/include/astaroth.h b/include/astaroth.h
index 039ab82..21e45b9 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -590,6 +590,9 @@ AcResult acUpdateBuiltinParams(AcMeshInfo* config);
 /** Creates a mesh stored in host memory */
 AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);
 
+/** Randomizes a host mesh: sets every element of every vertex buffer to rand() / RAND_MAX */
+AcResult acMeshRandomize(AcMesh* mesh);
+
 /** Destroys a mesh stored in host memory */
 AcResult acMeshDestroy(AcMesh* mesh);
 
diff --git a/include/astaroth_utils.h b/include/astaroth_utils.h
index f742f73..4289c1b 100644
--- a/include/astaroth_utils.h
+++ b/include/astaroth_utils.h
@@ -50,9 +50,6 @@ AcResult acVertexBufferSet(const VertexBufferHandle handle, const AcReal value,
 /** */
 AcResult acMeshSet(const AcReal value, AcMesh* mesh);
 
-/** */
-AcResult acMeshRandomize(AcMesh* mesh);
-
 /** */
 AcResult acMeshApplyPeriodicBounds(AcMesh* mesh);
 
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 15cd4fa..ca96366 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -2,7 +2,7 @@ find_package(CUDAToolkit)
 
 ## Astaroth Core
 add_library(astaroth_core STATIC device.cc node.cc astaroth.cc astaroth_fortran.cc)
-target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver)
+target_link_libraries(astaroth_core astaroth_kernels CUDA::cudart CUDA::cuda_driver)
 
 ## Options
 if (MPI_ENABLED)
diff --git a/src/core/astaroth.cc b/src/core/astaroth.cc
index 1c8d0f7..c6d9a19 100644
--- a/src/core/astaroth.cc
+++ b/src/core/astaroth.cc
@@ -221,6 +221,23 @@ acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
     return AC_SUCCESS;
 }
 
+static AcReal
+randf(void)
+{
+    return (AcReal)rand() / (AcReal)RAND_MAX;
+}
+
+AcResult
+acMeshRandomize(AcMesh* mesh)
+{
+    const int n = acVertexBufferSize(mesh->info);
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+        for (int i = 0; i < n; ++i)
+            mesh->vertex_buffer[w][i] = randf();
+
+    return AC_SUCCESS;
+}
+
 AcResult
 acMeshDestroy(AcMesh* mesh)
 {
diff --git a/src/core/device.cc b/src/core/device.cc
index 4af17fc..24d6e38 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -1242,8 +1242,6 @@ acGridSynchronizeStream(const Stream stream)
     return AC_SUCCESS;
 }
 
-
-#include "astaroth_utils.h" // HACK TO RANDOMIZE
 AcResult
 acGridRandomize(void)
 {
diff --git a/src/utils/memory.c b/src/utils/memory.c
index 334d106..f52fa12 100644
--- a/src/utils/memory.c
+++ b/src/utils/memory.c
@@ -38,23 +38,6 @@ acMeshSet(const AcReal value, AcMesh* mesh)
     return AC_SUCCESS;
 }
 
-static AcReal
-randf(void)
-{
-    return (AcReal)rand() / (AcReal)RAND_MAX;
-}
-
-AcResult
-acMeshRandomize(AcMesh* mesh)
-{
-    const int n = acVertexBufferSize(mesh->info);
-    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
-        for (int i = 0; i < n; ++i)
-            mesh->vertex_buffer[w][i] = randf();
-
-    return AC_SUCCESS;
-}
-
 AcResult
 acMeshApplyPeriodicBounds(AcMesh* mesh)
 {

From dff560561ecd9ed4e553aeaab57d57486c22b47d Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Tue, 3 Nov 2020 10:28:58 +0200
Subject: [PATCH 08/12] Explicit casting during error checking

---
 src/utils/verification.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/utils/verification.c b/src/utils/verification.c
index e57498c..3f1bdf6 100644
--- a/src/utils/verification.c
+++ b/src/utils/verification.c
@@ -53,10 +53,10 @@ acGetError(const AcReal model, const AcReal candidate)
         const long double e = floorl(logl(fabsl(error.model)) / logl(2));
 
         const long double ulp             = powl(base, e - (p - 1));
-        const long double machine_epsilon = 0.5 * powl(base, -(p - 1));
-        error.abs_error                   = fabsl(model - candidate);
+        const long double machine_epsilon = 0.5l * powl(base, -(p - 1));
+        error.abs_error                   = fabsl((long double)model - (long double)candidate);
         error.ulp_error                   = error.abs_error / ulp;
-        error.rel_error                   = fabsl(1.0l - candidate / model) / machine_epsilon;
+        error.rel_error                   = fabsl(1.0l - (long double)candidate / (long double)model) / machine_epsilon;
     }
 
     error.maximum_magnitude = error.minimum_magnitude = 0;

From f61223c02b21bd417189081ee97e7f88ea85a22a Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Wed, 11 Nov 2020 13:18:29 +0200
Subject: [PATCH 09/12] The number of default streams is now 32

---
 acc/src/code_generator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 017591a..b1de527 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -716,7 +716,7 @@ external acdevicesynchronizestream
     fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_SCALARRAY_HANDLES = %d\n\n", enumcounter);
 
     // Streams
-    const size_t nstreams = 20;
+    const size_t nstreams = 32;
     for (size_t i = 0; i < nstreams; ++i) {
         fprintf(DSLHEADER, "#define STREAM_%lu (%lu)\n", i, i);
         fprintf(FHEADER, "integer(c_int), parameter :: STREAM_%lu = %lu\n", i, i);

From a463fd492f64c8386cc9ab4aa032a6dad25c3936 Mon Sep 17 00:00:00 2001
From: Miikka Vaisala <mvaisala@asiaa.sinica.edu.tw>
Date: Thu, 19 Nov 2020 14:31:10 +0800
Subject: [PATCH 10/12] Synched simulation.cc with existing work.

---
 samples/standalone/simulation.cc | 33 ++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/samples/standalone/simulation.cc b/samples/standalone/simulation.cc
index 0bf193e..178ff17 100644
--- a/samples/standalone/simulation.cc
+++ b/samples/standalone/simulation.cc
@@ -43,7 +43,13 @@
 
 // NEED TO BE DEFINED HERE. IS NOT NOTICED BY compile_acc call.
 #define LFORCING (0)
+
+#ifdef VTXBUF_ACCRETION
+#define LSINK (1)
+#else
 #define LSINK (0)
+#endif
+
 #ifdef BFIELDX
 #define LBFIELD (1)
 #else
@@ -322,6 +328,7 @@ run_simulation(const char* config_path)
     // acmesh_init_to(INIT_TYPE_SIMPLE_CORE, mesh); //Initial condition for a collapse test
 
 #if LSINK
+    printf("WARNING! Sink particle is under development. USE AT YOUR OWN RISK!");
     vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh);
 #endif
 
@@ -387,18 +394,10 @@ run_simulation(const char* config_path)
     /* Step the simulation */
     AcReal accreted_mass = 0.0;
     AcReal sink_mass     = 0.0;
+    AcReal uu_freefall = 0.0;
     AcReal dt_typical    = 0.0;
     int dtcounter = 0;
     for (int i = start_step + 1; i < max_steps; ++i) {
-        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
-#if LBFIELD
-        const AcReal vAmax = acReduceVecScal(RTYPE_ALFVEN_MAX, BFIELDX, BFIELDY, BFIELDZ, VTXBUF_LNRHO);
-        const AcReal uref  = max(umax, vAmax); 
-        const AcReal dt   = host_timestep(uref, vAmax, mesh_info);
-#else
-        const AcReal dt   = host_timestep(umax, 0.0l, mesh_info);
-#endif
-
 #if LSINK
 
         const AcReal sum_mass = acReduceScal(RTYPE_SUM, VTXBUF_ACCRETION);
@@ -406,7 +405,7 @@ run_simulation(const char* config_path)
         sink_mass             = 0.0;
         sink_mass             = mesh_info.real_params[AC_M_sink_init] + accreted_mass;
         acLoadDeviceConstant(AC_M_sink, sink_mass);
-        vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh);
+        vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh); // TODO: BUG! This only sets the host buffer to 0.
 
         int on_off_switch;
         if (i < 1) {
@@ -416,11 +415,25 @@ run_simulation(const char* config_path)
             on_off_switch = 1;
         }
         acLoadDeviceConstant(AC_switch_accretion, on_off_switch);
+
+        //Adjust courant condition for free fall velocity
+        const AcReal RR = mesh_info.real_params[AC_soft]*mesh_info.real_params[AC_soft];
+        const AcReal SQ2GM = sqrt(AcReal(2.0)*mesh_info.real_params[AC_G_const]*sink_mass);
+        uu_freefall = fabs(SQ2GM / sqrt(RR));
 #else
         accreted_mass = -1.0;
         sink_mass     = -1.0;
 #endif
 
+        const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
+#if LBFIELD
+        const AcReal vAmax = acReduceVecScal(RTYPE_ALFVEN_MAX, BFIELDX, BFIELDY, BFIELDZ, VTXBUF_LNRHO);
+        const AcReal uref  = max(max(umax,uu_freefall), vAmax); 
+        const AcReal dt   = host_timestep(uref, vAmax, mesh_info);
+#else
+        const AcReal dt   = host_timestep(umax, 0.0l, mesh_info);
+#endif
+
 #if LFORCING
         const ForcingParams forcing_params = generateForcingParams(mesh_info);
         loadForcingParamsToDevice(forcing_params);

From 204f0753437c9e91bf1ab4fe8c618a421788cbe1 Mon Sep 17 00:00:00 2001
From: Miikka Vaisala <mvaisala@asiaa.sinica.edu.tw>
Date: Thu, 19 Nov 2020 15:14:28 +0800
Subject: [PATCH 11/12] AC_unit_magnetic in dsl

---
 acc/mhd_solver/stencil_kernel.ac | 1 +
 1 file changed, 1 insertion(+)

diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac
index e0efa94..db83a3f 100644
--- a/acc/mhd_solver/stencil_kernel.ac
+++ b/acc/mhd_solver/stencil_kernel.ac
@@ -39,6 +39,7 @@ uniform Scalar AC_zorig;
 uniform Scalar AC_unit_density;
 uniform Scalar AC_unit_velocity;
 uniform Scalar AC_unit_length;
+uniform Scalar AC_unit_magnetic;
 // properties of gravitating star
 uniform Scalar AC_star_pos_x;
 uniform Scalar AC_star_pos_y;

From e3eb7822132465d5c06a0956d4ef6c6bf8428359 Mon Sep 17 00:00:00 2001
From: Miikka Vaisala <mvaisala@asiaa.sinica.edu.tw>
Date: Fri, 20 Nov 2020 11:11:54 +0800
Subject: [PATCH 12/12] Sorry for the compilation problem. Corrected.

---
 samples/standalone/simulation.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/samples/standalone/simulation.cc b/samples/standalone/simulation.cc
index 178ff17..4e0e176 100644
--- a/samples/standalone/simulation.cc
+++ b/samples/standalone/simulation.cc
@@ -431,7 +431,8 @@ run_simulation(const char* config_path)
         const AcReal uref  = max(max(umax,uu_freefall), vAmax); 
         const AcReal dt   = host_timestep(uref, vAmax, mesh_info);
 #else
-        const AcReal dt   = host_timestep(umax, 0.0l, mesh_info);
+        const AcReal uref  = max(umax,uu_freefall); 
+        const AcReal dt   = host_timestep(uref, 0.0l, mesh_info);
 #endif
 
 #if LFORCING