From abe4dfb4fe00d4892af73761a6551eab81d3a6aa Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 29 Jul 2019 15:21:15 +0300
Subject: [PATCH 1/4] ac_mkbuilddir.sh did not stop if the directory specified
 by the user did not exist. This lead to messing up the base astaroth
 directory with temporary cmake files. Added -p flag to mkdir to create parent
 directories if necessary to avoid this

---
 scripts/ac_mkbuilddir.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ac_mkbuilddir.sh b/scripts/ac_mkbuilddir.sh
index eac417e..2874168 100755
--- a/scripts/ac_mkbuilddir.sh
+++ b/scripts/ac_mkbuilddir.sh
@@ -61,7 +61,7 @@ done
 
 echo "Creating build directory: ${BUILD_DIR}"
 
-mkdir ${BUILD_DIR}
+mkdir -p ${BUILD_DIR}
 
 cd ${BUILD_DIR}
 

From 9796d5e981431dbb8a017f1f8654f0dd31f232a9 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Mon, 29 Jul 2019 15:35:17 +0300
Subject: [PATCH 2/4] The previous commit to ac_mkbuilddir.sh was not enough.
 Added a line that makes the script to stop if any of the commands fail to
 avoid cluttering the base astaroth directory. In my case the issue was
 permission denied when trying to create a project directory in /MYSCRATCH
 (system root directory) instead of MYSCRATCH (astaroth/MYSCRATCH)

---
 scripts/ac_mkbuilddir.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/ac_mkbuilddir.sh b/scripts/ac_mkbuilddir.sh
index 2874168..da5bc1a 100755
--- a/scripts/ac_mkbuilddir.sh
+++ b/scripts/ac_mkbuilddir.sh
@@ -5,6 +5,8 @@ then
        exit 1
 fi
 
+# Exit if any of the following commands fail
+set -e
 
 TIARA_SETUP_DEFAULT=""
 DOUBLE_DEFAULT="OFF"

From fdc1e7333c621a787a4a090b109cd7ec15c55991 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@protonmail.com>
Date: Tue, 30 Jul 2019 09:10:06 +0000
Subject: [PATCH 3/4] Added macros for getting int3 and AcReal3 device
 constants from within kernels (and DSL).

---
 src/core/device.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/core/device.cu b/src/core/device.cu
index 7b624e3..005d02f 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -43,7 +43,9 @@ __constant__ AcMeshInfo d_mesh_info;
 __constant__ int3 d_multigpu_offset;
 __constant__ Grid globalGrid;
 #define DCONST_INT(X) (d_mesh_info.int_params[X])
+#define DCONST_INT3(X) (d_mesh_info.int3_params[X])
 #define DCONST_REAL(X) (d_mesh_info.real_params[X])
+#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
 #define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
 #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
 #include "kernels/kernels.cuh"

From 69deef66fe5c9285b9b19a1e7fa56dcd8e87cf40 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Tue, 30 Jul 2019 14:28:18 +0300
Subject: [PATCH 4/4] Added sum reduction. NOTE: Scalar sum does not pass the
 automated test but vector sum does. I couldn't see anything wrong with the
 code itself and I strongly suspect that the failures are caused by loss of
 precision due to summing a huge amount of numbers of different magnitudes.
 However I'm not yet completely sure. Something like the Kahan summation
 algorithm might be useful if the errors are really caused by fp arithmetic.

---
 include/astaroth_defines.h           |  9 +++-
 src/core/astaroth.cu                 |  3 +-
 src/core/kernels/kernels.cuh         |  8 ++++
 src/standalone/model/model_reduce.cc | 66 ++++++++++++++--------------
 4 files changed, 50 insertions(+), 36 deletions(-)

diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h
index ac9804b..240ed86 100644
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -77,7 +77,14 @@ typedef struct {
 
 typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
 
-typedef enum { RTYPE_MAX, RTYPE_MIN, RTYPE_RMS, RTYPE_RMS_EXP, NUM_REDUCTION_TYPES } ReductionType;
+typedef enum {
+    RTYPE_MAX,
+    RTYPE_MIN,
+    RTYPE_RMS,
+    RTYPE_RMS_EXP,
+    RTYPE_SUM,
+    NUM_REDUCTION_TYPES
+} ReductionType;
 
 typedef enum {
     STREAM_DEFAULT,
diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu
index fadf4c0..9abc010 100644
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -474,7 +474,7 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
         else if (rtype == RTYPE_MIN) {
             res = min(res, results[i]);
         }
-        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
+        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
             res = sum(res, results[i]);
         }
         else {
@@ -486,7 +486,6 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
         const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
         res                = sqrt(inv_n * res);
     }
-
     return res;
 }
 
diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh
index 8977d07..52b48f6 100644
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -893,6 +893,10 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& st
         kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
         kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
         kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
     } else {
         ERROR("Unrecognized rtype");
     }
@@ -944,6 +948,10 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& sta
         kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
         kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
         kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_SUM) {
+        kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
     } else {
         ERROR("Unrecognized rtype");
     }
diff --git a/src/standalone/model/model_reduce.cc b/src/standalone/model/model_reduce.cc
index 6d48c4b..ae092c9 100644
--- a/src/standalone/model/model_reduce.cc
+++ b/src/standalone/model/model_reduce.cc
@@ -68,8 +68,7 @@ exp_squared(const ModelScalar& a, const ModelScalar& b, const ModelScalar& c) {
 // clang-format on
 
 ModelScalar
-model_reduce_scal(const ModelMesh& mesh, const ReductionType& rtype,
-                  const VertexBufferHandle& a)
+model_reduce_scal(const ModelMesh& mesh, const ReductionType& rtype, const VertexBufferHandle& a)
 {
     ReduceInitialScalFunc reduce_initial;
     ReduceFunc reduce;
@@ -95,30 +94,31 @@ model_reduce_scal(const ModelMesh& mesh, const ReductionType& rtype,
         reduce         = sum;
         solve_mean     = true;
         break;
+    case RTYPE_SUM:
+        reduce_initial = length;
+        reduce         = sum;
+        break;
     default:
         ERROR("Unrecognized RTYPE");
     }
 
-    const int initial_idx = acVertexBufferIdx(
-        mesh.info.int_params[AC_nx_min], mesh.info.int_params[AC_ny_min],
-        mesh.info.int_params[AC_nz_min], mesh.info);
+    const int initial_idx = acVertexBufferIdx(mesh.info.int_params[AC_nx_min],
+                                              mesh.info.int_params[AC_ny_min],
+                                              mesh.info.int_params[AC_nz_min], mesh.info);
 
     ModelScalar res;
     if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
         res = reduce_initial(mesh.vertex_buffer[a][initial_idx]);
     else
-        res = .0f;
+        res = 0;
 
-    for (int k = mesh.info.int_params[AC_nz_min];
-         k < mesh.info.int_params[AC_nz_max]; ++k) {
-        for (int j = mesh.info.int_params[AC_ny_min];
-             j < mesh.info.int_params[AC_ny_max]; ++j) {
-            for (int i = mesh.info.int_params[AC_nx_min];
-                 i < mesh.info.int_params[AC_nx_max]; ++i) {
+    for (int k = mesh.info.int_params[AC_nz_min]; k < mesh.info.int_params[AC_nz_max]; ++k) {
+        for (int j = mesh.info.int_params[AC_ny_min]; j < mesh.info.int_params[AC_ny_max]; ++j) {
+            for (int i = mesh.info.int_params[AC_nx_min]; i < mesh.info.int_params[AC_nx_max];
+                 ++i) {
                 const int idx              = acVertexBufferIdx(i, j, k, mesh.info);
-                const ModelScalar curr_val = reduce_initial(
-                    mesh.vertex_buffer[a][idx]);
-                res = reduce(res, curr_val);
+                const ModelScalar curr_val = reduce_initial(mesh.vertex_buffer[a][idx]);
+                res                        = reduce(res, curr_val);
             }
         }
     }
@@ -133,9 +133,8 @@ model_reduce_scal(const ModelMesh& mesh, const ReductionType& rtype,
 }
 
 ModelScalar
-model_reduce_vec(const ModelMesh& mesh, const ReductionType& rtype,
-                 const VertexBufferHandle& a, const VertexBufferHandle& b,
-                 const VertexBufferHandle& c)
+model_reduce_vec(const ModelMesh& mesh, const ReductionType& rtype, const VertexBufferHandle& a,
+                 const VertexBufferHandle& b, const VertexBufferHandle& c)
 {
     // ModelScalar (*reduce_initial)(ModelScalar, ModelScalar, ModelScalar);
     ReduceInitialVecFunc reduce_initial;
@@ -162,33 +161,34 @@ model_reduce_vec(const ModelMesh& mesh, const ReductionType& rtype,
         reduce         = sum;
         solve_mean     = true;
         break;
+    case RTYPE_SUM:
+        reduce_initial = length;
+        reduce         = sum;
+        break;
     default:
         ERROR("Unrecognized RTYPE");
     }
 
-    const int initial_idx = acVertexBufferIdx(
-        mesh.info.int_params[AC_nx_min], mesh.info.int_params[AC_ny_min],
-        mesh.info.int_params[AC_nz_min], mesh.info);
+    const int initial_idx = acVertexBufferIdx(mesh.info.int_params[AC_nx_min],
+                                              mesh.info.int_params[AC_ny_min],
+                                              mesh.info.int_params[AC_nz_min], mesh.info);
 
     ModelScalar res;
     if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
-        res = reduce_initial(mesh.vertex_buffer[a][initial_idx],
-                             mesh.vertex_buffer[b][initial_idx],
+        res = reduce_initial(mesh.vertex_buffer[a][initial_idx], mesh.vertex_buffer[b][initial_idx],
                              mesh.vertex_buffer[c][initial_idx]);
     else
         res = 0;
 
-    for (int k = mesh.info.int_params[AC_nz_min];
-         k < mesh.info.int_params[AC_nz_max]; k++) {
-        for (int j = mesh.info.int_params[AC_ny_min];
-             j < mesh.info.int_params[AC_ny_max]; j++) {
-            for (int i = mesh.info.int_params[AC_nx_min];
-                 i < mesh.info.int_params[AC_nx_max]; i++) {
+    for (int k = mesh.info.int_params[AC_nz_min]; k < mesh.info.int_params[AC_nz_max]; k++) {
+        for (int j = mesh.info.int_params[AC_ny_min]; j < mesh.info.int_params[AC_ny_max]; j++) {
+            for (int i = mesh.info.int_params[AC_nx_min]; i < mesh.info.int_params[AC_nx_max];
+                 i++) {
                 const int idx              = acVertexBufferIdx(i, j, k, mesh.info);
-                const ModelScalar curr_val = reduce_initial(
-                    mesh.vertex_buffer[a][idx], mesh.vertex_buffer[b][idx],
-                    mesh.vertex_buffer[c][idx]);
-                res = reduce(res, curr_val);
+                const ModelScalar curr_val = reduce_initial(mesh.vertex_buffer[a][idx],
+                                                            mesh.vertex_buffer[b][idx],
+                                                            mesh.vertex_buffer[c][idx]);
+                res                        = reduce(res, curr_val);
             }
         }
     }