diff --git a/CMakeLists.txt b/CMakeLists.txt
index a5a514b..e1e762f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,6 +96,11 @@ if (BUILD_SAMPLES)
add_subdirectory(samples/cpptest)
add_subdirectory(samples/mpitest)
add_subdirectory(samples/benchmark)
+ add_subdirectory(samples/mpi_reduce_bench)
+ add_custom_target(copy-mpibench-script ALL
+ COMMAND ${CMAKE_COMMAND} -E copy
+ ${CMAKE_SOURCE_DIR}/samples/mpi_reduce_bench/mpibench.sh
+ ${CMAKE_CURRENT_BINARY_DIR}/mpibench.sh)
add_subdirectory(samples/bwtest)
add_subdirectory(samples/genbenchmarkscripts)
endif()
diff --git a/samples/mpi_reduce_bench/CMakeLists.txt b/samples/mpi_reduce_bench/CMakeLists.txt
new file mode 100644
index 0000000..b04f80a
--- /dev/null
+++ b/samples/mpi_reduce_bench/CMakeLists.txt
@@ -0,0 +1,3 @@
+## benchmark
+add_executable(mpi_reduce_bench main.cc)
+target_link_libraries(mpi_reduce_bench astaroth_core astaroth_utils)
diff --git a/samples/mpi_reduce_bench/main.cc b/samples/mpi_reduce_bench/main.cc
new file mode 100644
index 0000000..7c279d3
--- /dev/null
+++ b/samples/mpi_reduce_bench/main.cc
@@ -0,0 +1,135 @@
+/*
+ Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
+
+ This file is part of Astaroth.
+
+ Astaroth is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Astaroth is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Astaroth. If not, see .
+*/
+/**
+ Running: benchmark -np
+*/
+#include "astaroth.h"
+#include "astaroth_utils.h"
+
+#include "errchk.h"
+#include "timer_hires.h"
+
+#if AC_MPI_ENABLED
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+int
+main(int argc, char** argv)
+{
+ MPI_Init(NULL, NULL);
+ int nprocs, pid;
+ MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid);
+
+ // CPU alloc
+ AcMeshInfo info;
+ acLoadConfig(AC_DEFAULT_CONFIG, &info);
+
+ char* benchmark_label;
+
+ if (argc > 1){
+ benchmark_label = argv[1];
+ } else {
+ auto timestamp = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+ benchmark_label = std::ctime(×tamp);
+ benchmark_label[strcspn(benchmark_label, "\n")] = '\0';
+ }
+
+ //clang-format off
+ std::vector scalarReductionTests {
+ acCreateScalReductionTestCase("Scalar MAX" , VTXBUF_UUX, RTYPE_MAX),
+ acCreateScalReductionTestCase("Scalar MIN" , VTXBUF_UUX, RTYPE_MIN),
+ acCreateScalReductionTestCase("Scalar RMS" , VTXBUF_UUX, RTYPE_RMS),
+ acCreateScalReductionTestCase("Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP),
+ acCreateScalReductionTestCase("Scalar SUM" , VTXBUF_UUX, RTYPE_SUM)
+ };
+
+ std::vector vectorReductionTests {
+ acCreateVecReductionTestCase("Vector MAX" , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
+ acCreateVecReductionTestCase("Vector MIN" , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
+ acCreateVecReductionTestCase("Vector RMS" , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
+ acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS_EXP),
+ acCreateVecReductionTestCase("Vector SUM" , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM)
+ };
+ //clang-format on
+
+ // GPU alloc & compute
+ acGridInit(info);
+
+ for (auto& testCase : scalarReductionTests) {
+ // Percentiles
+ const size_t num_iters = 100;
+ const double nth_percentile = 0.90;
+ std::vector results; // ms
+ results.reserve(num_iters);
+
+ // Benchmark
+ Timer t;
+
+ for (size_t i = 0; i < num_iters; ++i) {
+ acGridSynchronizeStream(STREAM_ALL);
+ timer_reset(&t);
+ acGridSynchronizeStream(STREAM_ALL);
+ acGridReduceScal(STREAM_DEFAULT, testCase.rtype, testCase.vtxbuf, &testCase.candidate);
+ acGridSynchronizeStream(STREAM_ALL);
+ results.push_back(timer_diff_nsec(t) / 1e6);
+ acGridSynchronizeStream(STREAM_ALL);
+ }
+
+ if (!pid) {
+ std::sort(results.begin(), results.end(),
+ [](const double& a, const double& b) { return a < b; });
+ fprintf(stdout,
+ "Reduction time %g ms (%gth "
+ "percentile)--------------------------------------\n",
+ results[nth_percentile * num_iters], 100 * nth_percentile);
+
+ char path[4096] = "mpi_reduction_benchmark.csv";
+
+ FILE* fp = fopen(path, "a");
+ ERRCHK_ALWAYS(fp);
+
+ // Format
+ // benchmark label, test label, nprocs, measured (ms)
+ fprintf(fp, "\"%s\",\"%s\", %d, %g\n", benchmark_label, testCase.label, nprocs, results[nth_percentile * num_iters]);
+ fclose(fp);
+ }
+ }
+ acGridQuit();
+ MPI_Finalize();
+ return EXIT_SUCCESS;
+}
+
+#else
+int
+main(void)
+{
+ printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with "
+ "cmake -DMPI_ENABLED=ON .. to enable.\n");
+ return EXIT_FAILURE;
+}
+#endif // AC_MPI_ENABLES
diff --git a/samples/mpi_reduce_bench/mpibench.sh b/samples/mpi_reduce_bench/mpibench.sh
new file mode 100755
index 0000000..6517bf7
--- /dev/null
+++ b/samples/mpi_reduce_bench/mpibench.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+#defaults
+default_num_procs=8
+default_num_nodes=2
+
+num_procs=$default_num_procs
+num_nodes=$default_num_nodes
+
+script_name=$0
+
+print_usage(){
+ echo "Usage: $script_name [Options]"
+ echo "\tRuns mpi_reduce_bench, which will write benchmark results"
+ echo "Options:"
+ echo "\t -n "
+ echo "\t\t-n option to slurm, default=$default_num_procs"
+ echo "\t -N "
+ echo "\t\t-N option to slurm, default=$default_num_nodes"
+ echo "\t -t "
+ echo "\t\tA benchmark tag that will be added to the mpi_reduction_benchmark.csv file"
+ echo "\t\tBy default the current git HEAD short hash will be used as a tag"
+}
+
+while getopts n:N:t: opt
+do
+ case "$opt" in
+ n)
+ if [ $OPTARG ]
+ then
+ num_procs=$OPTARG
+ else
+ print_usage
+ exit 1
+ fi
+ ;;
+ N)
+ if [ $OPTARG ]
+ then
+ num_nodes=$OPTARG
+ else
+ print_usage
+ exit 1
+ fi
+ ;;
+ t)
+ if [ $OPTARG ]
+ then
+ benchmark_label=$OPTARG
+ else
+ print_usage
+ exit 1
+ fi
+ ;;
+ esac
+done
+
+if [ -z "$benchmark_label" ]
+then
+ benchmark_label=$(git rev-parse --short HEAD)
+fi
+set -x
+srun --account=project_2000403 --gres=gpu:v100:4 --mem=48000 -t 00:14:59 -p gpu -n ${num_procs} -N ${num_nodes} ./mpi_reduce_bench ${benchmark_label}