initial commit

2020-09-03 16:39:23 -06:00
commit f7e51a4b7d
7 changed files with 539 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 3.17)
+project(bench LANGUAGES CXX VERSION 0.1.0.0)
+# Write compile_commands.json into the build tree (used by editors/tooling)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Set a default build type if none was specified
+set(default_build_type "Release")
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting build type to '${default_build_type}' as none was specified.")
+  set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
+      STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+# Sources compiled into the bench library
+set(BENCH_SRCS src/bench.cpp)
+
+
+
+
+# Static support library; its public headers live under include/ and are
+# propagated to every target that links against it
+add_library(bench STATIC ${BENCH_SRCS})
+target_include_directories(bench PUBLIC include)
+# Build the library as strict C++11: no compiler extensions, and fail at
+# configure time if the compiler cannot provide the standard
+set_property(TARGET bench PROPERTY CXX_STANDARD 11)
+set_property(TARGET bench PROPERTY CXX_EXTENSIONS OFF)
+set_property(TARGET bench PROPERTY CXX_STANDARD_REQUIRED ON)
+
+# MPI is optional for the library itself: when it is found, the library is
+# built with BENCH_USE_MPI defined and links against the MPI C++ target, and
+# both settings propagate to consumers through PUBLIC.
+find_package(MPI)
+if (MPI_FOUND)
+    target_compile_definitions(bench PUBLIC BENCH_USE_MPI)
+    target_link_libraries(bench PUBLIC MPI::MPI_CXX)
+
+    # The example programs include <mpi.h> unconditionally, so they can only
+    # be compiled when an MPI installation is available.
+    add_executable(bench-allreduce bin/allreduce.cpp)
+    target_link_libraries(bench-allreduce bench)
+
+    add_executable(bench-pingpong bin/pingpong.cpp)
+    target_link_libraries(bench-pingpong bench)
+
+    add_executable(bench-empty bin/empty.cpp)
+    target_link_libraries(bench-empty bench)
+else()
+    message(STATUS "MPI not found: building the bench library only, skipping the MPI example programs.")
+endif()
--- a/README.md
+++ b/README.md
@@ -0,0 +1,97 @@
+# bench
+
+Prototype C++11 MPI benchmark support library inspired by [google/benchmark](https://github.com/google/benchmark).
+
+## Benchmark Loop
+
+An example ping-pong benchmark (bin/pingpong.cpp)
+
+```c++
+void pingpong(bench::State &state) {
+
+  const int rank = bench::world_rank();
+  const int size = bench::world_size();
+
+  const size_t sz = 1;
+
+  char *sbuf = new char[sz];
+  char *rbuf = new char[sz];
+
+  for (auto _ : state) {
+    if (0 == rank) {
+      MPI_Send(sbuf, sz, MPI_BYTE, 1, 0, MPI_COMM_WORLD);
+      MPI_Recv(rbuf, sz, MPI_BYTE, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    } else if (1 == rank) {
+      MPI_Recv(rbuf, sz, MPI_BYTE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+      MPI_Send(sbuf, sz, MPI_BYTE, 0, 0, MPI_COMM_WORLD);
+    }
+  }
+
+  state.set_bytes_processed(sz);
+  delete[] sbuf;
+  delete[] rbuf;
+}
+
+int main(int argc, char **argv) {
+
+  bench::init(argc, argv);
+  bench::register_bench("pingpong", pingpong)->timing_root_rank()->no_iter_barrier();
+  bench::run_benchmarks();
+  bench::finalize();
+}
+```
+
+The library will automatically determine the number of iterations to run.
+
+Before the `pingpong` function is called, the library will call `MPI_Barrier(MPI_COMM_WORLD)`.
+Then, `pingpong` will be called.
+Setup code happens before the `auto _ : state` loop.
+Each iteration of the loop contributes to the total time.
+After each iteration, an `MPI_Barrier(MPI_COMM_WORLD)` is invoked; its time does not contribute to the total (see `Benchmark::no_iter_barrier()`).
+After the loop, benchmark-specific teardown occurs.
+`timing_root_rank()` says that the reported timing should be tracked just by elapsed time on the root rank.
+`no_iter_barrier()` says that there should be no `MPI_Barrier()` between state iterations.
+
+## Reporting
+
+The reported time is the average ns/iteration.
+If `state.set_bytes_processed` is used, the provided value should be the number of bytes per iteration.
+The reported number of bytes will be bytes / second.
+
+## Benchmark Options
+
+* `Benchmark::timing_max_rank()`: Report the maximum time consumed across all ranks.
+* `Benchmark::timing_root_rank()`: Only record time in rank 0.
+* `Benchmark::no_iter_barrier()`: Do not do an `MPI_Barrier()` between iterations.
+
+## Roadmap
+
+- [ ] Automatic Timing
+  - [x] `timing_root_rank`: only record time in rank 0
+  - [x] `timing_max_rank`: report the maximum time consumed across all ranks
+  - [ ] `timing_wall`: the wall time from when the first rank starts to when the last rank ends
+  - [ ] `timing_aggregate`: aggregate time consumed in each rank
+- [ ] Manual timing
+  - [x] state.pause_timing()
+  - [x] state.resume_timing()
+  - [ ] state.set_iteration_time()
+- [ ] Iteration control
+  - [ ] manual
+  - [ ] automatic
+- [ ] Support running a benchmark over multiple communicators
+  - [ ] Benchmark must take a communicator
+  - [ ] All pairs of ranks
+  - [ ] Specific pairs of ranks
+- [ ] CSV reporter
+- [ ] Add arguments to a benchmark
+- [ ] Add statistics for repeated runs
+  - [ ] trimean
+  - [ ] standard deviation
+  - [ ] min
+  - [ ] max
+- [ ] JSON reporter
+- [ ] Benchmark registration
+  - [ ] static
+    - [ ] Auto-generated main function
+  - [x] function pointer
+  - [ ] lambda function
--- a/bin/allreduce.cpp
+++ b/bin/allreduce.cpp
@@ -0,0 +1,27 @@
+#include "bench/bench.hpp"
+
+#include <mpi.h>
+
+// Benchmark body: times an in-place MPI_Allreduce over MPI_COMM_WORLD on a
+// 1000-byte buffer.
+//
+// MPI only defines the bitwise reductions (MPI_BAND, MPI_BOR, MPI_BXOR) for
+// MPI_BYTE, so MPI_BOR is used here rather than an arithmetic operation such
+// as MPI_SUM.
+void allreduce(bench::State &state) {
+    const int size = bench::world_size();
+    const size_t sz = 1000;
+
+    // value-initialized so the reduction never reads indeterminate bytes
+    char *data = new char[sz]();
+    for (auto _ : state) {
+        MPI_Allreduce(MPI_IN_PLACE, data, sz, MPI_BYTE, MPI_BOR, MPI_COMM_WORLD);
+    }
+
+    // per-iteration byte count: the buffer size times the number of ranks
+    state.set_bytes_processed(sz * size);
+    delete[] data;
+}
+
+// Entry point: registers the allreduce benchmark with max-rank timing and
+// runs it between bench::init() and bench::finalize().
+int main(int argc, char **argv) {
+    bench::init(argc, argv);
+
+    bench::Benchmark *registered = bench::register_bench("allreduce", allreduce);
+    registered->timing_max_rank();
+
+    bench::run_benchmarks();
+    bench::finalize();
+    return 0;
+}
--- a/bin/empty.cpp
+++ b/bin/empty.cpp
@@ -0,0 +1,16 @@
+#include "bench/bench.hpp"
+
+#include <mpi.h>
+
+// Benchmark body with no work inside the timed loop.
+void empty(bench::State &state) {
+  for (auto iteration : state) {
+    (void)iteration; // nothing to measure
+  }
+}
+
+// Entry point: registers the empty benchmark, timed on the root rank with no
+// barrier between iterations, and runs it.
+int main(int argc, char **argv) {
+  bench::init(argc, argv);
+
+  bench::Benchmark *registered = bench::register_bench("empty", empty);
+  registered->timing_root_rank()->no_iter_barrier();
+
+  bench::run_benchmarks();
+  bench::finalize();
+  return 0;
+}
--- a/bin/pingpong.cpp
+++ b/bin/pingpong.cpp
@@ -0,0 +1,38 @@
+#include "bench/bench.hpp"
+
+#include <mpi.h>
+
+#include <iostream>
+
+// Benchmark body: once per iteration, rank 0 sends a 1-byte message to rank 1
+// and waits for the 1-byte reply. Any other rank only runs the empty loop.
+void pingpong(bench::State &state) {
+
+  const int rank = bench::world_rank();
+  const int size = bench::world_size();
+
+  // Ranks 0 and 1 address each other directly, so with fewer than two ranks
+  // the messages would target a rank that does not exist.
+  if (size < 2) {
+    if (0 == rank) {
+      std::cerr << "pingpong requires at least 2 ranks, skipping\n";
+    }
+    return;
+  }
+
+  const size_t sz = 1;
+
+  // value-initialized so MPI_Send never transmits indeterminate bytes
+  char *sbuf = new char[sz]();
+  char *rbuf = new char[sz]();
+
+  for (auto _ : state) {
+    if (0 == rank) {
+      MPI_Send(sbuf, sz, MPI_BYTE, 1, 0, MPI_COMM_WORLD);
+      MPI_Recv(rbuf, sz, MPI_BYTE, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+    } else if (1 == rank) {
+      MPI_Recv(rbuf, sz, MPI_BYTE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+      MPI_Send(sbuf, sz, MPI_BYTE, 0, 0, MPI_COMM_WORLD);
+    }
+  }
+
+  // per-iteration byte count
+  state.set_bytes_processed(sz);
+  delete[] sbuf;
+  delete[] rbuf;
+}
+
+// Entry point: registers the pingpong benchmark, timed on the root rank with
+// no barrier between iterations, and runs it.
+int main(int argc, char **argv) {
+  bench::init(argc, argv);
+
+  bench::Benchmark *registered = bench::register_bench("pingpong", pingpong);
+  registered->timing_root_rank()->no_iter_barrier();
+
+  bench::run_benchmarks();
+  bench::finalize();
+  return 0;
+}
--- a/include/bench/bench.hpp
+++ b/include/bench/bench.hpp
@@ -0,0 +1,231 @@
+#pragma once
+
+#include <chrono>
+#include <cstdint>
+#include <vector>
+
+#ifdef BENCH_USE_MPI
+#include <mpi.h>
+#endif
+
+// Asks the compiler to always inline the annotated function. Uses the
+// GNU-style __attribute__ syntax.
+#define BENCH_ALWAYS_INLINE __attribute__((always_inline))
+
+namespace bench {
+
+// Initializes the library; calls MPI_Init when built with BENCH_USE_MPI.
+void init(int &argc, char **&argv);
+// Shuts the library down; calls MPI_Finalize when built with BENCH_USE_MPI.
+void finalize();
+// Rank of the calling process in MPI_COMM_WORLD, or 0 without BENCH_USE_MPI.
+int world_rank();
+// Number of ranks in MPI_COMM_WORLD, or 1 without BENCH_USE_MPI.
+int world_size();
+// Runs every registered benchmark in registration order; rank 0 prints the
+// results.
+void run_benchmarks();
+
+// Abstract interface for the clock that accumulates a benchmark's run time.
+class Timer {
+public:
+  // Timers are deleted through Timer pointers (see Benchmark), which is only
+  // well-defined when the destructor is virtual.
+  virtual ~Timer() {}
+  // Stop accumulating time.
+  virtual void pause() = 0;
+  // Start (or continue) accumulating time.
+  virtual void resume() = 0;
+  // Post-processing hook; MaxRankTimer uses it to reduce across ranks.
+  virtual void finalize() = 0;
+  // Accumulated time; WallTimer counts it in nanoseconds.
+  virtual double get_elapsed() = 0;
+};
+// Timer backed by std::chrono::high_resolution_clock. Elapsed time is
+// accumulated in nanoseconds across resume()/pause() pairs; a new timer
+// starts out paused.
+class WallTimer : public Timer {
+
+  using Clock = std::chrono::high_resolution_clock;
+  using Duration = std::chrono::nanoseconds;
+  std::chrono::time_point<Clock> start_; // when the current interval began
+
+protected:
+  double elapsed_; // total of all closed intervals, in nanoseconds
+  bool paused_;    // true while no interval is being measured
+
+public:
+  WallTimer() : elapsed_(0), paused_(true) {}
+
+  // Closes the current interval and adds it to the total; no-op when paused.
+  virtual void pause() {
+    if (paused_) {
+      return;
+    }
+    const Duration interval = Clock::now() - start_;
+    elapsed_ += interval.count();
+    paused_ = true;
+  }
+
+  // Opens a new interval; no-op when one is already running.
+  virtual void resume() {
+    if (!paused_) {
+      return;
+    }
+    start_ = Clock::now();
+    paused_ = false;
+  }
+
+  // A purely local timer has nothing to post-process.
+  virtual void finalize() {}
+
+  // Pauses the timer, then returns the accumulated nanoseconds.
+  virtual double get_elapsed() {
+    pause();
+    return elapsed_;
+  }
+};
+
+// WallTimer whose finalize() replaces the local elapsed time with the maximum
+// over all ranks of MPI_COMM_WORLD. Without BENCH_USE_MPI finalize() does
+// nothing.
+class MaxRankTimer : public WallTimer {
+public:
+  virtual void finalize() {
+#ifdef BENCH_USE_MPI
+    // reduce from a copy so the send and receive buffers do not alias
+    double localElapsed = elapsed_;
+    MPI_Allreduce(&localElapsed, &elapsed_, 1, MPI_DOUBLE, MPI_MAX,
+                  MPI_COMM_WORLD);
+#endif
+  }
+};
+
+// Timer that measures nothing and always reports 0 elapsed time. Benchmark
+// uses it as the default timer and on the non-root ranks for
+// timing_root_rank().
+class NoOpTimer : public Timer {
+public:
+  // public, like the overrides of the other timers, so the class is also
+  // usable through a NoOpTimer pointer or reference
+  virtual void pause() {}
+  virtual void resume() {}
+  virtual void finalize() {}
+  virtual double get_elapsed() { return 0; }
+};
+
+// Per-run context handed to a benchmark function. Iterating over it with a
+// range-for drives the timed loop for the configured number of iterations.
+class State {
+public:
+  struct Iterator;
+  friend struct Iterator;
+
+  // iterations:  how many times the range-for body runs
+  // timer:       clock driven by the loop (State never deletes it)
+  // iterBarrier: whether an MPI_Barrier separates iterations
+  State(uint64_t iterations, Timer *timer, bool iterBarrier)
+      : iterations_(iterations), bytesProcessed_(0), error_(false),
+        timer_(timer), iterBarrier_(iterBarrier) {}
+
+  // Range-for support.
+  Iterator begin();
+  Iterator end();
+
+  // Timer control used around the benchmark loop.
+  void start_running() {
+    timer_->resume();
+  }
+  void finish_running() {
+    timer_->pause();
+  }
+
+  // Byte count recorded by the benchmark body.
+  void set_bytes_processed(uint64_t n) {
+    bytesProcessed_ = n;
+  }
+  const uint64_t bytes_processed() const {
+    return bytesProcessed_;
+  }
+
+  // Number of loop iterations this run was configured with.
+  const uint64_t iterations() const {
+    return iterations_;
+  }
+
+private:
+  uint64_t iterations_;
+  uint64_t bytesProcessed_;
+  bool error_;
+  Timer *timer_;
+  bool iterBarrier_;
+};
+
+// Iterator that drives `for (auto _ : state)`. It counts down the remaining
+// iterations and, when iteration barriers are enabled, pauses the timer
+// before the MPI_Barrier between iterations and resumes it afterwards.
+struct State::Iterator {
+  struct Value {};
+
+private:
+  State *parent_;
+
+  // cached to prevent indirect lookup in parent
+  uint64_t remaining_;
+  bool iterBarrier_;
+
+  friend class State;
+  // End sentinel: it only ever appears as the right-hand side of operator!=.
+  // Every member is initialized so that copying the sentinel is well-defined.
+  Iterator() : parent_(nullptr), remaining_(0), iterBarrier_(false) {}
+  explicit Iterator(State *state)
+      : parent_(state), remaining_(state->iterations_),
+        iterBarrier_(state->iterBarrier_) {}
+
+public:
+  Value operator*() const { return Value(); }
+
+  // Loop condition. Returns true while iterations remain; once the counter
+  // reaches zero it stops the parent's timer and returns false. The
+  // right-hand side is not inspected.
+  BENCH_ALWAYS_INLINE bool operator!=(const State::Iterator &rhs) {
+
+#ifdef BENCH_USE_MPI
+    if (iterBarrier_) {
+      // after the first iteration, timing was paused in operator++
+      MPI_Barrier(MPI_COMM_WORLD);
+      resume_timing();
+    }
+#endif
+
+    // This has to be a plain test of the counter: a comma expression such as
+    // `(remaining_ != 0, false)` always yields false and the loop body would
+    // never run.
+    if (remaining_ != 0) {
+      return true;
+    }
+    parent_->finish_running();
+    return false;
+  }
+
+  // Ends one iteration: pauses timing ahead of the barrier (when enabled) and
+  // decrements the remaining-iteration counter.
+  BENCH_ALWAYS_INLINE Iterator &operator++() {
+
+#ifdef BENCH_USE_MPI
+    if (iterBarrier_) {
+      // pause timer before barrier
+      pause_timing();
+    }
+#endif
+
+    --remaining_;
+    return *this;
+  }
+
+  BENCH_ALWAYS_INLINE void pause_timing() { parent_->timer_->pause(); }
+  BENCH_ALWAYS_INLINE void resume_timing() { parent_->timer_->resume(); }
+};
+
+// Returns the iterator that counts down this State's iterations.
+inline State::Iterator State::begin() {
+  State::Iterator first(this);
+  return first;
+}
+
+// Returns the end sentinel. A range-for evaluates end() after begin() and
+// before the first comparison, so the timer is started here.
+inline State::Iterator State::end() {
+  start_running();
+  State::Iterator sentinel;
+  return sentinel;
+}
+
+// Base class for a registered benchmark. It holds the benchmark's name, the
+// timer used to measure it, and whether an MPI_Barrier separates iterations.
+// The timing_* and no_iter_barrier setters return `this` so that calls can be
+// chained, e.g. `register_bench(...)->timing_root_rank()->no_iter_barrier()`.
+class Benchmark {
+public:
+  Benchmark(const char *name)
+      : name_(name), timer_(new NoOpTimer()), iterBarrier_(true) {}
+  virtual ~Benchmark() {}
+  // Executes the benchmark body against `state`.
+  virtual void run(State &state) = 0;
+  const char *name() { return name_; }
+  Timer *timer() { return timer_; }
+  bool iter_barrier() { return iterBarrier_; }
+
+  // only record time at rank 0
+  Benchmark *timing_root_rank() {
+    if (world_rank() == 0) {
+      replace_timer(new WallTimer());
+    } else {
+      replace_timer(new NoOpTimer());
+    }
+    return this;
+  }
+
+  // record time in each rank, and do a max reduction across all ranks
+  Benchmark *timing_max_rank() {
+    replace_timer(new MaxRankTimer());
+    return this;
+  }
+
+  // record the wall time for all ranks to finish
+  Benchmark *timing_wall();
+
+  // record the aggregate time across all ranks
+  Benchmark *timing_aggregate();
+
+  // dont do mpi_barrier between iterations
+  Benchmark *no_iter_barrier() {
+    iterBarrier_ = false;
+    return this;
+  }
+
+private:
+  // Installs `timer` and releases the timer it replaces, so switching timing
+  // modes does not leak the previous one.
+  void replace_timer(Timer *timer) {
+    delete timer_;
+    timer_ = timer;
+  }
+
+  const char *name_;
+  Timer *timer_;
+  bool iterBarrier_;
+};
+
+// Registry of all registered benchmarks, in registration order.
+extern std::vector<Benchmark *> benchmarks;
+
+// Signature of a benchmark body implemented as a plain function.
+typedef void (*Function)(State &);
+
+// Benchmark whose body is a plain function pointer.
+class FunctionBenchmark : public Benchmark {
+  Function fn_; // the benchmark body
+
+public:
+  FunctionBenchmark(const char *name, Function fn) : Benchmark(name), fn_(fn) {}
+
+  // Runs the benchmark body against `state`.
+  virtual void run(State &state);
+};
+
+// Creates a FunctionBenchmark for `fn`, appends it to `benchmarks`, and
+// returns it so that options can be set on it.
+Benchmark *register_bench(const char *name, Function fn);
+
+#if 0
+template <typename Fn> Benchmark *register_bench(const char *name, Fn &&fn) {}
+
+template <class Fn, class... Args>
+Benchmark *RegisterBenchmark(const char *name, Fn &&fn, Args &&... args) {
+  return register_bench(name, [=](State &st) { fn(st, args...); });
+}
+#endif
+
+} // namespace bench
--- a/src/bench.cpp
+++ b/src/bench.cpp
@@ -0,0 +1,90 @@
+#include "bench/bench.hpp"
+
+#ifdef BENCH_USE_MPI
+#include <mpi.h>
+#endif
+
+#include <iostream>
+#include <vector>
+
+namespace bench {
+
+// Initializes the library. With BENCH_USE_MPI this forwards argc/argv to
+// MPI_Init; otherwise there is nothing to set up.
+void init(int &argc, char **&argv) {
+#ifdef BENCH_USE_MPI
+  MPI_Init(&argc, &argv);
+#else
+  (void)argc;
+  (void)argv;
+#endif
+}
+
+// Shuts the library down. With BENCH_USE_MPI this calls MPI_Finalize;
+// otherwise it does nothing.
+void finalize() {
+#if defined(BENCH_USE_MPI)
+  MPI_Finalize();
+#endif
+}
+
+// Rank of this process in MPI_COMM_WORLD; always 0 without BENCH_USE_MPI.
+int world_rank() {
+  int rank = 0;
+#ifdef BENCH_USE_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#endif
+  return rank;
+}
+
+// Number of ranks in MPI_COMM_WORLD; always 1 without BENCH_USE_MPI.
+int world_size() {
+  int size = 1;
+#ifdef BENCH_USE_MPI
+  MPI_Comm_size(MPI_COMM_WORLD, &size);
+#endif
+  return size;
+}
+
+// Runs every registered benchmark once, in registration order, with a fixed
+// iteration count. Rank 0 prints one result line per benchmark: the average
+// nanoseconds per iteration and, if the benchmark reported a byte count, the
+// resulting bytes per second.
+void run_benchmarks() {
+
+  for (Benchmark *benchmark : benchmarks) {
+    if (0 == world_rank()) {
+      std::cerr << "running " << benchmark->name() << "\n";
+    }
+
+    // estimate the time per iteration
+
+    // decide how many iterations to run
+    const uint64_t iters = 10000;
+
+    Timer *timer = benchmark->timer();
+    State state(iters, timer, benchmark->iter_barrier());
+
+#ifdef BENCH_USE_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+
+    benchmark->run(state);
+
+    // Let the timer post-process its measurement. This must run on every
+    // rank, not just the reporting one, because MaxRankTimer::finalize()
+    // performs a collective reduction.
+    timer->finalize();
+
+    /*reporter
+     */
+    if (world_rank() == 0) {
+
+      const double nsPerIter = timer->get_elapsed() / state.iterations();
+      const double sPerIter = nsPerIter / 1e9;
+      const double bytes = state.bytes_processed();
+
+      std::cout << benchmark->name() << ": " << nsPerIter << "ns";
+      // skip the throughput when no time was recorded, to avoid dividing by
+      // zero
+      if (state.bytes_processed() && sPerIter > 0) {
+        std::cout << " " << bytes / sPerIter << "B/s";
+      }
+      std::cout << "\n";
+    }
+  }
+}
+
+// Wraps `fn` in a FunctionBenchmark named `name`, appends it to the global
+// registry, and returns it so the caller can configure it.
+Benchmark *register_bench(const char *name, Function fn) {
+  Benchmark *created = new FunctionBenchmark(name, fn);
+  benchmarks.push_back(created);
+  return created;
+}
+
+// Invokes the wrapped function with the given state.
+void FunctionBenchmark::run(State &state) {
+  (*fn_)(state);
+}
+
+// Definition of the benchmark registry declared `extern` in bench/bench.hpp;
+// register_bench() appends to it and run_benchmarks() iterates over it.
+/*extern*/ std::vector<Benchmark *> benchmarks;
+
+} // namespace bench