initial commit

This commit is contained in:
Carl Pearson
2020-09-03 16:39:23 -06:00
commit f7e51a4b7d
7 changed files with 539 additions and 0 deletions

40
CMakeLists.txt Normal file
View File

@@ -0,0 +1,40 @@
# Build script for the `bench` static library and its example benchmark
# executables (bench-allreduce, bench-pingpong, bench-empty).
cmake_minimum_required(VERSION 3.17)
project(bench LANGUAGES CXX VERSION 0.1.0.0)
# Write compile_commands.json into the build directory for editor/tooling use.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# Set a default build type if none was specified
set(default_build_type "Release")
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting build type to '${default_build_type}' as none was specified.")
set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
STRING "Choose the type of build." FORCE)
# Set the possible values of build type for cmake-gui
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
"Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
# The support library itself: one translation unit plus the public headers
# under include/.
set(BENCH_SRCS src/bench.cpp)
add_library(bench STATIC ${BENCH_SRCS})
target_include_directories(bench PUBLIC include)
# Strict C++11 without compiler extensions.
set_property(TARGET bench PROPERTY CXX_STANDARD 11)
set_property(TARGET bench PROPERTY CXX_EXTENSIONS OFF)
set_property(TARGET bench PROPERTY CXX_STANDARD_REQUIRED ON)
# MPI is optional for the library: when it is found, BENCH_USE_MPI is defined
# and MPI is linked for the library and (PUBLIC) for everything that links it.
find_package(MPI)
if (MPI_FOUND)
target_compile_definitions(bench PUBLIC -DBENCH_USE_MPI)
target_link_libraries(bench PUBLIC MPI::MPI_CXX)
endif()
# Example benchmark executables.
# NOTE(review): the sources under bin/ include <mpi.h> unconditionally, so
# these targets presumably fail to build when MPI was not found - confirm and
# consider guarding them with MPI_FOUND.
add_executable(bench-allreduce bin/allreduce.cpp)
target_link_libraries(bench-allreduce bench)
add_executable(bench-pingpong bin/pingpong.cpp)
target_link_libraries(bench-pingpong bench)
add_executable(bench-empty bin/empty.cpp)
target_link_libraries(bench-empty bench)

97
README.md Normal file
View File

@@ -0,0 +1,97 @@
# bench
Prototype C++11 MPI benchmark support library inspired by [google/benchmark](https://github.com/google/benchmark).
## Benchmark Loop
An example ping-pong benchmark (bin/pingpong.cpp)
```c++
void pingpong(bench::State &state) {
const int rank = bench::world_rank();
const int size = bench::world_size();
const size_t sz = 1;
char *sbuf = new char[sz];
char *rbuf = new char[sz];
for (auto _ : state) {
if (0 == rank) {
MPI_Send(sbuf, sz, MPI_BYTE, 1, 0, MPI_COMM_WORLD);
MPI_Recv(rbuf, sz, MPI_BYTE, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
} else if (1 == rank) {
MPI_Recv(rbuf, sz, MPI_BYTE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Send(sbuf, sz, MPI_BYTE, 0, 0, MPI_COMM_WORLD);
}
}
state.set_bytes_processed(sz);
delete[] sbuf;
delete[] rbuf;
}
int main(int argc, char **argv) {
bench::init(argc, argv);
bench::register_bench("pingpong", pingpong)->timing_root_rank()->no_iter_barrier();
bench::run_benchmarks();
bench::finalize();
}
```
The library will automatically determine the number of iterations to run.
Before the `pingpong` function is called, the library will call `MPI_Barrier(MPI_COMM_WORLD)`.
Then, `pingpong` will be called.
Setup code happens before the `auto _ : state` loop.
Each iteration of the loop contributes to the total time.
After each iteration, an `MPI_Barrier(MPI_COMM_WORLD)` is invoked; its time does not contribute to the total (see `Benchmark::no_iter_barrier()`).
After the loop, benchmark-specific teardown occurs.
`timing_root_rank()` says that the reported timing should be tracked just by elapsed time on the root rank.
`no_iter_barrier()` says that there should be no `MPI_Barrier()` between state iterations.
## Reporting
The reported time is the average ns/iteration.
If `state.set_bytes_processed` is used, the provided value should be the number of bytes per iteration.
The reported number of bytes will be bytes / second.
## Benchmark Options
* `Benchmark::timing_max_rank()`: Report the maximum time consumed across all ranks.
* `Benchmark::timing_root_rank()`: Only record time in rank 0.
* `Benchmark::no_iter_barrier()`: Do not do an `MPI_Barrier()` between iterations.
## Roadmap
- [ ] Automatic Timing
- [x] `timing_root_rank`: only record time in rank 0
- [x] `timing_max_rank`: report the maximum time consumed across all ranks
- [ ] `timing_wall`: the wall time from the first rank starts to the last rank ends
- [ ] `timing_aggregate`: aggregate time consumed in each rank
- [ ] Manual timing
- [x] state.pause_timing()
- [x] state.resume_timing()
- [ ] state.set_iteration_time()
- [ ] Iteration control
- [ ] manual
- [ ] automatic
- [ ] Support running a benchmark over multiple communicators
- [ ] Benchmark must take a communicator
- [ ] All pairs of ranks
- [ ] Specific pairs of ranks
- [ ] CSV reporter
- [ ] Add arguments to a benchmark
- [ ] Add statistics for repeated runs
- [ ] trimean
- [ ] standard deviation
- [ ] min
- [ ] max
- [ ] JSON reporter
- [ ] Benchmark registration
- [ ] static
- [ ] Auto-generated main function
- [x] function pointer
- [ ] lambda function

27
bin/allreduce.cpp Normal file
View File

@@ -0,0 +1,27 @@
#include "bench/bench.hpp"
#include <mpi.h>
// Benchmark body: times an in-place MPI_Allreduce (sum) over a buffer of `sz`
// one-byte integers across MPI_COMM_WORLD.
//
// state: the benchmark loop/timing state supplied by the library; iterating
//        over it drives the timed loop.
void allreduce(bench::State &state) {
  const int size = bench::world_size();
  const size_t sz = 1000;

  // MPI_SUM is not defined for MPI_BYTE (only the bitwise reductions are), so
  // the data is reduced as MPI_SIGNED_CHAR instead. The trailing () value-
  // initializes the buffer so the reduction never reads indeterminate memory.
  signed char *data = new signed char[sz]();

  for (auto _ : state) {
    (void)_; // the loop value carries no information
    MPI_Allreduce(MPI_IN_PLACE, data, static_cast<int>(sz), MPI_SIGNED_CHAR,
                  MPI_SUM, MPI_COMM_WORLD);
  }

  // bytes per iteration: every rank contributes `sz` bytes
  state.set_bytes_processed(sz * size);
  delete[] data;
}
int main(int argc, char **argv) {
bench::init(argc, argv);
bench::register_bench("allreduce", allreduce)->timing_max_rank();
bench::run_benchmarks();
bench::finalize();
}

16
bin/empty.cpp Normal file
View File

@@ -0,0 +1,16 @@
#include "bench/bench.hpp"
#include <mpi.h>
// Benchmark body with nothing inside the timed loop.
//
// state: the benchmark loop/timing state supplied by the library.
void empty(bench::State &state) {
  for (auto _ : state) {
    // intentionally empty; silence the unused-variable warning
    (void)_;
  }
}
// Entry point for the empty-loop benchmark executable: timing is taken on the
// root rank and no barrier is placed between iterations.
int main(int argc, char **argv) {
  bench::init(argc, argv);

  bench::register_bench("empty", empty)
      ->timing_root_rank()
      ->no_iter_barrier();

  bench::run_benchmarks();
  bench::finalize();
  return 0;
}

38
bin/pingpong.cpp Normal file
View File

@@ -0,0 +1,38 @@
#include "bench/bench.hpp"
#include <mpi.h>
#include <iostream>
// Benchmark body: rank 0 sends `sz` bytes to rank 1 and waits for the reply;
// rank 1 does the mirror image. Any other rank only spins through the loop.
//
// state: the benchmark loop/timing state supplied by the library.
void pingpong(bench::State &state) {
  const int rank = bench::world_rank();
  const int size = bench::world_size();

  // The exchange is hard-wired to ranks 0 and 1. With fewer than two ranks,
  // rank 0 would address a rank that does not exist, so skip the benchmark.
  // `size` is identical on every rank, so all ranks take the same path.
  if (size < 2) {
    if (0 == rank) {
      std::cerr << "pingpong requires at least 2 ranks, skipping\n";
    }
    return;
  }

  const size_t sz = 1;
  const int count = static_cast<int>(sz);
  // value-initialized so the bytes that are sent are well defined
  char *sbuf = new char[sz]();
  char *rbuf = new char[sz]();

  for (auto _ : state) {
    (void)_; // the loop value carries no information
    if (0 == rank) {
      MPI_Send(sbuf, count, MPI_BYTE, 1, 0, MPI_COMM_WORLD);
      MPI_Recv(rbuf, count, MPI_BYTE, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    } else if (1 == rank) {
      MPI_Recv(rbuf, count, MPI_BYTE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      MPI_Send(sbuf, count, MPI_BYTE, 0, 0, MPI_COMM_WORLD);
    }
  }

  // bytes per iteration
  state.set_bytes_processed(sz);
  delete[] sbuf;
  delete[] rbuf;
}
// Entry point for the ping-pong benchmark executable: timing is taken on the
// root rank and no barrier is placed between iterations.
int main(int argc, char **argv) {
  bench::init(argc, argv);

  bench::register_bench("pingpong", pingpong)
      ->timing_root_rank()
      ->no_iter_barrier();

  bench::run_benchmarks();
  bench::finalize();
  return 0;
}

231
include/bench/bench.hpp Normal file
View File

@@ -0,0 +1,231 @@
#pragma once
#include <chrono>
#include <cstdint>
#include <vector>
#ifdef BENCH_USE_MPI
#include <mpi.h>
#endif
#define BENCH_ALWAYS_INLINE __attribute__((always_inline))
namespace bench {
// Initialize the library. When built with BENCH_USE_MPI this calls MPI_Init
// with the given arguments; otherwise it does nothing.
void init(int &argc, char **&argv);
// Shut the library down. When built with BENCH_USE_MPI this calls
// MPI_Finalize; otherwise it does nothing.
void finalize();
// Rank of the calling process in MPI_COMM_WORLD, or 0 when built without MPI.
int world_rank();
// Number of ranks in MPI_COMM_WORLD, or 1 when built without MPI.
int world_size();
// Run every registered benchmark in registration order; rank 0 prints the
// per-iteration time (and bytes/second when bytes were recorded).
void run_benchmarks();
// Abstract interface for the timers that back a benchmark run.
class Timer {
public:
  // Timers are owned and deleted through a Timer*, so the destructor must be
  // virtual for the derived destructor to run.
  virtual ~Timer() {}

  // Stop accumulating time.
  virtual void pause() = 0;
  // Start (or continue) accumulating time.
  virtual void resume() = 0;
  // Hook for post-processing of the measured time once a run is complete.
  virtual void finalize() = 0;
  // The measured time.
  virtual double get_elapsed() = 0;
};
// Timer that accumulates wall-clock time, counted in nanoseconds, over every
// interval between a resume() and the following pause(). Starts out paused.
class WallTimer : public Timer {
  using Clock = std::chrono::high_resolution_clock;
  using Duration = std::chrono::nanoseconds;

  // when the currently running interval began
  std::chrono::time_point<Clock> start_;

protected:
  double elapsed_; // accumulated nanoseconds
  bool paused_;

public:
  WallTimer() : elapsed_(0), paused_(true) {}

  // Close the running interval and add it to the total; no-op when paused.
  virtual void pause() {
    if (paused_) {
      return;
    }
    const Duration delta = Clock::now() - start_;
    elapsed_ += delta.count();
    paused_ = true;
  }

  // Open a new interval; no-op when already running.
  virtual void resume() {
    if (!paused_) {
      return;
    }
    start_ = Clock::now();
    paused_ = false;
  }

  // Nothing to post-process for a purely local timer.
  virtual void finalize() {}

  // Accumulated nanoseconds. Note that this pauses the timer first.
  virtual double get_elapsed() {
    pause();
    return elapsed_;
  }
};
// Wall timer whose finalize() replaces the locally accumulated time with the
// maximum accumulated time over all ranks of MPI_COMM_WORLD. Without
// BENCH_USE_MPI, finalize() leaves the local time untouched.
class MaxRankTimer : public WallTimer {
public:
  virtual void finalize() {
#ifdef BENCH_USE_MPI
    // separate send buffer: the result is written back into elapsed_
    double localElapsed = elapsed_;
    MPI_Allreduce(&localElapsed, &elapsed_, 1, MPI_DOUBLE, MPI_MAX,
                  MPI_COMM_WORLD);
#endif
  }
};
// Timer that measures nothing: every operation is a no-op and the elapsed
// time is always zero.
class NoOpTimer : public Timer {
  virtual void pause() {
    // nothing to stop
  }
  virtual void resume() {
    // nothing to start
  }
  virtual void finalize() {
    // nothing to post-process
  }
  virtual double get_elapsed() {
    return 0;
  }
};
// State of one benchmark run, handed to the benchmark function. Iterating
// over it (`for (auto _ : state)`) executes the timed loop.
class State {
public:
  struct Iterator;
  friend struct Iterator;

  // iterations:  how many times the timed loop runs
  // timer:       timer that records the run; not owned by the State
  // iterBarrier: whether the iterator synchronizes ranks between iterations
  State(uint64_t iterations, Timer *timer, bool iterBarrier)
      : iterations_(iterations),
        bytesProcessed_(0),
        error_(false),
        timer_(timer),
        iterBarrier_(iterBarrier) {}

  Iterator begin();
  Iterator end();

  // Resume / pause the underlying timer.
  void start_running() {
    timer_->resume();
  }
  void finish_running() {
    timer_->pause();
  }

  // Record the number of bytes the benchmark handles.
  void set_bytes_processed(uint64_t n) {
    bytesProcessed_ = n;
  }
  const uint64_t bytes_processed() const {
    return bytesProcessed_;
  }

  const uint64_t iterations() const {
    return iterations_;
  }

private:
  uint64_t iterations_;
  uint64_t bytesProcessed_;
  bool error_;
  Timer *timer_;
  bool iterBarrier_;
};
// Iterator that drives the timed loop of a State. It counts down the
// requested number of iterations and, when the iteration barrier is enabled
// (MPI builds only), keeps the barrier out of the measured time.
struct State::Iterator {
  // Placeholder produced by dereferencing; it carries no data.
  struct Value {};

private:
  State *parent_;
  // cached to prevent indirect lookup in parent
  uint64_t remaining_;
  bool iterBarrier_;
  friend class State;

  // End sentinel. All members are initialized so copying it is well defined.
  Iterator() : parent_(nullptr), remaining_(0), iterBarrier_(false) {}

  explicit Iterator(State *state)
      : parent_(state), remaining_(state->iterations_),
        iterBarrier_(state->iterBarrier_) {}

public:
  Value operator*() const { return Value(); }

  // Loop condition: true while iterations remain. The argument is ignored;
  // termination depends only on this iterator's remaining count. When the
  // count reaches zero the parent's timer is paused.
  BENCH_ALWAYS_INLINE bool operator!=(const State::Iterator &rhs) {
    (void)rhs;
#ifdef BENCH_USE_MPI
    if (iterBarrier_) {
      // timing was paused in operator++
      MPI_Barrier(MPI_COMM_WORLD);
      resume_timing();
    }
#endif
    // The previous condition `(remaining_ != 0, false)` used the comma
    // operator and therefore always evaluated to false, so the loop body
    // never executed.
    if (remaining_ != 0) {
      return true;
    }
    parent_->finish_running();
    return false;
  }

  // Advance to the next iteration.
  BENCH_ALWAYS_INLINE Iterator &operator++() {
#ifdef BENCH_USE_MPI
    if (iterBarrier_) {
      // pause timer before barrier
      pause_timing();
    }
#endif
    --remaining_;
    return *this;
  }

  BENCH_ALWAYS_INLINE void pause_timing() { parent_->timer_->pause(); }
  BENCH_ALWAYS_INLINE void resume_timing() { parent_->timer_->resume(); }
};
// Iterator positioned at the first of the requested iterations.
inline State::Iterator State::begin() {
  State::Iterator first(this);
  return first;
}

// End sentinel. A range-based for loop evaluates begin() and then end() once
// before the first iteration, so the timer is started here.
inline State::Iterator State::end() {
  start_running();
  return State::Iterator();
}
class Benchmark {
public:
Benchmark(const char *name)
: name_(name), timer_(new NoOpTimer()), iterBarrier_(true) {}
virtual ~Benchmark() {}
virtual void run(State &state) = 0;
const char *name() { return name_; }
Timer *timer() { return timer_; }
bool iter_barrier() { return iterBarrier_; }
// only record time at rank 0
Benchmark *timing_root_rank() {
if (timer_) {
delete timer_;
}
if (world_rank() == 0) {
timer_ = new WallTimer();
} else {
timer_ = new NoOpTimer();
}
}
// record time in each rank, and do a max reduction across all ranks at the
Benchmark *timing_max_rank() { timer_ = new MaxRankTimer(); }
// record the wall time for all ranks to finish
Benchmark *timing_wall();
// record the aggregate time across all ranks
Benchmark *timing_aggregate();
// dont do mpi_barrier between iterations
Benchmark *no_iter_barrier() { iterBarrier_ = false; }
private:
const char *name_;
Timer *timer_;
bool iterBarrier_;
};
extern std::vector<Benchmark *> benchmarks;
typedef void (*Function)(State &);
// Benchmark whose body is a plain function pointer.
class FunctionBenchmark : public Benchmark {
public:
  // name: label used when reporting
  // fn:   function invoked with the run's State
  FunctionBenchmark(const char *name, Function fn)
      : Benchmark(name),
        fn_(fn) {}

  // Defined out of line; forwards `state` to the wrapped function.
  virtual void run(State &state);

private:
  Function fn_;
};
// Wrap `fn` in a new FunctionBenchmark called `name`, append it to
// `benchmarks`, and return it so that timing options can be configured.
Benchmark *register_bench(const char *name, Function fn);
#if 0
template <typename Fn> Benchmark *register_bench(const char *name, Fn &&fn) {}
template <class Fn, class... Args>
Benchmark *RegisterBenchmark(const char *name, Fn &&fn, Args &&... args) {
return register_bench(name, [=](State &st) { fn(st, args...); });
}
#endif
} // namespace bench

90
src/bench.cpp Normal file
View File

@@ -0,0 +1,90 @@
#include "bench/bench.hpp"
#ifdef BENCH_USE_MPI
#include <mpi.h>
#endif
#include <iostream>
#include <vector>
namespace bench {
// Initialize the library: forwards the command line to MPI_Init in MPI
// builds and does nothing otherwise.
void init(int &argc, char **&argv) {
#ifdef BENCH_USE_MPI
  MPI_Init(&argc, &argv);
#else
  // arguments are only consumed by MPI
  (void)argc;
  (void)argv;
#endif
}
// Shut the library down: calls MPI_Finalize in MPI builds and does nothing
// otherwise.
void finalize() {
#if defined(BENCH_USE_MPI)
  MPI_Finalize();
#endif
}
// Rank of the calling process in MPI_COMM_WORLD; always 0 in non-MPI builds.
int world_rank() {
  int rank = 0;
#ifdef BENCH_USE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
#endif
  return rank;
}
// Number of ranks in MPI_COMM_WORLD; always 1 in non-MPI builds.
int world_size() {
  int size = 1;
#ifdef BENCH_USE_MPI
  MPI_Comm_size(MPI_COMM_WORLD, &size);
#endif
  return size;
}
void run_benchmarks() {
for (Benchmark *benchmark : benchmarks) {
if (0 == world_rank()) {
std::cerr << "running " << benchmark->name() << "\n";
}
// estimate the time per iteration
// decide how many iterations to run
uint64_t iters = 10000;
State state(iters, benchmark->timer(), benchmark->iter_barrier());
#ifdef BENCH_USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
benchmark->run(state);
/*reporter
*/
if (world_rank() == 0) {
double iters = state.iterations();
double nsElapsed = benchmark->timer()->get_elapsed() / iters;
double sElapsed = nsElapsed / 1e9;
double bytes = state.bytes_processed();
std::cout << benchmark->name() << ": " << nsElapsed << "ns";
if (state.bytes_processed()) {
std::cout << " " << bytes / sElapsed << "B/s";
}
std::cout << "\n";
}
}
}
// Create a FunctionBenchmark for `fn`, add it to the global registry, and
// hand it back so the caller can configure timing options.
Benchmark *register_bench(const char *name, Function fn) {
  Benchmark *const created = new FunctionBenchmark(name, fn);
  benchmarks.push_back(created);
  return created;
}
// Invoke the wrapped function with the run's state.
void FunctionBenchmark::run(State &state) {
  (*fn_)(state);
}
/*extern*/ std::vector<Benchmark *> benchmarks;
} // namespace bench