diff --git a/samples/mpi_reduce_bench/main.cc b/samples/mpi_reduce_bench/main.cc index 7c279d3..4b4298b 100644 --- a/samples/mpi_reduce_bench/main.cc +++ b/samples/mpi_reduce_bench/main.cc @@ -80,16 +80,15 @@ main(int argc, char** argv) // GPU alloc & compute acGridInit(info); + const size_t num_iters = 100; + const double nth_percentile = 0.90; + std::vector results; // ms + Timer t; + // Scalar benchmarks for (auto& testCase : scalarReductionTests) { - // Percentiles - const size_t num_iters = 100; - const double nth_percentile = 0.90; - std::vector results; // ms + results.clear(); results.reserve(num_iters); - // Benchmark - Timer t; - for (size_t i = 0; i < num_iters; ++i) { acGridSynchronizeStream(STREAM_ALL); timer_reset(&t); @@ -119,6 +118,41 @@ main(int argc, char** argv) fclose(fp); } } + + // Vector benchmarks + for (auto& testCase : vectorReductionTests) { + results.clear(); + results.reserve(num_iters); + + for (size_t i = 0; i < num_iters; ++i) { + acGridSynchronizeStream(STREAM_ALL); + timer_reset(&t); + acGridSynchronizeStream(STREAM_ALL); + acGridReduceVec(STREAM_DEFAULT, testCase.rtype, testCase.a, testCase.b, testCase.c, &testCase.candidate); + acGridSynchronizeStream(STREAM_ALL); + results.push_back(timer_diff_nsec(t) / 1e6); + acGridSynchronizeStream(STREAM_ALL); + } + + if (!pid) { + std::sort(results.begin(), results.end(), + [](const double& a, const double& b) { return a < b; }); + fprintf(stdout, + "Reduction time %g ms (%gth " + "percentile)--------------------------------------\n", + results[nth_percentile * num_iters], 100 * nth_percentile); + + char path[4096] = "mpi_reduction_benchmark.csv"; + + FILE* fp = fopen(path, "a"); + ERRCHK_ALWAYS(fp); + + // Format + // benchmark label, test label, nprocs, measured (ms) + fprintf(fp, "\"%s\",\"%s\", %d, %g\n", benchmark_label, testCase.label, nprocs, results[nth_percentile * num_iters]); + fclose(fp); + } + } acGridQuit(); MPI_Finalize(); return EXIT_SUCCESS;