From 7f7b0b89ea0b9b259dc6aa4d4a7a5de241eca334 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 19 Aug 2020 12:03:15 +0300 Subject: [PATCH] Fetched improvements to benchmarks from the mpi-paper-benchmarks branch --- samples/benchmark/main.cc | 65 ------------------------------ samples/genbenchmarkscripts/main.c | 54 ++++++++++++++++++++----- 2 files changed, 44 insertions(+), 75 deletions(-) diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc index 16a99df..733b064 100644 --- a/samples/benchmark/main.cc +++ b/samples/benchmark/main.cc @@ -149,30 +149,6 @@ main(int argc, char** argv) } }*/ - /* - // Basic - const size_t num_iters = 100; - - // Warmup - for (size_t i = 0; i < num_iters / 10; ++i) - acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON); - - // Benchmark - Timer t; - const AcReal dt = FLT_EPSILON; - - acGridSynchronizeStream(STREAM_ALL); - timer_reset(&t); - acGridSynchronizeStream(STREAM_ALL); - - for (size_t i = 0; i < num_iters; ++i) - acGridIntegrate(STREAM_DEFAULT, dt); - - acGridSynchronizeStream(STREAM_ALL); - if (!pid) - timer_diff_print(t); - acGridSynchronizeStream(STREAM_ALL); - */ // Percentiles const size_t num_iters = 1000; @@ -217,47 +193,6 @@ main(int argc, char** argv) fclose(fp); } - /* -const size_t num_iters = 1000; -const double nth_percentile = 0.90; - -std::vector results; // ms -results.reserve(num_iters); - -for (size_t i = 0; i < num_iters; ++i) { - acGridSynchronizeStream(STREAM_ALL); - timer_reset(&t); - acGridSynchronizeStream(STREAM_ALL); - acGridIntegrate(STREAM_DEFAULT, dt); - acGridSynchronizeStream(STREAM_ALL); - results.push_back(timer_diff_nsec(t) / 1e6); -} - -// Write benchmark to file -if (!pid) { - std::sort(results.begin(), results.end(), - [](const double& a, const double& b) { return a < b; }); - fprintf(stdout, - "Integration step time %g ms (%gth " - "percentile)--------------------------------------\n", - results[nth_percentile * num_iters], 100 * nth_percentile); - - char path[4096] = ""; - if (test == TEST_STRONG_SCALING) - strncpy(path, "strong_scaling.csv", sizeof(path)); - else if (test == TEST_WEAK_SCALING) - strncpy(path, "weak_scaling.csv", sizeof(path)); - else - ERROR("Invalid test type"); - - FILE* fp = fopen(path, "a"); - ERRCHK_ALWAYS(fp); - // Format - // nprocs, measured (ms) - fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]); - - fclose(fp); -}*/ acGridQuit(); MPI_Finalize(); diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c index ce782ed..7be0872 100644 --- a/samples/genbenchmarkscripts/main.c +++ b/samples/genbenchmarkscripts/main.c @@ -2,11 +2,12 @@ #include #include #include +#include int main(void) { - const int max_nprocs = 128; + const int max_nprocs = 64; for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) { char filename[4096]; sprintf(filename, "benchmark_%d.sh", nprocs); @@ -18,10 +19,11 @@ main(void) fprintf(fp, "#!/bin/bash\n"); fprintf(fp, "#BATCH --job-name=astaroth\n"); fprintf(fp, "#SBATCH --account=project_2000403\n"); - fprintf(fp, "#SBATCH --time=00:14:59\n"); + fprintf(fp, "#SBATCH --time=03:00:00\n"); fprintf(fp, "#SBATCH --mem=32000\n"); fprintf(fp, "#SBATCH --partition=gpu\n"); - fprintf(fp, "#SBATCH --cpus-per-task=10\n"); + fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs); + // fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // nprocs, nodes, gpus const int max_gpus_per_node = 4; @@ -30,30 +32,62 @@ main(void) fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); fprintf(fp, "#SBATCH -n %d\n", nprocs); fprintf(fp, "#SBATCH -N %d\n", nodes); - //fprintf(fp, "#SBATCH --exclusive\n"); - if (nprocs > 4) + // fprintf(fp, "#SBATCH --exclusive\n"); + if (nprocs >= 4) fprintf(fp, "#SBATCH --ntasks-per-socket=2\n"); // Modules // OpenMPI - fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi nccl\n"); + fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n"); + //fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa + //fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n"); + //if (nprocs >= 32) + // fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa + // HPCX //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); - fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); + //fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0 // Profile and run - //fprintf(fp, "mkdir -p profile_%d\n", nprocs); + // fprintf(fp, "mkdir -p profile_%d\n", nprocs); + /* const int nx = 256; // max size 1792; const int ny = nx; const int nz = nx; - /* + fprintf(fp, //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d " //"%d\n", "srun ./benchmark %d %d %d\n", nx, ny, nz); */ - fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz); + // fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz); + + const char* files[] = { + "benchmark_decomp_1D", "benchmark_decomp_2D", "benchmark_decomp_3D", + "benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm", + "benchmark_meshsize_256", "benchmark_meshsize_512", "benchmark_meshsize_1024", + "benchmark_meshsize_1792", "benchmark_stencilord_2", "benchmark_stencilord_4", + "benchmark_stencilord_6", "benchmark_stencilord_8", "benchmark_timings_control", + "benchmark_timings_comp", "benchmark_timings_comm", "benchmark_timings_default", + "benchmark_timings_corners", "benchmark_weak_128", "benchmark_weak_256", + "benchmark_weak_448", + }; + for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) { + int nn = 256; + if (strcmp(files[i], "benchmark_meshsize_512") == 0) + nn = 512; + else if (strcmp(files[i], "benchmark_meshsize_1024") == 0) + nn = 1024; + else if (strcmp(files[i], "benchmark_meshsize_1792") == 0) + nn = 1792; + else if (strcmp(files[i], "benchmark_weak_128") == 0) + nn = 128; + else if (strcmp(files[i], "benchmark_weak_448") == 0) + nn = 448; + + fprintf(fp, "$(cd %s && srun ./benchmark %d %d %d && cd ..)\n", files[i], nn, nn, nn); + } fclose(fp); }