From c1f2a6c3401912698c638f62f4bb677f373cb74a Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 28 Oct 2020 12:55:32 +0200 Subject: [PATCH] Setup for benchmarks --- CMakeLists.txt | 10 +- samples/benchmark/main.cc | 2 +- .../genbenchmarkscripts/CMakeLists.txt | 8 ++ .../genbenchmarkscripts/main.c | 120 ++++++++++++++++++ samples/genbenchmarkscripts/main.c | 72 +++++++---- scripts/buildtestcases.sh | 56 ++++++++ scripts/postprocess_benchmarks.sh | 41 ++++++ 7 files changed, 279 insertions(+), 30 deletions(-) create mode 100644 samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt create mode 100644 samples/genbenchmarkscripts/genbenchmarkscripts/main.c create mode 100755 scripts/buildtestcases.sh create mode 100755 scripts/postprocess_benchmarks.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 66b8001..8bfc25e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ project(astaroth C CXX CUDA) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) ## Project-wide compilation flags -set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface +set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") set(CMAKE_C_STANDARD 11) @@ -38,10 +38,10 @@ endif() message(STATUS "Build type: " ${CMAKE_BUILD_TYPE}) ## Options -option(DOUBLE_PRECISION "Generates double precision code." OFF) +option(DOUBLE_PRECISION "Generates double precision code." ON) option(BUILD_SAMPLES "Builds projects in samples subdirectory." ON) -option(MPI_ENABLED "Enables additional functions for MPI communciation." OFF) -option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. 
Affects Legacy & Node layers only." ON) +option(MPI_ENABLED "Enables additional functions for MPI communication." ON) +option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF) option(VERBOSE "Enables various status and warning messages" OFF) ## Options (DEPRECATED) @@ -110,7 +110,7 @@ if (BUILD_SAMPLES) add_subdirectory(samples/cpptest) add_subdirectory(samples/mpitest) add_subdirectory(samples/benchmark) - #add_subdirectory(samples/genbenchmarkscripts) + add_subdirectory(samples/genbenchmarkscripts) #add_subdirectory(samples/mpi_reduce_bench) add_subdirectory(samples/fortrantest) diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc index f205b04..3c5e504 100644 --- a/samples/benchmark/main.cc +++ b/samples/benchmark/main.cc @@ -154,7 +154,7 @@ main(int argc, char** argv) }*/ // Percentiles - const size_t num_iters = 1000; + const size_t num_iters = 100; const double nth_percentile = 0.90; std::vector results; // ms results.reserve(num_iters); diff --git a/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt b/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt new file mode 100644 index 0000000..6115fde --- /dev/null +++ b/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt @@ -0,0 +1,8 @@ +add_executable(genbenchmarkscripts main.c) + +add_custom_command( + TARGET genbenchmarkscripts POST_BUILD + COMMAND genbenchmarkscripts + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMENT "Generating benchmark scripts" +) diff --git a/samples/genbenchmarkscripts/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/genbenchmarkscripts/main.c new file mode 100644 index 0000000..d7b953b --- /dev/null +++ b/samples/genbenchmarkscripts/genbenchmarkscripts/main.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include + +int +main(void) +{ + const int max_nprocs = 64; + for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) { + 
char filename[4096]; + sprintf(filename, "benchmark_%d.sh", nprocs); + + FILE* fp = fopen(filename, "w"); + assert(fp); + + // Boilerplate + fprintf(fp, "#!/bin/bash\n"); + fprintf(fp, "#BATCH --job-name=astaroth\n"); // OK + fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK + fprintf(fp, "#SBATCH --time=04:00:00\n"); // OK + fprintf(fp, "#SBATCH --mem=0\n"); // OK + fprintf(fp, "#SBATCH --partition=gpu\n"); // OK + fprintf(fp, "#SBATCH --exclusive\n"); // OK + fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // OK + fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs); + // HACK: exclude misconfigured nodes on Puhti + fprintf(fp, "#SBATCH -x " + "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n"); + // fprintf(fp, "#SBATCH --cpus-per-task=10\n"); + + // nprocs, nodes, gpus + const int max_gpus_per_node = 4; + const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node; + const int nodes = (int)ceil((double)nprocs / max_gpus_per_node); + fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK + fprintf(fp, "#SBATCH -n %d\n", nprocs); // OK + fprintf(fp, "#SBATCH -N %d\n", nodes); // OK + // fprintf(fp, "#SBATCH --exclusive\n"); + // if (nprocs >= 4) + // fprintf(fp, "#SBATCH --ntasks-per-socket=2\n"); + + // Modules + // OpenMPI + fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n"); + // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // + // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n"); + // if (nprocs >= 32) + // fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // + // https://www.open-mpi.org/fa + + // HPCX + // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); + // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0 + + // Profile and run + // fprintf(fp, "mkdir -p profile_%d\n", nprocs); + + /* + const int nx = 
256; // max size 2048; + const int ny = nx; + const int nz = nx; + + fprintf(fp, + //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d " + //"%d\n", + "srun ./benchmark %d %d %d\n", nx, ny, nz); + */ + // fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz); + + const char* files[] = { + "benchmark_decomp_1D", "benchmark_decomp_2D", "benchmark_decomp_3D", + "benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm", + "benchmark_meshsize_256", "benchmark_meshsize_512", "benchmark_meshsize_1024", + "benchmark_meshsize_2048", "benchmark_stencilord_2", "benchmark_stencilord_4", + "benchmark_stencilord_6", "benchmark_stencilord_8", "benchmark_timings_control", + "benchmark_timings_comp", "benchmark_timings_comm", "benchmark_timings_default", + "benchmark_timings_corners", "benchmark_weak_128", "benchmark_weak_256", + "benchmark_weak_512", + }; + for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) { + int nn = 256; + if (strcmp(files[i], "benchmark_meshsize_512") == 0) + nn = 512; + else if (strcmp(files[i], "benchmark_meshsize_1024") == 0) + nn = 1024; + else if (strcmp(files[i], "benchmark_meshsize_2048") == 0) + nn = 2048; + else if (strcmp(files[i], "benchmark_weak_128") == 0) + nn = 128; + else if (strcmp(files[i], "benchmark_weak_512") == 0) + nn = 512; + + // W/ Fredriks tunings + // (may cause Assertion `status == UCS_OK' failed errors) + // fprintf(fp, + // "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " + // "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n", + // files[i], nn, nn, nn); + if (nodes >= 2) { + fprintf(fp, + "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " + "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm " + "-f core.* && cd ..)\n", + files[i], nn, nn, nn); + } + else { + fprintf(fp, + "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* " + "&& cd ..)\n", + files[i], nn, nn, nn); + } + } + + 
fclose(fp); + } + + return EXIT_SUCCESS; +} diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c index 7be0872..d7b953b 100644 --- a/samples/genbenchmarkscripts/main.c +++ b/samples/genbenchmarkscripts/main.c @@ -17,42 +17,48 @@ main(void) // Boilerplate fprintf(fp, "#!/bin/bash\n"); - fprintf(fp, "#BATCH --job-name=astaroth\n"); - fprintf(fp, "#SBATCH --account=project_2000403\n"); - fprintf(fp, "#SBATCH --time=03:00:00\n"); - fprintf(fp, "#SBATCH --mem=32000\n"); - fprintf(fp, "#SBATCH --partition=gpu\n"); + fprintf(fp, "#BATCH --job-name=astaroth\n"); // OK + fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK + fprintf(fp, "#SBATCH --time=04:00:00\n"); // OK + fprintf(fp, "#SBATCH --mem=0\n"); // OK + fprintf(fp, "#SBATCH --partition=gpu\n"); // OK + fprintf(fp, "#SBATCH --exclusive\n"); // OK + fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // OK fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs); + // HACK: exclude misconfigured nodes on Puhti + fprintf(fp, "#SBATCH -x " + "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n"); // fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // nprocs, nodes, gpus const int max_gpus_per_node = 4; const int gpus_per_node = nprocs < max_gpus_per_node ? 
nprocs : max_gpus_per_node; const int nodes = (int)ceil((double)nprocs / max_gpus_per_node); - fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); - fprintf(fp, "#SBATCH -n %d\n", nprocs); - fprintf(fp, "#SBATCH -N %d\n", nodes); + fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK + fprintf(fp, "#SBATCH -n %d\n", nprocs); // OK + fprintf(fp, "#SBATCH -N %d\n", nodes); // OK // fprintf(fp, "#SBATCH --exclusive\n"); - if (nprocs >= 4) - fprintf(fp, "#SBATCH --ntasks-per-socket=2\n"); + // if (nprocs >= 4) + // fprintf(fp, "#SBATCH --ntasks-per-socket=2\n"); // Modules // OpenMPI fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n"); - //fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa - //fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n"); - //if (nprocs >= 32) - // fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa + // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // + // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n"); + // if (nprocs >= 32) + // fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // + // https://www.open-mpi.org/fa // HPCX - //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); - //fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0 + // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); + // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0 // Profile and run // fprintf(fp, "mkdir -p profile_%d\n", nprocs); /* - const int nx = 256; // max size 1792; + const int nx = 256; // max size 2048; const int ny = nx; const int nz = nx; @@ -67,11 +73,11 @@ main(void) "benchmark_decomp_1D", "benchmark_decomp_2D", "benchmark_decomp_3D", "benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", 
"benchmark_decomp_3D_comm", "benchmark_meshsize_256", "benchmark_meshsize_512", "benchmark_meshsize_1024", - "benchmark_meshsize_1792", "benchmark_stencilord_2", "benchmark_stencilord_4", + "benchmark_meshsize_2048", "benchmark_stencilord_2", "benchmark_stencilord_4", "benchmark_stencilord_6", "benchmark_stencilord_8", "benchmark_timings_control", "benchmark_timings_comp", "benchmark_timings_comm", "benchmark_timings_default", "benchmark_timings_corners", "benchmark_weak_128", "benchmark_weak_256", - "benchmark_weak_448", + "benchmark_weak_512", }; for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) { int nn = 256; @@ -79,14 +85,32 @@ main(void) nn = 512; else if (strcmp(files[i], "benchmark_meshsize_1024") == 0) nn = 1024; - else if (strcmp(files[i], "benchmark_meshsize_1792") == 0) - nn = 1792; + else if (strcmp(files[i], "benchmark_meshsize_2048") == 0) + nn = 2048; else if (strcmp(files[i], "benchmark_weak_128") == 0) nn = 128; - else if (strcmp(files[i], "benchmark_weak_448") == 0) - nn = 448; + else if (strcmp(files[i], "benchmark_weak_512") == 0) + nn = 512; - fprintf(fp, "$(cd %s && srun ./benchmark %d %d %d && cd ..)\n", files[i], nn, nn, nn); + // W/ Fredriks tunings + // (may cause Assertion `status == UCS_OK' failed errors) + // fprintf(fp, + // "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " + // "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n", + // files[i], nn, nn, nn); + if (nodes >= 2) { + fprintf(fp, + "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " + "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm " + "-f core.* && cd ..)\n", + files[i], nn, nn, nn); + } + else { + fprintf(fp, + "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* " + "&& cd ..)\n", + files[i], nn, nn, nn); + } } fclose(fp); diff --git a/scripts/buildtestcases.sh b/scripts/buildtestcases.sh new file mode 100755 index 0000000..7157656 --- /dev/null +++ b/scripts/buildtestcases.sh @@ 
-0,0 +1,56 @@ +#!/bin/bash + +# Modules (!!!) +module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl +#module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl +#export UCX_MEMTYPE_CACHE=n # Workaround for bug in hpcx-mpi/2.5.0 + +load_default_case() { + # Pinned or RDMA + sed -i 's/#define MPI_USE_PINNED ([0-9]*)/#define MPI_USE_PINNED (0)/' src/core/device.cc + + # Stencil order + sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' acc/stdlib/stdderiv.h + sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' include/astaroth.h + + # Timings + sed -i 's/MPI_COMPUTE_ENABLED (.)/MPI_COMPUTE_ENABLED (1)/' src/core/device.cc + sed -i 's/MPI_COMM_ENABLED (.)/MPI_COMM_ENABLED (1)/' src/core/device.cc + sed -i 's/MPI_INCL_CORNERS (.)/MPI_INCL_CORNERS (0)/' src/core/device.cc + + # Decomposition + sed -i 's/MPI_DECOMPOSITION_AXES (.)/MPI_DECOMPOSITION_AXES (3)/' src/core/device.cc + + # Strong/Weak + sed -i 's/const TestType test = .*;/const TestType test = TEST_STRONG_SCALING;/' samples/benchmark/main.cc + + # Num iters + sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 1000;/' samples/benchmark/main.cc +} + +# $1 test name +# $2 grid size +create_case() { + DIR="benchmark_$1" + mkdir -p $DIR + cd $DIR + /users/pekkila/cmake/build/bin/cmake .. && make -j + cd .. 
+} + +# Mesh size +load_default_case +create_case "meshsize_256" +sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 100;/' samples/benchmark/main.cc +create_case "meshsize_512" +create_case "meshsize_1024" +create_case "meshsize_2048" + +# Run batch jobs +sbatch benchmark_meshsize_256/benchmark_1.sh +sbatch benchmark_meshsize_256/benchmark_2.sh +sbatch benchmark_meshsize_256/benchmark_4.sh +sbatch benchmark_meshsize_256/benchmark_8.sh +sbatch benchmark_meshsize_256/benchmark_16.sh +sbatch benchmark_meshsize_256/benchmark_32.sh +sbatch benchmark_meshsize_256/benchmark_64.sh diff --git a/scripts/postprocess_benchmarks.sh b/scripts/postprocess_benchmarks.sh new file mode 100755 index 0000000..7a60884 --- /dev/null +++ b/scripts/postprocess_benchmarks.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +OUTPUT=results.csv +rm -i $OUTPUT + +# $1 input dir +process_input() { + echo $1 + #cat $1/*.csv | sort -n + cat $1/*.csv | sort -k1n -k3n | awk '!a[$1]++' + echo "" +} >> $OUTPUT + +process_input "benchmark_decomp_1D" +process_input "benchmark_decomp_2D" +process_input "benchmark_decomp_3D" +process_input "benchmark_decomp_1D_comm" +process_input "benchmark_decomp_2D_comm" +process_input "benchmark_decomp_3D_comm" + +process_input "benchmark_meshsize_256" +process_input "benchmark_meshsize_512" +process_input "benchmark_meshsize_1024" +process_input "benchmark_meshsize_2048" + +process_input "benchmark_stencilord_2" +process_input "benchmark_stencilord_4" +process_input "benchmark_stencilord_6" +process_input "benchmark_stencilord_8" + +process_input "benchmark_timings_control" +process_input "benchmark_timings_comp" +process_input "benchmark_timings_comm" +process_input "benchmark_timings_default" +process_input "benchmark_timings_corners" + +process_input "benchmark_weak_128" +process_input "benchmark_weak_256" +process_input "benchmark_weak_512" + +cat $OUTPUT