Setup for benchmarks

jpekkila authored on 2020-10-28 12:55:32 +02:00
parent eb9090cc90, commit c1f2a6c340
7 changed files with 279 additions and 30 deletions

View File

@@ -11,7 +11,7 @@ project(astaroth C CXX CUDA)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
## Project-wide compilation flags
set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface
set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
set(CMAKE_C_STANDARD 11)
@@ -38,10 +38,10 @@ endif()
message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
## Options
option(DOUBLE_PRECISION "Generates double precision code." OFF)
option(DOUBLE_PRECISION "Generates double precision code." ON)
option(BUILD_SAMPLES "Builds projects in samples subdirectory." ON)
option(MPI_ENABLED "Enables additional functions for MPI communication." OFF)
option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON)
option(MPI_ENABLED "Enables additional functions for MPI communication." ON)
option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF)
option(VERBOSE "Enables various status and warning messages" OFF)
## Options (DEPRECATED)
@@ -110,7 +110,7 @@ if (BUILD_SAMPLES)
    add_subdirectory(samples/cpptest)
    add_subdirectory(samples/mpitest)
    add_subdirectory(samples/benchmark)
    #add_subdirectory(samples/genbenchmarkscripts)
    add_subdirectory(samples/genbenchmarkscripts)
    #add_subdirectory(samples/mpi_reduce_bench)
    add_subdirectory(samples/fortrantest)
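Net effect of the CMakeLists.txt changes: -Wall/-Wextra/-Werror are dropped from the common flags, double precision and MPI become the defaults, single-node peer-to-peer is turned off, and the benchmark-script generator joins the build. A configure sketch under those defaults (the out-of-source build directory is my assumption; the -D flags only restate what the diff now hardcodes):

```bash
# Assumed out-of-source build; the -D flags below merely restate the new defaults.
mkdir -p build && cd build
cmake -DDOUBLE_PRECISION=ON -DMPI_ENABLED=ON -DMULTIGPU_ENABLED=OFF ..
make -j
```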

View File

@@ -154,7 +154,7 @@ main(int argc, char** argv)
    }*/
    // Percentiles
    const size_t num_iters = 1000;
    const size_t num_iters = 100;
    const double nth_percentile = 0.90;
    std::vector<double> results; // ms
    results.reserve(num_iters);
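The benchmark now runs 100 iterations instead of 1000 and reports the 90th-percentile time from results, i.e. the 90th entry of the sorted samples when num_iters = 100. A rough shell equivalent of that order-statistic pick, assuming a hypothetical times.txt with one per-iteration time (ms) per line:

```bash
# Take the ceil(0.90 * N)-th smallest sample, mirroring nth_percentile = 0.90.
n=$(wc -l < times.txt)
sort -n times.txt | head -n $(( (90 * n + 99) / 100 )) | tail -n 1
```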

View File

@@ -0,0 +1,8 @@
add_executable(genbenchmarkscripts main.c)

add_custom_command(
    TARGET genbenchmarkscripts POST_BUILD
    COMMAND genbenchmarkscripts
    WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
    COMMENT "Generating benchmark scripts"
)
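Because the custom command is attached to the target as a POST_BUILD step running in ${PROJECT_BINARY_DIR}, building the target also regenerates the batch scripts at the build root. Expected flow (script names follow the generator below; the sbatch call is a hypothetical usage example):

```bash
make genbenchmarkscripts   # build the generator; the POST_BUILD step then runs it
ls benchmark_*.sh          # benchmark_1.sh, benchmark_2.sh, ..., benchmark_64.sh
sbatch benchmark_8.sh      # submit, e.g., the 8-process variant
```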

View File

@@ -0,0 +1,120 @@
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
    const int max_nprocs = 64;
    for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
        char filename[4096];
        sprintf(filename, "benchmark_%d.sh", nprocs);

        FILE* fp = fopen(filename, "w");
        assert(fp);

        // Boilerplate
        fprintf(fp, "#!/bin/bash\n");
        fprintf(fp, "#SBATCH --job-name=astaroth\n"); // OK
        fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK
        fprintf(fp, "#SBATCH --time=04:00:00\n"); // OK
        fprintf(fp, "#SBATCH --mem=0\n"); // OK
        fprintf(fp, "#SBATCH --partition=gpu\n"); // OK
        fprintf(fp, "#SBATCH --exclusive\n"); // OK
        fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // OK
        fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs);

        // HACK: exclude misconfigured nodes on Puhti
        fprintf(fp, "#SBATCH -x "
                    "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n");
        // fprintf(fp, "#SBATCH --cpus-per-task=10\n");

        // nprocs, nodes, gpus
        const int max_gpus_per_node = 4;
        const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
        const int nodes = (int)ceil((double)nprocs / max_gpus_per_node);
        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK
        fprintf(fp, "#SBATCH -n %d\n", nprocs); // OK
        fprintf(fp, "#SBATCH -N %d\n", nodes); // OK
        // fprintf(fp, "#SBATCH --exclusive\n");
        // if (nprocs >= 4)
        // fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");

        // Modules
        // OpenMPI
        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n");
        // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); //
        // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
        // if (nprocs >= 32)
        // fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); //
        // https://www.open-mpi.org/fa

        // HPCX
        // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
        // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0

        // Profile and run
        // fprintf(fp, "mkdir -p profile_%d\n", nprocs);
        /*
        const int nx = 256; // max size 2048;
        const int ny = nx;
        const int nz = nx;
        fprintf(fp,
                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
                //"%d\n",
                "srun ./benchmark %d %d %d\n", nx, ny, nz);
        */
        // fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);

        const char* files[] = {
            "benchmark_decomp_1D", "benchmark_decomp_2D", "benchmark_decomp_3D",
            "benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm",
            "benchmark_meshsize_256", "benchmark_meshsize_512", "benchmark_meshsize_1024",
            "benchmark_meshsize_2048", "benchmark_stencilord_2", "benchmark_stencilord_4",
            "benchmark_stencilord_6", "benchmark_stencilord_8", "benchmark_timings_control",
            "benchmark_timings_comp", "benchmark_timings_comm", "benchmark_timings_default",
            "benchmark_timings_corners", "benchmark_weak_128", "benchmark_weak_256",
            "benchmark_weak_512",
        };
        for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) {
            int nn = 256;
            if (strcmp(files[i], "benchmark_meshsize_512") == 0)
                nn = 512;
            else if (strcmp(files[i], "benchmark_meshsize_1024") == 0)
                nn = 1024;
            else if (strcmp(files[i], "benchmark_meshsize_2048") == 0)
                nn = 2048;
            else if (strcmp(files[i], "benchmark_weak_128") == 0)
                nn = 128;
            else if (strcmp(files[i], "benchmark_weak_512") == 0)
                nn = 512;

            // W/ Fredrik's tunings
            // (may cause Assertion `status == UCS_OK' failed errors)
            // fprintf(fp,
            //         "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
            //         "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n",
            //         files[i], nn, nn, nn);
            if (nodes >= 2) {
                fprintf(fp,
                        "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
                        "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm "
                        "-f core.* && cd ..)\n",
                        files[i], nn, nn, nn);
            }
            else {
                fprintf(fp,
                        "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* "
                        "&& cd ..)\n",
                        files[i], nn, nn, nn);
            }
        }
        fclose(fp);
    }
    return EXIT_SUCCESS;
}
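For concreteness, this is the script the generator above emits for nprocs = 8 (gpus_per_node = 4, nodes = 2, so the UCX-tuned multi-node branch is taken), reconstructed from the fprintf calls and truncated to the first of the 22 benchmark cases:

```bash
#!/bin/bash
#SBATCH --job-name=astaroth
#SBATCH --account=project_2000403
#SBATCH --time=04:00:00
#SBATCH --mem=0
#SBATCH --partition=gpu
#SBATCH --exclusive
#SBATCH --cpus-per-task=10
#SBATCH --output=benchmark-8-%j.out
#SBATCH -x r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04
#SBATCH --gres=gpu:v100:4
#SBATCH -n 8
#SBATCH -N 2
module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl
$(cd benchmark_decomp_1D && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark 256 256 256 && rm -f core.* && cd ..)
# ... one such line per entry in files[], 22 in total
```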

View File

@@ -17,42 +17,48 @@ main(void)
        // Boilerplate
        fprintf(fp, "#!/bin/bash\n");
        fprintf(fp, "#SBATCH --job-name=astaroth\n");
        fprintf(fp, "#SBATCH --account=project_2000403\n");
        fprintf(fp, "#SBATCH --time=03:00:00\n");
        fprintf(fp, "#SBATCH --mem=32000\n");
        fprintf(fp, "#SBATCH --partition=gpu\n");
        fprintf(fp, "#SBATCH --job-name=astaroth\n"); // OK
        fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK
        fprintf(fp, "#SBATCH --time=04:00:00\n"); // OK
        fprintf(fp, "#SBATCH --mem=0\n"); // OK
        fprintf(fp, "#SBATCH --partition=gpu\n"); // OK
        fprintf(fp, "#SBATCH --exclusive\n"); // OK
        fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // OK
        fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs);

        // HACK: exclude misconfigured nodes on Puhti
        fprintf(fp, "#SBATCH -x "
                    "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n");
        // fprintf(fp, "#SBATCH --cpus-per-task=10\n");

        // nprocs, nodes, gpus
        const int max_gpus_per_node = 4;
        const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
        const int nodes = (int)ceil((double)nprocs / max_gpus_per_node);
        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
        fprintf(fp, "#SBATCH -n %d\n", nprocs);
        fprintf(fp, "#SBATCH -N %d\n", nodes);
        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK
        fprintf(fp, "#SBATCH -n %d\n", nprocs); // OK
        fprintf(fp, "#SBATCH -N %d\n", nodes); // OK
        // fprintf(fp, "#SBATCH --exclusive\n");
        if (nprocs >= 4)
            fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
        // if (nprocs >= 4)
        // fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");

        // Modules
        // OpenMPI
        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n");
        //fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
        //fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
        //if (nprocs >= 32)
        //    fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
        // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); //
        // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
        // if (nprocs >= 32)
        // fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); //
        // https://www.open-mpi.org/fa

        // HPCX
        //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
        //fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0
        // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
        // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0

        // Profile and run
        // fprintf(fp, "mkdir -p profile_%d\n", nprocs);
        /*
        const int nx = 256; // max size 1792;
        const int nx = 256; // max size 2048;
        const int ny = nx;
        const int nz = nx;

@@ -67,11 +73,11 @@ main(void)
            "benchmark_decomp_1D", "benchmark_decomp_2D", "benchmark_decomp_3D",
            "benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm",
            "benchmark_meshsize_256", "benchmark_meshsize_512", "benchmark_meshsize_1024",
            "benchmark_meshsize_1792", "benchmark_stencilord_2", "benchmark_stencilord_4",
            "benchmark_meshsize_2048", "benchmark_stencilord_2", "benchmark_stencilord_4",
            "benchmark_stencilord_6", "benchmark_stencilord_8", "benchmark_timings_control",
            "benchmark_timings_comp", "benchmark_timings_comm", "benchmark_timings_default",
            "benchmark_timings_corners", "benchmark_weak_128", "benchmark_weak_256",
            "benchmark_weak_448",
            "benchmark_weak_512",
        };
        for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) {
            int nn = 256;

@@ -79,14 +85,32 @@ main(void)
                nn = 512;
            else if (strcmp(files[i], "benchmark_meshsize_1024") == 0)
                nn = 1024;
            else if (strcmp(files[i], "benchmark_meshsize_1792") == 0)
                nn = 1792;
            else if (strcmp(files[i], "benchmark_meshsize_2048") == 0)
                nn = 2048;
            else if (strcmp(files[i], "benchmark_weak_128") == 0)
                nn = 128;
            else if (strcmp(files[i], "benchmark_weak_448") == 0)
                nn = 448;
            else if (strcmp(files[i], "benchmark_weak_512") == 0)
                nn = 512;
            fprintf(fp, "$(cd %s && srun ./benchmark %d %d %d && cd ..)\n", files[i], nn, nn, nn);

            // W/ Fredrik's tunings
            // (may cause Assertion `status == UCS_OK' failed errors)
            // fprintf(fp,
            //         "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
            //         "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n",
            //         files[i], nn, nn, nn);
            if (nodes >= 2) {
                fprintf(fp,
                        "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
                        "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm "
                        "-f core.* && cd ..)\n",
                        files[i], nn, nn, nn);
            }
            else {
                fprintf(fp,
                        "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* "
                        "&& cd ..)\n",
                        files[i], nn, nn, nn);
            }
        }
        fclose(fp);

scripts/buildtestcases.sh (new executable file, 56 lines)
View File

@@ -0,0 +1,56 @@
#!/bin/bash
# Modules (!!!)
module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl
#module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl
#export UCX_MEMTYPE_CACHE=n # Workaround for bug in hpcx-mpi/2.5.0
load_default_case() {
    # Pinned or RDMA
    sed -i 's/#define MPI_USE_PINNED ([0-9]*)/#define MPI_USE_PINNED (0)/' src/core/device.cc

    # Stencil order
    sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' acc/stdlib/stdderiv.h
    sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' include/astaroth.h

    # Timings
    sed -i 's/MPI_COMPUTE_ENABLED (.)/MPI_COMPUTE_ENABLED (1)/' src/core/device.cc
    sed -i 's/MPI_COMM_ENABLED (.)/MPI_COMM_ENABLED (1)/' src/core/device.cc
    sed -i 's/MPI_INCL_CORNERS (.)/MPI_INCL_CORNERS (0)/' src/core/device.cc

    # Decomposition
    sed -i 's/MPI_DECOMPOSITION_AXES (.)/MPI_DECOMPOSITION_AXES (3)/' src/core/device.cc

    # Strong/Weak
    sed -i 's/const TestType test = .*;/const TestType test = TEST_STRONG_SCALING;/' samples/benchmark/main.cc

    # Num iters
    sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 1000;/' samples/benchmark/main.cc
}

# $1 test name
create_case() {
    DIR="benchmark_$1"
    mkdir -p $DIR
    cd $DIR
    /users/pekkila/cmake/build/bin/cmake .. && make -j
    cd ..
}
# Mesh size
load_default_case
create_case "meshsize_256"
sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 100;/' samples/benchmark/main.cc
create_case "meshsize_512"
create_case "meshsize_1024"
create_case "meshsize_2048"
# Run batch jobs
sbatch benchmark_meshsize_256/benchmark_1.sh
sbatch benchmark_meshsize_256/benchmark_2.sh
sbatch benchmark_meshsize_256/benchmark_4.sh
sbatch benchmark_meshsize_256/benchmark_8.sh
sbatch benchmark_meshsize_256/benchmark_16.sh
sbatch benchmark_meshsize_256/benchmark_32.sh
sbatch benchmark_meshsize_256/benchmark_64.sh
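The remaining cases follow the same recipe: reset to the defaults, sed-patch the single knob under test, rebuild into a case directory, and submit the generated scripts. A hypothetical extension for a second-order stencil case (not part of the committed script; the sed patterns mirror load_default_case and the case name matches the generator's files[] list):

```bash
# Hypothetical: benchmark a 2nd-order stencil. Not in the committed script.
load_default_case
sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (2)/' acc/stdlib/stdderiv.h
sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (2)/' include/astaroth.h
create_case "stencilord_2"
sbatch benchmark_stencilord_2/benchmark_8.sh
```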

View File

@@ -0,0 +1,41 @@
#!/bin/bash
OUTPUT=results.csv
rm -i $OUTPUT
# $1 input dir
process_input() {
    echo $1
    #cat $1/*.csv | sort -n
    cat $1/*.csv | sort -k1n -k3n | awk '!a[$1]++'
    echo ""
} >> $OUTPUT
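Within process_input, each case's rows are sorted numerically by fields 1 and 3, and `awk '!a[$1]++'` then keeps only the first row per distinct first field, i.e. the smallest field-3 value per key; note that despite the .csv suffix, sort and awk here split on their default whitespace separators. A toy run with assumed columns (nprocs, mesh size, time in ms):

```bash
# Assumed whitespace-separated rows: nprocs mesh time_ms
printf '8 256 1.9\n8 256 1.2\n16 256 2.4\n' | sort -k1n -k3n | awk '!a[$1]++'
# Output, the fastest row per nprocs value:
#   8 256 1.2
#   16 256 2.4
```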
process_input "benchmark_decomp_1D"
process_input "benchmark_decomp_2D"
process_input "benchmark_decomp_3D"
process_input "benchmark_decomp_1D_comm"
process_input "benchmark_decomp_2D_comm"
process_input "benchmark_decomp_3D_comm"
process_input "benchmark_meshsize_256"
process_input "benchmark_meshsize_512"
process_input "benchmark_meshsize_1024"
process_input "benchmark_meshsize_2048"
process_input "benchmark_stencilord_2"
process_input "benchmark_stencilord_4"
process_input "benchmark_stencilord_6"
process_input "benchmark_stencilord_8"
process_input "benchmark_timings_control"
process_input "benchmark_timings_comp"
process_input "benchmark_timings_comm"
process_input "benchmark_timings_default"
process_input "benchmark_timings_corners"
process_input "benchmark_weak_128"
process_input "benchmark_weak_256"
process_input "benchmark_weak_512"
cat $OUTPUT