Added some final changes to benchmarking

This commit is contained in:
jpekkila
2019-12-15 21:47:41 +02:00
parent 8bd81db63c
commit ecff5c3041
2 changed files with 39 additions and 10 deletions

View File

@@ -1008,8 +1008,8 @@ acDeviceRunMPITest(void)
acLoadConfig(AC_DEFAULT_CONFIG, &info); acLoadConfig(AC_DEFAULT_CONFIG, &info);
// Large mesh dim // Large mesh dim
const int nn = 128; const int nn = 512;
const int num_iters = 10; const int num_iters = 100;
info.int_params[AC_nx] = info.int_params[AC_ny] = nn; info.int_params[AC_nx] = info.int_params[AC_ny] = nn;
info.int_params[AC_nz] = BENCH_STRONG_SCALING ? nn : nn * num_processes; info.int_params[AC_nz] = BENCH_STRONG_SCALING ? nn : nn * num_processes;
info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx]; info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx];
@@ -1064,6 +1064,14 @@ acDeviceRunMPITest(void)
acDeviceCreate(pid % devices_per_node, submesh_info, &device); acDeviceCreate(pid % devices_per_node, submesh_info, &device);
acDeviceLoadMesh(device, STREAM_DEFAULT, submesh); acDeviceLoadMesh(device, STREAM_DEFAULT, submesh);
// Enable peer access
MPI_Barrier(MPI_COMM_WORLD);
const int front = (device->id + 1) % devices_per_node;
const int back = (device->id + devices_per_node - 1) % devices_per_node;
cudaSetDevice(device->id);
WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
// Verification start /////////////////////////////////////////////////////////////////////// // Verification start ///////////////////////////////////////////////////////////////////////
#if BENCH_STRONG_SCALING #if BENCH_STRONG_SCALING
{ {
@@ -1094,6 +1102,13 @@ acDeviceRunMPITest(void)
#endif #endif
// Verification end /////////////////////////////////////////////////////////////////////// // Verification end ///////////////////////////////////////////////////////////////////////
// Warmup
for (int i = 0; i < 10; ++i)
acDeviceIntegrateStepMPI(device, 0);
acDeviceSynchronizeStream(device, STREAM_ALL);
MPI_Barrier(MPI_COMM_WORLD);
// Benchmark start /////////////////////////////////////////////////////////////////////// // Benchmark start ///////////////////////////////////////////////////////////////////////
std::vector<double> results; std::vector<double> results;
results.reserve(num_iters); results.reserve(num_iters);
@@ -1114,7 +1129,7 @@ acDeviceRunMPITest(void)
} }
const double ms_elapsed = timer_diff_nsec(total_time) / 1e6; const double ms_elapsed = timer_diff_nsec(total_time) / 1e6;
const double nth_percentile = 0.95; const double nth_percentile = 0.90;
std::sort(results.begin(), results.end(), std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; }); [](const double& a, const double& b) { return a < b; });

View File

@@ -49,14 +49,15 @@ smaller_than(const double& a, const double& b)
int int
run_benchmark(const char* config_path) run_benchmark(const char* config_path)
{ {
const int nn = 256; const int nn = 512;
const int num_iters = 100; const int num_iters = 100;
#define BENCH_STRONG_SCALING (1)
const int num_processes = acGetNumDevicesPerNode();
AcMeshInfo mesh_info; AcMeshInfo mesh_info;
load_config(config_path, &mesh_info); load_config(config_path, &mesh_info);
mesh_info.int_params[AC_nx] = nn; mesh_info.int_params[AC_nx] = mesh_info.int_params[AC_ny] = nn;
mesh_info.int_params[AC_ny] = mesh_info.int_params[AC_nx]; mesh_info.int_params[AC_nz] = BENCH_STRONG_SCALING ? nn : nn * num_processes;
mesh_info.int_params[AC_nz] = mesh_info.int_params[AC_nx];
update_config(&mesh_info); update_config(&mesh_info);
AcMesh* mesh = acmesh_create(mesh_info); AcMesh* mesh = acmesh_create(mesh_info);
@@ -74,6 +75,7 @@ run_benchmark(const char* config_path)
} }
acSynchronize(); acSynchronize();
const AcReal dt = FLT_EPSILON; const AcReal dt = FLT_EPSILON;
printf("Using dt = %g\n", dt);
Timer total_time; Timer total_time;
timer_reset(&total_time); timer_reset(&total_time);
@@ -89,13 +91,25 @@ run_benchmark(const char* config_path)
} }
acSynchronize(); acSynchronize();
const double ms_elapsed = timer_diff_nsec(total_time) / 1e6; const double ms_elapsed = timer_diff_nsec(total_time) / 1e6;
const double nth_percentile = 0.95; const double nth_percentile = 0.90;
std::sort(results.begin(), results.end(), smaller_than); std::sort(results.begin(), results.end(), smaller_than);
printf("vertices: %d^3, iterations: %d\n", nn, num_iters); printf("vertices: %d^3, iterations: %d\n", nn, num_iters);
printf("Total time: %f ms\n", ms_elapsed); printf("Total time: %f ms\n", ms_elapsed);
printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile), printf("Time per step: %f ms\n", ms_elapsed / num_iters);
results[int(nth_percentile * num_iters)]);
const size_t nth_index = int(nth_percentile * num_iters);
printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile), results[nth_index]);
// Write out
char buf[256];
sprintf(buf, "nprocs_%d_result_%s.bench", num_processes,
BENCH_STRONG_SCALING ? "strong" : "weak");
FILE* fp = fopen(buf, "w");
ERRCHK_ALWAYS(fp);
fprintf(fp, "num_processes, percentile (%dth)\n", int(100 * nth_percentile));
fprintf(fp, "%d, %g\n", num_processes, results[nth_index]);
fclose(fp);
acQuit(); acQuit();
acmesh_destroy(mesh); acmesh_destroy(mesh);