diff --git a/src/core/device.cc b/src/core/device.cc index 63708fd..39e6c36 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -1008,8 +1008,8 @@ acDeviceRunMPITest(void) acLoadConfig(AC_DEFAULT_CONFIG, &info); // Large mesh dim - const int nn = 128; - const int num_iters = 10; + const int nn = 512; + const int num_iters = 100; info.int_params[AC_nx] = info.int_params[AC_ny] = nn; info.int_params[AC_nz] = BENCH_STRONG_SCALING ? nn : nn * num_processes; info.real_params[AC_inv_dsx] = AcReal(1.0) / info.real_params[AC_dsx]; @@ -1064,6 +1064,14 @@ acDeviceRunMPITest(void) acDeviceCreate(pid % devices_per_node, submesh_info, &device); acDeviceLoadMesh(device, STREAM_DEFAULT, submesh); + // Enable peer access + MPI_Barrier(MPI_COMM_WORLD); + const int front = (device->id + 1) % devices_per_node; + const int back = (device->id + devices_per_node - 1) % devices_per_node; + cudaSetDevice(device->id); + WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); + WARNCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); + // Verification start /////////////////////////////////////////////////////////////////////// #if BENCH_STRONG_SCALING { @@ -1094,6 +1102,13 @@ acDeviceRunMPITest(void) #endif // Verification end /////////////////////////////////////////////////////////////////////// + // Warmup + for (int i = 0; i < 10; ++i) + acDeviceIntegrateStepMPI(device, 0); + + acDeviceSynchronizeStream(device, STREAM_ALL); + MPI_Barrier(MPI_COMM_WORLD); + // Benchmark start /////////////////////////////////////////////////////////////////////// std::vector<double> results; results.reserve(num_iters); @@ -1114,7 +1129,7 @@ acDeviceRunMPITest(void) } const double ms_elapsed = timer_diff_nsec(total_time) / 1e6; - const double nth_percentile = 0.95; + const double nth_percentile = 0.90; std::sort(results.begin(), results.end(), [](const double& a, const double& b) { return a < b; }); diff --git a/src/standalone/benchmark.cc b/src/standalone/benchmark.cc index f8dcdf2..976e745 100644 
--- a/src/standalone/benchmark.cc +++ b/src/standalone/benchmark.cc @@ -49,14 +49,15 @@ smaller_than(const double& a, const double& b) int run_benchmark(const char* config_path) { - const int nn = 256; + const int nn = 512; const int num_iters = 100; +#define BENCH_STRONG_SCALING (1) + const int num_processes = acGetNumDevicesPerNode(); AcMeshInfo mesh_info; load_config(config_path, &mesh_info); - mesh_info.int_params[AC_nx] = nn; - mesh_info.int_params[AC_ny] = mesh_info.int_params[AC_nx]; - mesh_info.int_params[AC_nz] = mesh_info.int_params[AC_nx]; + mesh_info.int_params[AC_nx] = mesh_info.int_params[AC_ny] = nn; + mesh_info.int_params[AC_nz] = BENCH_STRONG_SCALING ? nn : nn * num_processes; update_config(&mesh_info); AcMesh* mesh = acmesh_create(mesh_info); @@ -74,6 +75,7 @@ run_benchmark(const char* config_path) } acSynchronize(); const AcReal dt = FLT_EPSILON; + printf("Using dt = %g\n", dt); Timer total_time; timer_reset(&total_time); @@ -89,13 +91,25 @@ run_benchmark(const char* config_path) } acSynchronize(); const double ms_elapsed = timer_diff_nsec(total_time) / 1e6; - const double nth_percentile = 0.95; + const double nth_percentile = 0.90; std::sort(results.begin(), results.end(), smaller_than); printf("vertices: %d^3, iterations: %d\n", nn, num_iters); printf("Total time: %f ms\n", ms_elapsed); - printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile), - results[int(nth_percentile * num_iters)]); + printf("Time per step: %f ms\n", ms_elapsed / num_iters); + + const size_t nth_index = int(nth_percentile * num_iters); + printf("%dth percentile per step: %f ms\n", int(100 * nth_percentile), results[nth_index]); + + // Write out + char buf[256]; + sprintf(buf, "nprocs_%d_result_%s.bench", num_processes, + BENCH_STRONG_SCALING ? 
"strong" : "weak"); + FILE* fp = fopen(buf, "w"); + ERRCHK_ALWAYS(fp); + fprintf(fp, "num_processes, percentile (%dth)\n", int(100 * nth_percentile)); + fprintf(fp, "%d, %g\n", num_processes, results[nth_index]); + fclose(fp); acQuit(); acmesh_destroy(mesh);