From fb41741d74e33488dd0e94d9249c7c0654c219eb Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 7 Apr 2020 17:58:47 +0300 Subject: [PATCH] Improvements to samples --- samples/benchmark/main.cc | 86 ++++++++++++++++++++++++++++------- samples/bwtest/CMakeLists.txt | 2 +- samples/bwtest/main.c | 5 +- src/core/device.cc | 5 +- src/core/kernels/kernels.h | 4 +- 5 files changed, 79 insertions(+), 23 deletions(-) diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc index 4b50bfb..5ab4349 100644 --- a/samples/benchmark/main.cc +++ b/samples/benchmark/main.cc @@ -89,33 +89,45 @@ main(void) } }*/ + /* + // Basic + const size_t num_iters = 100; + // Warmup - for (size_t i = 0; i < 10; ++i) + for (size_t i = 0; i < num_iters / 10; ++i) acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON); // Benchmark Timer t; - const AcReal dt = FLT_EPSILON; + const AcReal dt = FLT_EPSILON; - acGridSynchronizeStream(STREAM_ALL); - timer_reset(&t); - acGridSynchronizeStream(STREAM_ALL); + acGridSynchronizeStream(STREAM_ALL); + timer_reset(&t); + acGridSynchronizeStream(STREAM_ALL); - const size_t num_iters = 50; - for (size_t i = 0; i < num_iters; ++i) - acGridIntegrate(STREAM_DEFAULT, dt); + for (size_t i = 0; i < num_iters; ++i) + acGridIntegrate(STREAM_DEFAULT, dt); - acGridSynchronizeStream(STREAM_ALL); - if (!pid) - timer_diff_print(t); - acGridSynchronizeStream(STREAM_ALL); - /* + acGridSynchronizeStream(STREAM_ALL); + if (!pid) + timer_diff_print(t); + acGridSynchronizeStream(STREAM_ALL); + */ + + // Percentiles const size_t num_iters = 100; const double nth_percentile = 0.90; - std::vector results; // ms results.reserve(num_iters); + // Warmup + for (size_t i = 0; i < num_iters / 10; ++i) + acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON); + + // Benchmark + Timer t; + const AcReal dt = FLT_EPSILON; + for (size_t i = 0; i < num_iters; ++i) { acGridSynchronizeStream(STREAM_ALL); timer_reset(&t); @@ -123,9 +135,9 @@ main(void) acGridIntegrate(STREAM_DEFAULT, dt); acGridSynchronizeStream(STREAM_ALL); results.push_back(timer_diff_nsec(t) / 1e6); + acGridSynchronizeStream(STREAM_ALL); } - // Write benchmark to file if (!pid) { std::sort(results.begin(), results.end(), [](const double& a, const double& b) { return a < b; }); @@ -149,7 +161,49 @@ main(void) fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]); fclose(fp); - }*/ + } + + /* +const size_t num_iters = 100; +const double nth_percentile = 0.90; + +std::vector results; // ms +results.reserve(num_iters); + +for (size_t i = 0; i < num_iters; ++i) { + acGridSynchronizeStream(STREAM_ALL); + timer_reset(&t); + acGridSynchronizeStream(STREAM_ALL); + acGridIntegrate(STREAM_DEFAULT, dt); + acGridSynchronizeStream(STREAM_ALL); + results.push_back(timer_diff_nsec(t) / 1e6); +} + +// Write benchmark to file +if (!pid) { + std::sort(results.begin(), results.end(), + [](const double& a, const double& b) { return a < b; }); + fprintf(stdout, + "Integration step time %g ms (%gth " + "percentile)--------------------------------------\n", + results[nth_percentile * num_iters], 100 * nth_percentile); + + char path[4096] = ""; + if (test == TEST_STRONG_SCALING) + strncpy(path, "strong_scaling.csv", sizeof(path)); + else if (test == TEST_WEAK_SCALING) + strncpy(path, "weak_scaling.csv", sizeof(path)); + else + ERROR("Invalid test type"); + + FILE* fp = fopen(path, "a"); + ERRCHK_ALWAYS(fp); + // Format + // nprocs, measured (ms) + fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]); + + fclose(fp); +}*/ acGridQuit(); MPI_Finalize(); diff --git a/samples/bwtest/CMakeLists.txt b/samples/bwtest/CMakeLists.txt index 13bf13f..cd4329f 100644 --- a/samples/bwtest/CMakeLists.txt +++ b/samples/bwtest/CMakeLists.txt @@ -5,5 +5,5 @@ find_package(OpenMP) find_package(CUDAToolkit) add_executable(bwtest main.c) -add_compile_options(-O3) target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static) +target_compile_options(bwtest PRIVATE -O3) diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c index 00c0caf..7f1f9f6 100644 --- a/samples/bwtest/main.c +++ b/samples/bwtest/main.c @@ -303,6 +303,7 @@ main(void) PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024)); +#if 0 { uint8_t* src = allocHost(BLOCK_SIZE); uint8_t* dst = allocHost(BLOCK_SIZE); @@ -362,7 +363,7 @@ main(void) freeDevice(dst); } PRINT("\n------------------------\n"); - /* +#else { // Final run for easy identification with the profiler uint8_t* src = allocDevice(BLOCK_SIZE); uint8_t* dst = allocDevice(BLOCK_SIZE); @@ -373,7 +374,7 @@ main(void) freeDevice(src); freeDevice(dst); } - */ +#endif MPI_Finalize(); return EXIT_SUCCESS; diff --git a/src/core/device.cc b/src/core/device.cc index 1b3a284..8ce057f 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -650,7 +650,7 @@ acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddat static void acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata) { - if (!ddata->pinned) + if (!ddata->pinned) // Unpin iff the data was pinned previously return; cudaSetDevice(device->id); @@ -1362,7 +1362,8 @@ acTransferCommData(const Device device, // const int npid = getPid(pid3d + neighbor, decomp); cudaStreamSynchronize(data->streams[a_idx]); - MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD, + MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD, + &data->send_reqs[b_idx]); } } } diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h index fbf24bc..805cbed 100644 --- a/src/core/kernels/kernels.h +++ b/src/core/kernels/kernels.h @@ -15,8 +15,8 @@ typedef struct { #if (AC_MPI_ENABLED && AC_MPI_RT_PINNING) AcReal* data_pinned; - bool pinned; // Set if data was received to pinned memory -#endif // (AC_MPI_ENABLED && AC_MPI_RT_PINNING) + bool pinned = false; // Set if data was received to pinned memory +#endif // (AC_MPI_ENABLED && AC_MPI_RT_PINNING) #if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM) MPI_Win win; // MPI window for RMA