Improvements to samples
This commit is contained in:
@@ -89,33 +89,45 @@ main(void)
|
|||||||
}
|
}
|
||||||
}*/
|
}*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
// Basic
|
||||||
|
const size_t num_iters = 100;
|
||||||
|
|
||||||
// Warmup
|
// Warmup
|
||||||
for (size_t i = 0; i < 10; ++i)
|
for (size_t i = 0; i < num_iters / 10; ++i)
|
||||||
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
|
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
|
||||||
|
|
||||||
// Benchmark
|
// Benchmark
|
||||||
Timer t;
|
Timer t;
|
||||||
const AcReal dt = FLT_EPSILON;
|
const AcReal dt = FLT_EPSILON;
|
||||||
|
|
||||||
acGridSynchronizeStream(STREAM_ALL);
|
acGridSynchronizeStream(STREAM_ALL);
|
||||||
timer_reset(&t);
|
timer_reset(&t);
|
||||||
acGridSynchronizeStream(STREAM_ALL);
|
acGridSynchronizeStream(STREAM_ALL);
|
||||||
|
|
||||||
const size_t num_iters = 50;
|
for (size_t i = 0; i < num_iters; ++i)
|
||||||
for (size_t i = 0; i < num_iters; ++i)
|
acGridIntegrate(STREAM_DEFAULT, dt);
|
||||||
acGridIntegrate(STREAM_DEFAULT, dt);
|
|
||||||
|
|
||||||
acGridSynchronizeStream(STREAM_ALL);
|
acGridSynchronizeStream(STREAM_ALL);
|
||||||
if (!pid)
|
if (!pid)
|
||||||
timer_diff_print(t);
|
timer_diff_print(t);
|
||||||
acGridSynchronizeStream(STREAM_ALL);
|
acGridSynchronizeStream(STREAM_ALL);
|
||||||
/*
|
*/
|
||||||
|
|
||||||
|
// Percentiles
|
||||||
const size_t num_iters = 100;
|
const size_t num_iters = 100;
|
||||||
const double nth_percentile = 0.90;
|
const double nth_percentile = 0.90;
|
||||||
|
|
||||||
std::vector<double> results; // ms
|
std::vector<double> results; // ms
|
||||||
results.reserve(num_iters);
|
results.reserve(num_iters);
|
||||||
|
|
||||||
|
// Warmup
|
||||||
|
for (size_t i = 0; i < num_iters / 10; ++i)
|
||||||
|
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
|
||||||
|
|
||||||
|
// Benchmark
|
||||||
|
Timer t;
|
||||||
|
const AcReal dt = FLT_EPSILON;
|
||||||
|
|
||||||
for (size_t i = 0; i < num_iters; ++i) {
|
for (size_t i = 0; i < num_iters; ++i) {
|
||||||
acGridSynchronizeStream(STREAM_ALL);
|
acGridSynchronizeStream(STREAM_ALL);
|
||||||
timer_reset(&t);
|
timer_reset(&t);
|
||||||
@@ -123,9 +135,9 @@ main(void)
|
|||||||
acGridIntegrate(STREAM_DEFAULT, dt);
|
acGridIntegrate(STREAM_DEFAULT, dt);
|
||||||
acGridSynchronizeStream(STREAM_ALL);
|
acGridSynchronizeStream(STREAM_ALL);
|
||||||
results.push_back(timer_diff_nsec(t) / 1e6);
|
results.push_back(timer_diff_nsec(t) / 1e6);
|
||||||
|
acGridSynchronizeStream(STREAM_ALL);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write benchmark to file
|
|
||||||
if (!pid) {
|
if (!pid) {
|
||||||
std::sort(results.begin(), results.end(),
|
std::sort(results.begin(), results.end(),
|
||||||
[](const double& a, const double& b) { return a < b; });
|
[](const double& a, const double& b) { return a < b; });
|
||||||
@@ -149,7 +161,49 @@ main(void)
|
|||||||
fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
|
fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
|
||||||
|
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
}*/
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
const size_t num_iters = 100;
|
||||||
|
const double nth_percentile = 0.90;
|
||||||
|
|
||||||
|
std::vector<double> results; // ms
|
||||||
|
results.reserve(num_iters);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < num_iters; ++i) {
|
||||||
|
acGridSynchronizeStream(STREAM_ALL);
|
||||||
|
timer_reset(&t);
|
||||||
|
acGridSynchronizeStream(STREAM_ALL);
|
||||||
|
acGridIntegrate(STREAM_DEFAULT, dt);
|
||||||
|
acGridSynchronizeStream(STREAM_ALL);
|
||||||
|
results.push_back(timer_diff_nsec(t) / 1e6);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write benchmark to file
|
||||||
|
if (!pid) {
|
||||||
|
std::sort(results.begin(), results.end(),
|
||||||
|
[](const double& a, const double& b) { return a < b; });
|
||||||
|
fprintf(stdout,
|
||||||
|
"Integration step time %g ms (%gth "
|
||||||
|
"percentile)--------------------------------------\n",
|
||||||
|
results[nth_percentile * num_iters], 100 * nth_percentile);
|
||||||
|
|
||||||
|
char path[4096] = "";
|
||||||
|
if (test == TEST_STRONG_SCALING)
|
||||||
|
strncpy(path, "strong_scaling.csv", sizeof(path));
|
||||||
|
else if (test == TEST_WEAK_SCALING)
|
||||||
|
strncpy(path, "weak_scaling.csv", sizeof(path));
|
||||||
|
else
|
||||||
|
ERROR("Invalid test type");
|
||||||
|
|
||||||
|
FILE* fp = fopen(path, "a");
|
||||||
|
ERRCHK_ALWAYS(fp);
|
||||||
|
// Format
|
||||||
|
// nprocs, measured (ms)
|
||||||
|
fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
|
||||||
|
|
||||||
|
fclose(fp);
|
||||||
|
}*/
|
||||||
|
|
||||||
acGridQuit();
|
acGridQuit();
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
|
@@ -5,5 +5,5 @@ find_package(OpenMP)
|
|||||||
find_package(CUDAToolkit)
|
find_package(CUDAToolkit)
|
||||||
|
|
||||||
add_executable(bwtest main.c)
|
add_executable(bwtest main.c)
|
||||||
add_compile_options(-O3)
|
|
||||||
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
|
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
|
||||||
|
target_compile_options(bwtest PRIVATE -O3)
|
||||||
|
@@ -303,6 +303,7 @@ main(void)
|
|||||||
|
|
||||||
PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));
|
PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));
|
||||||
|
|
||||||
|
#if 0
|
||||||
{
|
{
|
||||||
uint8_t* src = allocHost(BLOCK_SIZE);
|
uint8_t* src = allocHost(BLOCK_SIZE);
|
||||||
uint8_t* dst = allocHost(BLOCK_SIZE);
|
uint8_t* dst = allocHost(BLOCK_SIZE);
|
||||||
@@ -362,7 +363,7 @@ main(void)
|
|||||||
freeDevice(dst);
|
freeDevice(dst);
|
||||||
}
|
}
|
||||||
PRINT("\n------------------------\n");
|
PRINT("\n------------------------\n");
|
||||||
/*
|
#else
|
||||||
{ // Final run for easy identification with the profiler
|
{ // Final run for easy identification with the profiler
|
||||||
uint8_t* src = allocDevice(BLOCK_SIZE);
|
uint8_t* src = allocDevice(BLOCK_SIZE);
|
||||||
uint8_t* dst = allocDevice(BLOCK_SIZE);
|
uint8_t* dst = allocDevice(BLOCK_SIZE);
|
||||||
@@ -373,7 +374,7 @@ main(void)
|
|||||||
freeDevice(src);
|
freeDevice(src);
|
||||||
freeDevice(dst);
|
freeDevice(dst);
|
||||||
}
|
}
|
||||||
*/
|
#endif
|
||||||
|
|
||||||
MPI_Finalize();
|
MPI_Finalize();
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
|
@@ -650,7 +650,7 @@ acPinPackedData(const Device device, const cudaStream_t stream, PackedData* ddat
|
|||||||
static void
|
static void
|
||||||
acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
|
acUnpinPackedData(const Device device, const cudaStream_t stream, PackedData* ddata)
|
||||||
{
|
{
|
||||||
if (!ddata->pinned)
|
if (!ddata->pinned) // Unpin iff the data was pinned previously
|
||||||
return;
|
return;
|
||||||
|
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
@@ -1362,7 +1362,8 @@ acTransferCommData(const Device device, //
|
|||||||
const int npid = getPid(pid3d + neighbor, decomp);
|
const int npid = getPid(pid3d + neighbor, decomp);
|
||||||
|
|
||||||
cudaStreamSynchronize(data->streams[a_idx]);
|
cudaStreamSynchronize(data->streams[a_idx]);
|
||||||
MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
|
MPI_Isend(src->data, count, datatype, npid, b_idx, MPI_COMM_WORLD,
|
||||||
|
&data->send_reqs[b_idx]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -15,8 +15,8 @@ typedef struct {
|
|||||||
|
|
||||||
#if (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
|
#if (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
|
||||||
AcReal* data_pinned;
|
AcReal* data_pinned;
|
||||||
bool pinned; // Set if data was received to pinned memory
|
bool pinned = false; // Set if data was received to pinned memory
|
||||||
#endif // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
|
#endif // (AC_MPI_ENABLED && AC_MPI_RT_PINNING)
|
||||||
|
|
||||||
#if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
|
#if (AC_MPI_ENABLED && AC_MPI_UNIDIRECTIONAL_COMM)
|
||||||
MPI_Win win; // MPI window for RMA
|
MPI_Win win; // MPI window for RMA
|
||||||
|
Reference in New Issue
Block a user