From 9840b817d081339cc04272225a7505bad3b4b0c0 Mon Sep 17 00:00:00 2001
From: jpekkila
Date: Sun, 7 Jun 2020 21:59:33 +0300
Subject: [PATCH] Added the (hopefully final) basic test case used for the benchmarks

---
 samples/benchmark/main.cc          |  7 ++--
 samples/genbenchmarkscripts/main.c |  9 ++---
 src/core/device.cc                 | 57 +++++++++++++++++++++---------
 src/core/kernels/integration.cuh   |  2 +-
 4 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index dd14129..962a316 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -56,11 +56,12 @@ morton3D(const uint64_t pid)
 {
     uint64_t i, j, k;
     i = j = k = 0;
+
     for (int bit = 0; bit <= 21; ++bit) {
         const uint64_t mask = 0x1l << 3 * bit;
-        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+        k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
         j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
-        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
     }
 
     return (uint3_64){i, j, k};
@@ -174,7 +175,7 @@ main(int argc, char** argv)
     */
 
     // Percentiles
-    const size_t num_iters      = 100;
+    const size_t num_iters      = 1000;
     const double nth_percentile = 0.90;
     std::vector<double> results; // ms
     results.reserve(num_iters);
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index 6f160b3..a45bf1a 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -29,6 +29,7 @@ main(void)
         fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
         fprintf(fp, "#SBATCH -n %d\n", nprocs);
         fprintf(fp, "#SBATCH -N %d\n", nodes);
+        fprintf(fp, "#SBATCH --exclusive\n");
 
         // Modules
         fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
@@ -37,13 +38,13 @@
         // Profile and run
         fprintf(fp, "mkdir -p profile_%d\n", nprocs);
 
-        const int nx = 1792;
+        const int nx = 256; // max size 1792;
         const int ny = nx;
         const int nz = nx;
         fprintf(fp,
-                "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
-                "%d\n",
-                nprocs, nx, ny, nz);
+                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
+                //"%d\n",
+                "srun ./benchmark %d %d %d\n", nx, ny, nz);
 
         fclose(fp);
     }
diff --git a/src/core/device.cc b/src/core/device.cc
index 1e070cb..35af82d 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -527,6 +527,15 @@ morton3D(const uint64_t pid)
     i = j = k = 0;
 
     if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 3 * bit;
+            k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+            i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        }
+    }
+    /*
+    else if (DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 3 * bit;
             i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
@@ -534,18 +543,19 @@ morton3D(const uint64_t pid)
             j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
             k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
         }
     }
+    */
     // Just a quick copy/paste for other decomp dims
     else if (DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 2 * bit;
-            i |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
-            j |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
+            j |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
+            k |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
         }
     }
     else if (DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << 1 * bit;
-            i |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
+            k |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
         }
     }
     else {
@@ -562,24 +572,33 @@ morton1D(const uint3_64 pid)
     uint64_t i = 0;
 
     if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.z & mask) << 0) << 2 * bit;
+            i |= ((pid.y & mask) << 1) << 2 * bit;
+            i |= ((pid.x & mask) << 2) << 2 * bit;
+        }
+    }
+    /*
+    else if (DECOMPOSITION_AXES == 3) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
             i |= ((pid.x & mask) << 0) << 2 * bit;
             i |= ((pid.y & mask) << 1) << 2 * bit;
             i |= ((pid.z & mask) << 2) << 2 * bit;
         }
-    }
+    }*/
     else if (DECOMPOSITION_AXES == 2) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
-            i |= ((pid.x & mask) << 0) << 1 * bit;
-            i |= ((pid.y & mask) << 1) << 1 * bit;
+            i |= ((pid.y & mask) << 0) << 1 * bit;
+            i |= ((pid.z & mask) << 1) << 1 * bit;
         }
     }
     else if (DECOMPOSITION_AXES == 1) {
         for (int bit = 0; bit <= 21; ++bit) {
             const uint64_t mask = 0x1l << bit;
-            i |= ((pid.x & mask) << 0) << 0 * bit;
+            i |= ((pid.z & mask) << 0) << 0 * bit;
         }
     }
     else {
@@ -1204,6 +1223,8 @@ typedef struct {
     CommData sidexy_data;
     CommData sidexz_data;
     CommData sideyz_data;
+
+    // int comm_cart;
 } Grid;
 
 static Grid grid = {};
@@ -1444,16 +1465,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     acPackCommData(device, sideyz_b0s, &sideyz_data);
 #endif
 
-#if MPI_COMPUTE_ENABLED
-    //////////// INNER INTEGRATION //////////////
-    {
-        const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
-        const int3 m2 = nn;
-        acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
-    }
-////////////////////////////////////////////
-#endif // MPI_COMPUTE_ENABLED
-
 #if MPI_COMM_ENABLED
     MPI_Barrier(MPI_COMM_WORLD);
 
@@ -1474,7 +1485,19 @@ acGridIntegrate(const Stream stream, const AcReal dt)
     acTransferCommData(device, sidexy_b0s, &sidexy_data);
     acTransferCommData(device, sidexz_b0s, &sidexz_data);
     acTransferCommData(device, sideyz_b0s, &sideyz_data);
+#endif // MPI_COMM_ENABLED
+
+#if MPI_COMPUTE_ENABLED
+    //////////// INNER INTEGRATION //////////////
+    {
+        const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
+        const int3 m2 = nn;
+        acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
+    }
+////////////////////////////////////////////
+#endif // MPI_COMPUTE_ENABLED
+
+#if MPI_COMM_ENABLED
     // acTransferCommDataWait(corner_data); // Do not rm: required for corners
     acTransferCommDataWait(edgex_data);
     acTransferCommDataWait(edgey_data);
diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh
index 97326ad..4c01148 100644
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -134,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
     // RK3
     dim3 best_dims(0, 0, 0);
     float best_time          = INFINITY;
-    const int num_iterations = 5;
+    const int num_iterations = 10;
 
     for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
         for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
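
The main functional change in the diff is the Morton index mapping: the z index is now taken from the lowest interleaved bit and x from the highest, in both the decode (morton3D) and encode (morton1D) directions. Below is a small standalone sketch of that bit order. The uint3_64 struct and the two functions are re-declared locally for illustration only; they mirror the patched versions (with an unsigned mask constant), and the round-trip assertion is an added sanity check, not something taken from the patch.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    struct uint3_64 {
        uint64_t x, y, z;
    };

    // Decode: bit 3*b + 0 -> z, 3*b + 1 -> y, 3*b + 2 -> x (the patched order).
    static uint3_64
    morton3D(const uint64_t pid)
    {
        uint64_t i = 0, j = 0, k = 0;
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = UINT64_C(1) << 3 * bit;
            k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
            i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
        }
        return {i, j, k};
    }

    // Encode: the inverse interleave, z taken from the lowest bit.
    static uint64_t
    morton1D(const uint3_64 pid)
    {
        uint64_t i = 0;
        for (int bit = 0; bit <= 21; ++bit) {
            const uint64_t mask = UINT64_C(1) << bit;
            i |= ((pid.z & mask) << 0) << 2 * bit;
            i |= ((pid.y & mask) << 1) << 2 * bit;
            i |= ((pid.x & mask) << 2) << 2 * bit;
        }
        return i;
    }

    int
    main(void)
    {
        // Every process id must map to a unique (x, y, z) block and back.
        for (uint64_t pid = 0; pid < 4096; ++pid)
            assert(morton1D(morton3D(pid)) == pid);

        // With the new order, consecutive pids advance z fastest:
        // pid 0 -> (0,0,0), pid 1 -> (0,0,1), pid 2 -> (0,1,0), pid 4 -> (1,0,0).
        const uint3_64 p = morton3D(1);
        printf("pid 1 -> (x=%llu, y=%llu, z=%llu)\n", (unsigned long long)p.x,
               (unsigned long long)p.y, (unsigned long long)p.z);
        return 0;
    }

Placing z in the fastest-varying position means neighboring ranks along z get consecutive ids, which matters when ranks are packed onto nodes in id order.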
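
The acGridIntegrate change moves the INNER INTEGRATION block so that it is launched only after the non-blocking halo transfers have been posted: interior work then overlaps with communication, and only the boundary-dependent work waits on the transfers. The sketch below shows the same ordering on a toy 1-D halo exchange with plain MPI. It is not Astaroth API; NGHOST and NN are reused merely as illustrative constants, and the "+= 1.0" loops stand in for the actual stencil integration.

    #include <mpi.h>
    #include <cstdio>

    #define NGHOST 3
    #define NN 64 /* interior points per rank (toy value) */

    int
    main(int argc, char** argv)
    {
        MPI_Init(&argc, &argv);

        int rank, nprocs;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

        const int left  = (rank - 1 + nprocs) % nprocs;
        const int right = (rank + 1) % nprocs;

        // Layout: [0, NGHOST) left ghost | [NGHOST, NGHOST+NN) interior | right ghost.
        double field[NGHOST + NN + NGHOST];
        for (int i = 0; i < NGHOST + NN + NGHOST; ++i)
            field[i] = rank;

        // 1. Post the halo exchange (analogous to acPackCommData + acTransferCommData).
        MPI_Request reqs[4];
        MPI_Irecv(field, NGHOST, MPI_DOUBLE, left, 0, MPI_COMM_WORLD, &reqs[0]);
        MPI_Irecv(field + NGHOST + NN, NGHOST, MPI_DOUBLE, right, 1, MPI_COMM_WORLD, &reqs[1]);
        MPI_Isend(field + NGHOST, NGHOST, MPI_DOUBLE, left, 1, MPI_COMM_WORLD, &reqs[2]);
        MPI_Isend(field + NN, NGHOST, MPI_DOUBLE, right, 0, MPI_COMM_WORLD, &reqs[3]);

        // 2. Update the interior that cannot depend on the halos, i.e. [2*NGHOST, NN),
        //    analogous to the INNER INTEGRATION block over [2*NGHOST, nn).
        for (int i = 2 * NGHOST; i < NN; ++i)
            field[i] += 1.0;

        // 3. Wait for the halos, then update the halo-dependent boundary regions.
        MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);
        for (int i = NGHOST; i < 2 * NGHOST; ++i)
            field[i] += 1.0;
        for (int i = NN; i < NGHOST + NN; ++i)
            field[i] += 1.0;

        if (rank == 0)
            printf("interior updated while halos were in flight\n");

        MPI_Finalize();
        return 0;
    }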
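
On the benchmark side, num_iters is raised to 1000 so that the 0.90 percentile is taken over a larger sample of per-iteration timings. The patch only shows the constants num_iters and nth_percentile, not the reduction itself; a reduction along those lines might look like the following hypothetical helper, which is a sketch rather than code from the sample.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Sort the per-iteration timings (taken by value so the caller's vector is
    // untouched) and return the value below which the requested fraction of
    // samples falls. Assumes results is non-empty.
    static double
    percentile(std::vector<double> results, const double nth) // e.g. nth = 0.90
    {
        std::sort(results.begin(), results.end());
        size_t idx = static_cast<size_t>(nth * results.size());
        if (idx >= results.size())
            idx = results.size() - 1;
        return results[idx];
    }

Reporting a high percentile instead of the mean makes the measurement less sensitive to occasional slow iterations caused by system noise.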