From 0d80834619fa973052686fbf887dcca1fa664f23 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 2 Jun 2020 14:08:34 +0300 Subject: [PATCH 01/10] Disabled forcing and upwinding for performance tests. Set default grid size to 512^3. Set default cmake params s.t. benchmarks can be reproduced out-of-the-box. --- CMakeLists.txt | 10 +++++----- acc/mhd_solver/stencil_kernel.ac | 4 ++-- config/astaroth.conf | 6 +++--- src/utils/modelsolver.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5a514b..45c3a2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,11 +30,11 @@ endif() message(STATUS "Build type: " ${CMAKE_BUILD_TYPE}) ## Options -option(DOUBLE_PRECISION "Generates double precision code." OFF) -option(BUILD_SAMPLES "Builds projects in samples subdirectory." OFF) -option(BUILD_STANDALONE "Builds standalone Astaroth." ON) -option(MPI_ENABLED "Enables additional functions for MPI communciation." OFF) -option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON) +option(DOUBLE_PRECISION "Generates double precision code." ON) +option(BUILD_SAMPLES "Builds projects in samples subdirectory." ON) +option(BUILD_STANDALONE "Builds standalone Astaroth." OFF) +option(MPI_ENABLED "Enables additional functions for MPI communciation." ON) +option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF) ## Options (DEPRECATED) # option(BUILD_DEBUG "Builds the program with extensive error checking" OFF) diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac index 905eb65..0f37ea5 100644 --- a/acc/mhd_solver/stencil_kernel.ac +++ b/acc/mhd_solver/stencil_kernel.ac @@ -5,8 +5,8 @@ #define LMAGNETIC (1) #define LENTROPY (1) #define LTEMPERATURE (0) -#define LFORCING (1) -#define LUPWD (1) +#define LFORCING (0) +#define LUPWD (0) #define LSINK (0) #define AC_THERMAL_CONDUCTIVITY (0.001) // TODO: make an actual config parameter diff --git a/config/astaroth.conf b/config/astaroth.conf index abc1613..ccefc45 100644 --- a/config/astaroth.conf +++ b/config/astaroth.conf @@ -5,9 +5,9 @@ * "Compile-time" params * ============================================================================= */ -AC_nx = 128 -AC_ny = 128 -AC_nz = 128 +AC_nx = 512 +AC_ny = 512 +AC_nz = 512 AC_dsx = 0.04908738521 AC_dsy = 0.04908738521 diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c index 92eb71c..e482bd9 100644 --- a/src/utils/modelsolver.c +++ b/src/utils/modelsolver.c @@ -39,7 +39,7 @@ #define LENTROPY (1) #define LTEMPERATURE (0) #define LFORCING (0) -#define LUPWD (1) +#define LUPWD (0) #define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter typedef AcReal Scalar; From 17a4f314519dad18c25caf920c709d6da5ecfc2e Mon Sep 17 00:00:00 2001 From: jpekkila Date: Thu, 4 Jun 2020 20:47:03 +0300 Subject: [PATCH 02/10] Added the latest setup used for benchmarks --- CMakeLists.txt | 6 +- config/astaroth.conf | 14 ++--- samples/benchmark/main.cc | 65 ++++++++++++++++++++- samples/genbenchmarkscripts/main.c | 10 +++- src/core/device.cc | 93 ++++++++++++++++++++++++++++-- src/core/kernels/integration.cuh | 21 ++++--- 6 files changed, 182 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45c3a2b..04100bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ ## CMake settings # V3.9 required for first-class CUDA support # V3.17 required for the FindCUDAToolkit package -cmake_minimum_required(VERSION 3.17) +cmake_minimum_required(VERSION 3.17) find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH) find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH) @@ -10,7 +10,7 @@ project(astaroth C CXX CUDA) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) ## Project-wide compilation flags -set(COMMON_FLAGS "-mavx -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow") +set(COMMON_FLAGS "-mavx -Wall -Wextra -Wdouble-promotion -Wfloat-conversion -Wshadow") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") set(CMAKE_C_STANDARD 11) @@ -19,7 +19,7 @@ set(CMAKE_CXX_STANDARD 11) find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_... cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}") -set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow") +set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Wdouble-promotion,-Wfloat-conversion,-Wshadow") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}") diff --git a/config/astaroth.conf b/config/astaroth.conf index ccefc45..83e93d9 100644 --- a/config/astaroth.conf +++ b/config/astaroth.conf @@ -5,9 +5,9 @@ * "Compile-time" params * ============================================================================= */ -AC_nx = 512 -AC_ny = 512 -AC_nz = 512 +AC_nx = 256 +AC_ny = 256 +AC_nz = 256 AC_dsx = 0.04908738521 AC_dsy = 0.04908738521 @@ -24,11 +24,11 @@ AC_bin_steps = 1000 AC_bin_save_t = 1e666 // Set to 0 if you want to run the simulation from the beginning, or just a new -// simulation. If continuing from a saved step, specify the step number here. -AC_start_step = 0 +// simulation. If continuing from a saved step, specify the step number here. +AC_start_step = 0 // Maximum time in code units. If negative, there is no time limit -AC_max_time = -1.0 +AC_max_time = -1.0 // Hydro AC_cdt = 0.4 @@ -49,7 +49,7 @@ AC_forcing_magnitude = 1e-5 AC_kmin = 0.8 AC_kmax = 1.2 // Switches forcing off and accretion on -AC_switch_accretion = 0 +AC_switch_accretion = 0 // Entropy AC_cp_sound = 1.0 diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc index 5ab4349..dd14129 100644 --- a/samples/benchmark/main.cc +++ b/samples/benchmark/main.cc @@ -39,8 +39,46 @@ typedef enum { NUM_TESTS, } TestType; +#include + +typedef struct { + uint64_t x, y, z; +} uint3_64; + +static uint3_64 +operator+(const uint3_64& a, const uint3_64& b) +{ + return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z}; +} + +static uint3_64 +morton3D(const uint64_t pid) +{ + uint64_t i, j, k; + i = j = k = 0; + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << 3 * bit; + i |= ((pid & (mask << 0)) >> 2 * bit) >> 0; + j |= ((pid & (mask << 1)) >> 2 * bit) >> 1; + k |= ((pid & (mask << 2)) >> 2 * bit) >> 2; + } + + return (uint3_64){i, j, k}; +} + +static uint3_64 +decompose(const uint64_t target) +{ + // This is just so beautifully elegant. Complex and efficient decomposition + // in just one line of code. + uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1}; + + ERRCHK_ALWAYS(p.x * p.y * p.z == target); + return p; +} + int -main(void) +main(int argc, char** argv) { MPI_Init(NULL, NULL); int nprocs, pid; @@ -51,9 +89,30 @@ main(void) AcMeshInfo info; acLoadConfig(AC_DEFAULT_CONFIG, &info); + if (argc > 1) { + if (argc == 4) { + const int nx = atoi(argv[1]); + const int ny = atoi(argv[2]); + const int nz = atoi(argv[3]); + info.int_params[AC_nx] = nx; + info.int_params[AC_ny] = ny; + info.int_params[AC_nz] = nz; + acUpdateBuiltinParams(&info); + printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz); + } + else { + fprintf(stderr, "Could not parse arguments. Usage: ./benchmark .\n"); + exit(EXIT_FAILURE); + } + } + const TestType test = TEST_STRONG_SCALING; - if (test == TEST_WEAK_SCALING) - info.int_params[AC_nz] *= nprocs; + if (test == TEST_WEAK_SCALING) { + uint3_64 decomp = decompose(nprocs); + info.int_params[AC_nx] *= decomp.x; + info.int_params[AC_ny] *= decomp.y; + info.int_params[AC_nz] *= decomp.z; + } /* AcMesh model, candidate; diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c index 8d35ae9..6f160b3 100644 --- a/samples/genbenchmarkscripts/main.c +++ b/samples/genbenchmarkscripts/main.c @@ -36,8 +36,14 @@ main(void) // Profile and run fprintf(fp, "mkdir -p profile_%d\n", nprocs); - fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n", - nprocs); + + const int nx = 1792; + const int ny = nx; + const int nz = nx; + fprintf(fp, + "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d " + "%d\n", + nprocs, nx, ny, nz); fclose(fp); } diff --git a/src/core/device.cc b/src/core/device.cc index f473fc2..1e070cb 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -518,6 +518,78 @@ mod(const int a, const int b) return r < 0 ? r + b : r; } +#define DECOMPOSITION_AXES (3) + +static uint3_64 +morton3D(const uint64_t pid) +{ + uint64_t i, j, k; + i = j = k = 0; + + if (DECOMPOSITION_AXES == 3) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << 3 * bit; + i |= ((pid & (mask << 0)) >> 2 * bit) >> 0; + j |= ((pid & (mask << 1)) >> 2 * bit) >> 1; + k |= ((pid & (mask << 2)) >> 2 * bit) >> 2; + } + } + // Just a quick copy/paste for other decomp dims + else if (DECOMPOSITION_AXES == 2) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << 2 * bit; + i |= ((pid & (mask << 0)) >> 1 * bit) >> 0; + j |= ((pid & (mask << 1)) >> 1 * bit) >> 1; + } + } + else if (DECOMPOSITION_AXES == 1) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << 1 * bit; + i |= ((pid & (mask << 0)) >> 0 * bit) >> 0; + } + } + else { + fprintf(stderr, "Invalid DECOMPOSITION_AXES\n"); + ERRCHK_ALWAYS(0); + } + + return (uint3_64){i, j, k}; +} + +static uint64_t +morton1D(const uint3_64 pid) +{ + uint64_t i = 0; + + if (DECOMPOSITION_AXES == 3) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << bit; + i |= ((pid.x & mask) << 0) << 2 * bit; + i |= ((pid.y & mask) << 1) << 2 * bit; + i |= ((pid.z & mask) << 2) << 2 * bit; + } + } + else if (DECOMPOSITION_AXES == 2) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << bit; + i |= ((pid.x & mask) << 0) << 1 * bit; + i |= ((pid.y & mask) << 1) << 1 * bit; + } + } + else if (DECOMPOSITION_AXES == 1) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << bit; + i |= ((pid.x & mask) << 0) << 0 * bit; + } + } + else { + fprintf(stderr, "Invalid DECOMPOSITION_AXES\n"); + ERRCHK_ALWAYS(0); + } + + return i; +} +/* static uint3_64 morton3D(const uint64_t pid) { @@ -545,6 +617,7 @@ morton1D(const uint3_64 pid) } return i; } +*/ static uint3_64 decompose(const uint64_t target) @@ -1277,15 +1350,18 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh) return AC_SUCCESS; } +#define MPI_COMPUTE_ENABLED (1) +#define MPI_COMM_ENABLED (1) + AcResult acGridIntegrate(const Stream stream, const AcReal dt) { ERRCHK(grid.initialized); // acGridSynchronizeStream(stream); - const Device device = grid.device; - const int3 nn = grid.nn; - //CommData corner_data = grid.corner_data; // Do not rm: required for corners + const Device device = grid.device; + const int3 nn = grid.nn; + // CommData corner_data = grid.corner_data; // Do not rm: required for corners CommData edgex_data = grid.edgex_data; CommData edgey_data = grid.edgey_data; CommData edgez_data = grid.edgez_data; @@ -1357,6 +1433,8 @@ acGridIntegrate(const Stream stream, const AcReal dt) }; for (int isubstep = 0; isubstep < 3; ++isubstep) { + +#if MPI_COMM_ENABLED // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners acPackCommData(device, edgex_b0s, &edgex_data); acPackCommData(device, edgey_b0s, &edgey_data); @@ -1364,15 +1442,19 @@ acGridIntegrate(const Stream stream, const AcReal dt) acPackCommData(device, sidexy_b0s, &sidexy_data); acPackCommData(device, sidexz_b0s, &sidexz_data); acPackCommData(device, sideyz_b0s, &sideyz_data); +#endif +#if MPI_COMPUTE_ENABLED //////////// INNER INTEGRATION ////////////// { const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = nn; acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt); } - //////////////////////////////////////////// +//////////////////////////////////////////// +#endif // MPI_COMPUTE_ENABLED +#if MPI_COMM_ENABLED MPI_Barrier(MPI_COMM_WORLD); #if MPI_GPUDIRECT_DISABLED @@ -1436,6 +1518,8 @@ acGridIntegrate(const Stream stream, const AcReal dt) acSyncCommData(sidexy_data); acSyncCommData(sidexz_data); acSyncCommData(sideyz_data); +#endif // MPI_COMM_ENABLED +#if MPI_COMPUTE_ENABLED { // Front const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; @@ -1466,6 +1550,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt); } +#endif // MPI_COMPUTE_ENABLED acDeviceSwapBuffers(device); acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done //////////////////////////////////////////// diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh index 8d66fd2..97326ad 100644 --- a/src/core/kernels/integration.cuh +++ b/src/core/kernels/integration.cuh @@ -41,10 +41,12 @@ static __device__ __forceinline__ AcReal3 rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current, const AcReal3 rate_of_change, const AcReal dt) { - return (AcReal3){ - rk3_integrate(state_previous.x, state_current.x, rate_of_change.x, dt), - rk3_integrate(state_previous.y, state_current.y, rate_of_change.y, dt), - rk3_integrate(state_previous.z, state_current.z, rate_of_change.z, dt)}; + return (AcReal3){rk3_integrate(state_previous.x, state_current.x, rate_of_change.x, + dt), + rk3_integrate(state_previous.y, state_current.y, rate_of_change.y, + dt), + rk3_integrate(state_previous.z, state_current.z, rate_of_change.z, + dt)}; } #define rk3(state_previous, state_current, rate_of_change, dt) \ @@ -132,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr // RK3 dim3 best_dims(0, 0, 0); float best_time = INFINITY; - const int num_iterations = 10; + const int num_iterations = 5; for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) { for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) { @@ -192,9 +194,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr } } #if VERBOSE_PRINTING - printf( - "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n", - best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations); + printf("Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f " + "ms\n", + best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations); #endif /* FILE* fp = fopen("../config/rk3_tbdims.cuh", "w"); @@ -204,6 +206,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr */ rk3_tpb = best_dims; + + // Failed to find valid thread block dimensions + ERRCHK_ALWAYS(rk3_tpb.x * rk3_tpb.y * rk3_tpb.z > 0); return AC_SUCCESS; } From 9840b817d081339cc04272225a7505bad3b4b0c0 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Sun, 7 Jun 2020 21:59:33 +0300 Subject: [PATCH 03/10] Added the (hopefully final) basic test case used for the benchmarks --- samples/benchmark/main.cc | 7 ++-- samples/genbenchmarkscripts/main.c | 9 ++--- src/core/device.cc | 57 +++++++++++++++++++++--------- src/core/kernels/integration.cuh | 2 +- 4 files changed, 50 insertions(+), 25 deletions(-) diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc index dd14129..962a316 100644 --- a/samples/benchmark/main.cc +++ b/samples/benchmark/main.cc @@ -56,11 +56,12 @@ morton3D(const uint64_t pid) { uint64_t i, j, k; i = j = k = 0; + for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << 3 * bit; - i |= ((pid & (mask << 0)) >> 2 * bit) >> 0; + k |= ((pid & (mask << 0)) >> 2 * bit) >> 0; j |= ((pid & (mask << 1)) >> 2 * bit) >> 1; - k |= ((pid & (mask << 2)) >> 2 * bit) >> 2; + i |= ((pid & (mask << 2)) >> 2 * bit) >> 2; } return (uint3_64){i, j, k}; @@ -174,7 +175,7 @@ main(int argc, char** argv) */ // Percentiles - const size_t num_iters = 100; + const size_t num_iters = 1000; const double nth_percentile = 0.90; std::vector results; // ms results.reserve(num_iters); diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c index 6f160b3..a45bf1a 100644 --- a/samples/genbenchmarkscripts/main.c +++ b/samples/genbenchmarkscripts/main.c @@ -29,6 +29,7 @@ main(void) fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); fprintf(fp, "#SBATCH -n %d\n", nprocs); fprintf(fp, "#SBATCH -N %d\n", nodes); + fprintf(fp, "#SBATCH --exclusive\n"); // Modules fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); @@ -37,13 +38,13 @@ main(void) // Profile and run fprintf(fp, "mkdir -p profile_%d\n", nprocs); - const int nx = 1792; + const int nx = 256; // max size 1792; const int ny = nx; const int nz = nx; fprintf(fp, - "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d " - "%d\n", - nprocs, nx, ny, nz); + //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d " + //"%d\n", + "srun ./benchmark %d %d %d\n", nx, ny, nz); fclose(fp); } diff --git a/src/core/device.cc b/src/core/device.cc index 1e070cb..35af82d 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -527,6 +527,15 @@ morton3D(const uint64_t pid) i = j = k = 0; if (DECOMPOSITION_AXES == 3) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << 3 * bit; + k |= ((pid & (mask << 0)) >> 2 * bit) >> 0; + j |= ((pid & (mask << 1)) >> 2 * bit) >> 1; + i |= ((pid & (mask << 2)) >> 2 * bit) >> 2; + } + } + /* + else if (DECOMPOSITION_AXES == 3) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << 3 * bit; i |= ((pid & (mask << 0)) >> 2 * bit) >> 0; @@ -534,18 +543,19 @@ morton3D(const uint64_t pid) k |= ((pid & (mask << 2)) >> 2 * bit) >> 2; } } + */ // Just a quick copy/paste for other decomp dims else if (DECOMPOSITION_AXES == 2) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << 2 * bit; - i |= ((pid & (mask << 0)) >> 1 * bit) >> 0; - j |= ((pid & (mask << 1)) >> 1 * bit) >> 1; + j |= ((pid & (mask << 0)) >> 1 * bit) >> 0; + k |= ((pid & (mask << 1)) >> 1 * bit) >> 1; } } else if (DECOMPOSITION_AXES == 1) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << 1 * bit; - i |= ((pid & (mask << 0)) >> 0 * bit) >> 0; + k |= ((pid & (mask << 0)) >> 0 * bit) >> 0; } } else { @@ -562,24 +572,33 @@ morton1D(const uint3_64 pid) uint64_t i = 0; if (DECOMPOSITION_AXES == 3) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << bit; + i |= ((pid.z & mask) << 0) << 2 * bit; + i |= ((pid.y & mask) << 1) << 2 * bit; + i |= ((pid.x & mask) << 2) << 2 * bit; + } + } + /* + else if (DECOMPOSITION_AXES == 3) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << bit; i |= ((pid.x & mask) << 0) << 2 * bit; i |= ((pid.y & mask) << 1) << 2 * bit; i |= ((pid.z & mask) << 2) << 2 * bit; } - } + }*/ else if (DECOMPOSITION_AXES == 2) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << bit; - i |= ((pid.x & mask) << 0) << 1 * bit; - i |= ((pid.y & mask) << 1) << 1 * bit; + i |= ((pid.y & mask) << 0) << 1 * bit; + i |= ((pid.z & mask) << 1) << 1 * bit; } } else if (DECOMPOSITION_AXES == 1) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << bit; - i |= ((pid.x & mask) << 0) << 0 * bit; + i |= ((pid.z & mask) << 0) << 0 * bit; } } else { @@ -1204,6 +1223,8 @@ typedef struct { CommData sidexy_data; CommData sidexz_data; CommData sideyz_data; + + // int comm_cart; } Grid; static Grid grid = {}; @@ -1444,16 +1465,6 @@ acGridIntegrate(const Stream stream, const AcReal dt) acPackCommData(device, sideyz_b0s, &sideyz_data); #endif -#if MPI_COMPUTE_ENABLED - //////////// INNER INTEGRATION ////////////// - { - const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; - const int3 m2 = nn; - acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt); - } -//////////////////////////////////////////// -#endif // MPI_COMPUTE_ENABLED - #if MPI_COMM_ENABLED MPI_Barrier(MPI_COMM_WORLD); @@ -1474,7 +1485,19 @@ acGridIntegrate(const Stream stream, const AcReal dt) acTransferCommData(device, sidexy_b0s, &sidexy_data); acTransferCommData(device, sidexz_b0s, &sidexz_data); acTransferCommData(device, sideyz_b0s, &sideyz_data); +#endif // MPI_COMM_ENABLED +#if MPI_COMPUTE_ENABLED + //////////// INNER INTEGRATION ////////////// + { + const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = nn; + acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt); + } +//////////////////////////////////////////// +#endif // MPI_COMPUTE_ENABLED + +#if MPI_COMM_ENABLED // acTransferCommDataWait(corner_data); // Do not rm: required for corners acTransferCommDataWait(edgex_data); acTransferCommDataWait(edgey_data); diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh index 97326ad..4c01148 100644 --- a/src/core/kernels/integration.cuh +++ b/src/core/kernels/integration.cuh @@ -134,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr // RK3 dim3 best_dims(0, 0, 0); float best_time = INFINITY; - const int num_iterations = 5; + const int num_iterations = 10; for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) { for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) { From fa422cf4575cb0a8eace8f0409693e881fc0f709 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 10 Jun 2020 02:16:23 +0300 Subject: [PATCH 04/10] Added a better-pipelined version of the acGridIntegrate and a switch for toggling the transfer of corners --- src/core/device.cc | 211 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) diff --git a/src/core/device.cc b/src/core/device.cc index 35af82d..481b465 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -1373,9 +1373,220 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh) #define MPI_COMPUTE_ENABLED (1) #define MPI_COMM_ENABLED (1) +#define MPI_INCL_CORNERS (0) AcResult acGridIntegrate(const Stream stream, const AcReal dt) +{ + ERRCHK(grid.initialized); + acGridSynchronizeStream(stream); + + const Device device = grid.device; + const int3 nn = grid.nn; +#if MPI_INCL_CORNERS + CommData corner_data = grid.corner_data; // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + CommData edgex_data = grid.edgex_data; + CommData edgey_data = grid.edgey_data; + CommData edgez_data = grid.edgez_data; + CommData sidexy_data = grid.sidexy_data; + CommData sidexz_data = grid.sidexz_data; + CommData sideyz_data = grid.sideyz_data; + +// Corners +#if MPI_INCL_CORNERS + // Do not rm: required for corners + const int3 corner_b0s[] = { + (int3){0, 0, 0}, + (int3){NGHOST + nn.x, 0, 0}, + (int3){0, NGHOST + nn.y, 0}, + (int3){0, 0, NGHOST + nn.z}, + + (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, + (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, + (int3){0, NGHOST + nn.y, NGHOST + nn.z}, + (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, + }; +#endif // MPI_INCL_CORNERS + + // Edges X + const int3 edgex_b0s[] = { + (int3){NGHOST, 0, 0}, + (int3){NGHOST, NGHOST + nn.y, 0}, + + (int3){NGHOST, 0, NGHOST + nn.z}, + (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z}, + }; + + // Edges Y + const int3 edgey_b0s[] = { + (int3){0, NGHOST, 0}, + (int3){NGHOST + nn.x, NGHOST, 0}, + + (int3){0, NGHOST, NGHOST + nn.z}, + (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z}, + }; + + // Edges Z + const int3 edgez_b0s[] = { + (int3){0, 0, NGHOST}, + (int3){NGHOST + nn.x, 0, NGHOST}, + + (int3){0, NGHOST + nn.y, NGHOST}, + (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST}, + }; + + // Sides XY + const int3 sidexy_b0s[] = { + (int3){NGHOST, NGHOST, 0}, // + (int3){NGHOST, NGHOST, NGHOST + nn.z}, // + }; + + // Sides XZ + const int3 sidexz_b0s[] = { + (int3){NGHOST, 0, NGHOST}, // + (int3){NGHOST, NGHOST + nn.y, NGHOST}, // + }; + + // Sides YZ + const int3 sideyz_b0s[] = { + (int3){0, NGHOST, NGHOST}, // + (int3){NGHOST + nn.x, NGHOST, NGHOST}, // + }; + + for (int isubstep = 0; isubstep < 3; ++isubstep) { + acDeviceSynchronizeStream(device, STREAM_ALL); + MPI_Barrier(MPI_COMM_WORLD); + +#if MPI_COMPUTE_ENABLED + acPackCommData(device, sidexy_b0s, &sidexy_data); + acPackCommData(device, sidexz_b0s, &sidexz_data); + acPackCommData(device, sideyz_b0s, &sideyz_data); +#endif // MPI_COMPUTE_ENABLED + +#if MPI_COMM_ENABLED + acTransferCommData(device, sidexy_b0s, &sidexy_data); + acTransferCommData(device, sidexz_b0s, &sidexz_data); + acTransferCommData(device, sideyz_b0s, &sideyz_data); +#endif // MPI_COMM_ENABLED + +#if MPI_COMPUTE_ENABLED + //////////// INNER INTEGRATION ////////////// + { + const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = nn; + acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt); + } + + acPackCommData(device, edgex_b0s, &edgex_data); + acPackCommData(device, edgey_b0s, &edgey_data); + acPackCommData(device, edgez_b0s, &edgez_data); +#endif // MPI_COMPUTE_ENABLED + +#if MPI_COMM_ENABLED + acTransferCommDataWait(sidexy_data); + acUnpinCommData(device, &sidexy_data); + acTransferCommDataWait(sidexz_data); + acUnpinCommData(device, &sidexz_data); + acTransferCommDataWait(sideyz_data); + acUnpinCommData(device, &sideyz_data); + + acTransferCommData(device, edgex_b0s, &edgex_data); + acTransferCommData(device, edgey_b0s, &edgey_data); + acTransferCommData(device, edgez_b0s, &edgez_data); +#endif // MPI_COMM_ENABLED + +#if MPI_COMPUTE_ENABLED +#if MPI_INCL_CORNERS + acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + acUnpackCommData(device, sidexy_b0s, &sidexy_data); + acUnpackCommData(device, sidexz_b0s, &sidexz_data); + acUnpackCommData(device, sideyz_b0s, &sideyz_data); +#endif // MPI_COMPUTE_ENABLED + +#if MPI_COMM_ENABLED + acTransferCommDataWait(edgex_data); + acUnpinCommData(device, &edgex_data); + acTransferCommDataWait(edgey_data); + acUnpinCommData(device, &edgey_data); + acTransferCommDataWait(edgez_data); + acUnpinCommData(device, &edgez_data); + +#if MPI_INCL_CORNERS + acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS +#endif // MPI_COMM_ENABLED + +#if MPI_COMPUTE_ENABLED + acUnpackCommData(device, edgex_b0s, &edgex_data); + acUnpackCommData(device, edgey_b0s, &edgey_data); + acUnpackCommData(device, edgez_b0s, &edgez_data); +#endif // MPI_COMPUTE_ENABLED + +#if MPI_COMM_ENABLED +#if MPI_INCL_CORNERS + acTransferCommDataWait(corner_data); // Do not rm: required for corners + acUnpinCommData(device, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS +#endif // MPI_COMM_ENABLED +#if MPI_COMPUTE_ENABLED +#if MPI_INCL_CORNERS + acUnpackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS +#endif // MPI_COMPUTE_ENABLED + + // Wait for unpacking + acSyncCommData(sidexy_data); + acSyncCommData(sidexz_data); + acSyncCommData(sideyz_data); + acSyncCommData(edgex_data); + acSyncCommData(edgey_data); + acSyncCommData(edgez_data); +#if MPI_INCL_CORNERS + acSyncCommData(corner_data); // Do not rm: required for corners +#endif // MPI_INCL_CORNERS + +#if MPI_COMPUTE_ENABLED + { // Front + const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; + const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt); + } + { // Back + const int3 m1 = (int3){NGHOST, NGHOST, nn.z}; + const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt); + } + { // Bottom + const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt); + } + { // Top + const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST}; + const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt); + } + { // Left + const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt); + } + { // Right + const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST}; + const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; + acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt); + } +#endif // MPI_COMPUTE_ENABLED + acDeviceSwapBuffers(device); + } + + return AC_SUCCESS; +} + +AcResult +acGridIntegrateORIGINAL(const Stream stream, const AcReal dt) { ERRCHK(grid.initialized); // acGridSynchronizeStream(stream); From 1cdb9e2ce7c3eb675bf466b6bd5e3a6432c956d3 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 10 Jun 2020 12:32:56 +0300 Subject: [PATCH 05/10] Added missing synchronization to the end of the new integration function --- src/core/device.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index 481b465..f47f7a0 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -12,6 +12,11 @@ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) #define MPI_GPUDIRECT_DISABLED (0) +#define DECOMPOSITION_AXES (3) +#define MPI_COMPUTE_ENABLED (1) +#define MPI_COMM_ENABLED (1) +#define MPI_INCL_CORNERS (0) + AcResult acDevicePrintInfo(const Device device) { @@ -518,8 +523,6 @@ mod(const int a, const int b) return r < 0 ? r + b : r; } -#define DECOMPOSITION_AXES (3) - static uint3_64 morton3D(const uint64_t pid) { @@ -1371,10 +1374,6 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh) return AC_SUCCESS; } -#define MPI_COMPUTE_ENABLED (1) -#define MPI_COMM_ENABLED (1) -#define MPI_INCL_CORNERS (0) - AcResult acGridIntegrate(const Stream stream, const AcReal dt) { @@ -1582,6 +1581,9 @@ acGridIntegrate(const Stream stream, const AcReal dt) acDeviceSwapBuffers(device); } + // Does not have to be STREAM_ALL, only the streams used with + // acDeviceIntegrateSubstep (less likely to break this way though) + acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done return AC_SUCCESS; } From 0e4b39d6d78210b57bd8b38dd09ec770bde2e1b4 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Thu, 11 Jun 2020 11:28:52 +0300 Subject: [PATCH 06/10] Added a toggle for using pinned memory --- src/core/device.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index f47f7a0..01d5e87 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -16,6 +16,7 @@ #define MPI_COMPUTE_ENABLED (1) #define MPI_COMM_ENABLED (1) #define MPI_INCL_CORNERS (0) +#define MPI_USE_PINNED (1) AcResult acDevicePrintInfo(const Device device) @@ -1165,7 +1166,7 @@ acTransferCommData(const Device device, // const int npid = getPid(pid3d + neighbor, decomp); PackedData* dst = &data->dsts[b0_idx]; - if (onTheSameNode(pid, npid)) { + if (onTheSameNode(pid, npid) || !MPI_USE_PINNED) { MPI_Irecv(dst->data, count, datatype, npid, b0_idx, // MPI_COMM_WORLD, &data->recv_reqs[b0_idx]); dst->pinned = false; @@ -1187,7 +1188,7 @@ acTransferCommData(const Device device, // const int npid = getPid(pid3d - neighbor, decomp); PackedData* src = &data->srcs[b0_idx]; - if (onTheSameNode(pid, npid)) { + if (onTheSameNode(pid, npid) || !MPI_USE_PINNED) { cudaStreamSynchronize(data->streams[b0_idx]); MPI_Isend(src->data, count, datatype, npid, b0_idx, // MPI_COMM_WORLD, &data->send_reqs[b0_idx]); From f04e347c451c1555ce24b0e24c37cfcf699c6289 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 24 Jun 2020 15:13:15 +0300 Subject: [PATCH 07/10] Cleanup before merging to the master merge candidate branch --- samples/bwtest/CMakeLists.txt | 2 +- samples/bwtest/main.c | 28 ++++ src/core/CMakeLists.txt | 2 +- src/core/device.cc | 289 +++------------------------------- 4 files changed, 52 insertions(+), 269 deletions(-) diff --git a/samples/bwtest/CMakeLists.txt b/samples/bwtest/CMakeLists.txt index cd4329f..229e7e2 100644 --- a/samples/bwtest/CMakeLists.txt +++ b/samples/bwtest/CMakeLists.txt @@ -5,5 +5,5 @@ find_package(OpenMP) find_package(CUDAToolkit) add_executable(bwtest main.c) -target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static) +target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver) target_compile_options(bwtest PRIVATE -O3) diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c index 73f4387..9dd25d9 100644 --- a/samples/bwtest/main.c +++ b/samples/bwtest/main.c @@ -7,6 +7,7 @@ #include #include +#include // CUDA driver API #include "timer_hires.h" // From src/common @@ -56,6 +57,17 @@ allocDevice(const size_t bytes) static uint8_t* allocDevicePinned(const size_t bytes) { + #define USE_CUDA_DRIVER_PINNING (1) + #if USE_CUDA_DRIVER_PINNING + uint8_t* arr = allocDevice(bytes); + + unsigned int flag = 1; + CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)arr); + + errchk(retval == CUDA_SUCCESS); + return arr; + + #else uint8_t* arr; // Standard (20 GiB/s internode, 85 GiB/s intranode) // const cudaError_t retval = cudaMalloc((void**)&arr, bytes); @@ -65,8 +77,24 @@ allocDevicePinned(const size_t bytes) const cudaError_t retval = cudaMallocHost((void**)&arr, bytes); errchk(retval == cudaSuccess); return arr; + #endif } +/* +static uint8_t* +allocDevicePinned(const size_t bytes) +{ + uint8_t* arr; + // Standard (20 GiB/s internode, 85 GiB/s intranode) + // const cudaError_t retval = cudaMalloc((void**)&arr, bytes); + // Unified mem (5 GiB/s internode, 6 GiB/s intranode) + // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal); + // Pinned (40 GiB/s internode, 10 GiB/s intranode) + const cudaError_t retval = cudaMallocHost((void**)&arr, bytes); + errchk(retval == cudaSuccess); + return arr; +}*/ + static void freeDevice(uint8_t* arr) { diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 757cbfe..81bcf14 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -2,7 +2,7 @@ find_package(CUDAToolkit) ## Astaroth Core add_library(astaroth_core STATIC device.cc node.cc astaroth.cc) -target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart) +target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver) ## Options if (MPI_ENABLED) diff --git a/src/core/device.cc b/src/core/device.cc index 01d5e87..688ed89 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -10,13 +10,16 @@ #include "kernels/kernels.h" #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) -#define MPI_GPUDIRECT_DISABLED (0) -#define DECOMPOSITION_AXES (3) +#define MPI_GPUDIRECT_DISABLED (0) // Buffer through host memory, deprecated +#define MPI_DECOMPOSITION_AXES (3) #define MPI_COMPUTE_ENABLED (1) #define MPI_COMM_ENABLED (1) #define MPI_INCL_CORNERS (0) -#define MPI_USE_PINNED (1) +#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory +#define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost + +#include // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set) AcResult acDevicePrintInfo(const Device device) @@ -530,7 +533,7 @@ morton3D(const uint64_t pid) uint64_t i, j, k; i = j = k = 0; - if (DECOMPOSITION_AXES == 3) { + if (MPI_DECOMPOSITION_AXES == 3) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << 3 * bit; k |= ((pid & (mask << 0)) >> 2 * bit) >> 0; @@ -538,32 +541,22 @@ morton3D(const uint64_t pid) i |= ((pid & (mask << 2)) >> 2 * bit) >> 2; } } - /* - else if (DECOMPOSITION_AXES == 3) { - for (int bit = 0; bit <= 21; ++bit) { - const uint64_t mask = 0x1l << 3 * bit; - i |= ((pid & (mask << 0)) >> 2 * bit) >> 0; - j |= ((pid & (mask << 1)) >> 2 * bit) >> 1; - k |= ((pid & (mask << 2)) >> 2 * bit) >> 2; - } - } - */ // Just a quick copy/paste for other decomp dims - else if (DECOMPOSITION_AXES == 2) { + else if (MPI_DECOMPOSITION_AXES == 2) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << 2 * bit; j |= ((pid & (mask << 0)) >> 1 * bit) >> 0; k |= ((pid & (mask << 1)) >> 1 * bit) >> 1; } } - else if (DECOMPOSITION_AXES == 1) { + else if (MPI_DECOMPOSITION_AXES == 1) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << 1 * bit; k |= ((pid & (mask << 0)) >> 0 * bit) >> 0; } } else { - fprintf(stderr, "Invalid DECOMPOSITION_AXES\n"); + fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n"); ERRCHK_ALWAYS(0); } @@ -575,7 +568,7 @@ morton1D(const uint3_64 pid) { uint64_t i = 0; - if (DECOMPOSITION_AXES == 3) { + if (MPI_DECOMPOSITION_AXES == 3) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << bit; i |= ((pid.z & mask) << 0) << 2 * bit; @@ -583,64 +576,26 @@ morton1D(const uint3_64 pid) i |= ((pid.x & mask) << 2) << 2 * bit; } } - /* - else if (DECOMPOSITION_AXES == 3) { - for (int bit = 0; bit <= 21; ++bit) { - const uint64_t mask = 0x1l << bit; - i |= ((pid.x & mask) << 0) << 2 * bit; - i |= ((pid.y & mask) << 1) << 2 * bit; - i |= ((pid.z & mask) << 2) << 2 * bit; - } - }*/ - else if (DECOMPOSITION_AXES == 2) { + else if (MPI_DECOMPOSITION_AXES == 2) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << bit; i |= ((pid.y & mask) << 0) << 1 * bit; i |= ((pid.z & mask) << 1) << 1 * bit; } } - else if (DECOMPOSITION_AXES == 1) { + else if (MPI_DECOMPOSITION_AXES == 1) { for (int bit = 0; bit <= 21; ++bit) { const uint64_t mask = 0x1l << bit; i |= ((pid.z & mask) << 0) << 0 * bit; } } else { - fprintf(stderr, "Invalid DECOMPOSITION_AXES\n"); + fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n"); ERRCHK_ALWAYS(0); } return i; } -/* -static uint3_64 -morton3D(const uint64_t pid) -{ - uint64_t i, j, k; - i = j = k = 0; - for (int bit = 0; bit <= 21; ++bit) { - const uint64_t mask = 0x1l << 3 * bit; - i |= ((pid & (mask << 0)) >> 2 * bit) >> 0; - j |= ((pid & (mask << 1)) >> 2 * bit) >> 1; - k |= ((pid & (mask << 2)) >> 2 * bit) >> 2; - } - - return (uint3_64){i, j, k}; -} - -static uint64_t -morton1D(const uint3_64 pid) -{ - uint64_t i = 0; - for (int bit = 0; bit <= 21; ++bit) { - const uint64_t mask = 0x1l << bit; - i |= ((pid.x & mask) << 0) << 2 * bit; - i |= ((pid.y & mask) << 1) << 2 * bit; - i |= ((pid.z & mask) << 2) << 2 * bit; - } - return i; -} -*/ static uint3_64 decompose(const uint64_t target) @@ -701,9 +656,17 @@ acCreatePackedData(const int3 dims) const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes)); + #if MPI_USE_CUDA_DRIVER_PINNING + ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes)); + + unsigned int flag = 1; + CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned); + ERRCHK_ALWAYS(retval == CUDA_SUCCESS); + #else ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes)); // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly // slower than pinned (38 ms vs. 125 ms) + #fi // USE_CUDA_DRIVER_PINNING return data; } @@ -1588,214 +1551,6 @@ acGridIntegrate(const Stream stream, const AcReal dt) return AC_SUCCESS; } -AcResult -acGridIntegrateORIGINAL(const Stream stream, const AcReal dt) -{ - ERRCHK(grid.initialized); - // acGridSynchronizeStream(stream); - - const Device device = grid.device; - const int3 nn = grid.nn; - // CommData corner_data = grid.corner_data; // Do not rm: required for corners - CommData edgex_data = grid.edgex_data; - CommData edgey_data = grid.edgey_data; - CommData edgez_data = grid.edgez_data; - CommData sidexy_data = grid.sidexy_data; - CommData sidexz_data = grid.sidexz_data; - CommData sideyz_data = grid.sideyz_data; - - acDeviceSynchronizeStream(device, stream); - - // Corners - /* - // Do not rm: required for corners - const int3 corner_b0s[] = { - (int3){0, 0, 0}, - (int3){NGHOST + nn.x, 0, 0}, - (int3){0, NGHOST + nn.y, 0}, - (int3){0, 0, NGHOST + nn.z}, - - (int3){NGHOST + nn.x, NGHOST + nn.y, 0}, - (int3){NGHOST + nn.x, 0, NGHOST + nn.z}, - (int3){0, NGHOST + nn.y, NGHOST + nn.z}, - (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z}, - }; - */ - - // Edges X - const int3 edgex_b0s[] = { - (int3){NGHOST, 0, 0}, - (int3){NGHOST, NGHOST + nn.y, 0}, - - (int3){NGHOST, 0, NGHOST + nn.z}, - (int3){NGHOST, NGHOST + nn.y, NGHOST + nn.z}, - }; - - // Edges Y - const int3 edgey_b0s[] = { - (int3){0, NGHOST, 0}, - (int3){NGHOST + nn.x, NGHOST, 0}, - - (int3){0, NGHOST, NGHOST + nn.z}, - (int3){NGHOST + nn.x, NGHOST, NGHOST + nn.z}, - }; - - // Edges Z - const int3 edgez_b0s[] = { - (int3){0, 0, NGHOST}, - (int3){NGHOST + nn.x, 0, NGHOST}, - - (int3){0, NGHOST + nn.y, NGHOST}, - (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST}, - }; - - // Sides XY - const int3 sidexy_b0s[] = { - (int3){NGHOST, NGHOST, 0}, // - (int3){NGHOST, NGHOST, NGHOST + nn.z}, // - }; - - // Sides XZ - const int3 sidexz_b0s[] = { - (int3){NGHOST, 0, NGHOST}, // - (int3){NGHOST, NGHOST + nn.y, NGHOST}, // - }; - - // Sides YZ - const int3 sideyz_b0s[] = { - (int3){0, NGHOST, NGHOST}, // - (int3){NGHOST + nn.x, NGHOST, NGHOST}, // - }; - - for (int isubstep = 0; isubstep < 3; ++isubstep) { - -#if MPI_COMM_ENABLED - // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners - acPackCommData(device, edgex_b0s, &edgex_data); - acPackCommData(device, edgey_b0s, &edgey_data); - acPackCommData(device, edgez_b0s, &edgez_data); - acPackCommData(device, sidexy_b0s, &sidexy_data); - acPackCommData(device, sidexz_b0s, &sidexz_data); - acPackCommData(device, sideyz_b0s, &sideyz_data); -#endif - -#if MPI_COMM_ENABLED - MPI_Barrier(MPI_COMM_WORLD); - -#if MPI_GPUDIRECT_DISABLED - // acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners - acTransferCommDataToHost(device, &edgex_data); - acTransferCommDataToHost(device, &edgey_data); - acTransferCommDataToHost(device, &edgez_data); - acTransferCommDataToHost(device, &sidexy_data); - acTransferCommDataToHost(device, &sidexz_data); - acTransferCommDataToHost(device, &sideyz_data); -#endif - - // acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners - acTransferCommData(device, edgex_b0s, &edgex_data); - acTransferCommData(device, edgey_b0s, &edgey_data); - acTransferCommData(device, edgez_b0s, &edgez_data); - acTransferCommData(device, sidexy_b0s, &sidexy_data); - acTransferCommData(device, sidexz_b0s, &sidexz_data); - acTransferCommData(device, sideyz_b0s, &sideyz_data); -#endif // MPI_COMM_ENABLED - -#if MPI_COMPUTE_ENABLED - //////////// INNER INTEGRATION ////////////// - { - const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; - const int3 m2 = nn; - acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt); - } -//////////////////////////////////////////// -#endif // MPI_COMPUTE_ENABLED - -#if MPI_COMM_ENABLED - // acTransferCommDataWait(corner_data); // Do not rm: required for corners - acTransferCommDataWait(edgex_data); - acTransferCommDataWait(edgey_data); - acTransferCommDataWait(edgez_data); - acTransferCommDataWait(sidexy_data); - acTransferCommDataWait(sidexz_data); - acTransferCommDataWait(sideyz_data); - -#if MPI_GPUDIRECT_DISABLED - // acTransferCommDataToDevice(device, &corner_data); // Do not rm: required for corners - acTransferCommDataToDevice(device, &edgex_data); - acTransferCommDataToDevice(device, &edgey_data); - acTransferCommDataToDevice(device, &edgez_data); - acTransferCommDataToDevice(device, &sidexy_data); - acTransferCommDataToDevice(device, &sidexz_data); - acTransferCommDataToDevice(device, &sideyz_data); -#endif - - // acUnpinCommData(device, &corner_data); // Do not rm: required for corners - acUnpinCommData(device, &edgex_data); - acUnpinCommData(device, &edgey_data); - acUnpinCommData(device, &edgez_data); - acUnpinCommData(device, &sidexy_data); - acUnpinCommData(device, &sidexz_data); - acUnpinCommData(device, &sideyz_data); - - // acUnpackCommData(device, corner_b0s, &corner_data); - acUnpackCommData(device, edgex_b0s, &edgex_data); - acUnpackCommData(device, edgey_b0s, &edgey_data); - acUnpackCommData(device, edgez_b0s, &edgez_data); - acUnpackCommData(device, sidexy_b0s, &sidexy_data); - acUnpackCommData(device, sidexz_b0s, &sidexz_data); - acUnpackCommData(device, sideyz_b0s, &sideyz_data); - //////////// OUTER INTEGRATION ////////////// - - // Wait for unpacking - // acSyncCommData(corner_data); // Do not rm: required for corners - acSyncCommData(edgex_data); - acSyncCommData(edgey_data); - acSyncCommData(edgez_data); - acSyncCommData(sidexy_data); - acSyncCommData(sidexz_data); - acSyncCommData(sideyz_data); -#endif // MPI_COMM_ENABLED -#if MPI_COMPUTE_ENABLED - { // Front - const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; - const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_0, isubstep, m1, m2, dt); - } - { // Back - const int3 m1 = (int3){NGHOST, NGHOST, nn.z}; - const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_1, isubstep, m1, m2, dt); - } - { // Bottom - const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST}; - const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_2, isubstep, m1, m2, dt); - } - { // Top - const int3 m1 = (int3){NGHOST, nn.y, 2 * NGHOST}; - const int3 m2 = m1 + (int3){nn.x, NGHOST, nn.z - 2 * NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_3, isubstep, m1, m2, dt); - } - { // Left - const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST}; - const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_4, isubstep, m1, m2, dt); - } - { // Right - const int3 m1 = (int3){nn.x, 2 * NGHOST, 2 * NGHOST}; - const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; - acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt); - } -#endif // MPI_COMPUTE_ENABLED - acDeviceSwapBuffers(device); - acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done - //////////////////////////////////////////// - } - - return AC_SUCCESS; -} - AcResult acGridPeriodicBoundconds(const Stream stream) { From 88f99c12e4d3a035bcc53db630adc3c8eeb4db52 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 24 Jun 2020 15:20:43 +0300 Subject: [PATCH 08/10] Fixed #fi -> #endif --- src/core/device.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/device.cc b/src/core/device.cc index 688ed89..181e802 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -666,7 +666,7 @@ acCreatePackedData(const int3 dims) ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes)); // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly // slower than pinned (38 ms vs. 125 ms) - #fi // USE_CUDA_DRIVER_PINNING + #endif // USE_CUDA_DRIVER_PINNING return data; } From 3c3b2a188593f8b2900012788b46f2b3f50af451 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 24 Jun 2020 15:35:19 +0300 Subject: [PATCH 09/10] Reverted the default settings to what they were before merge. Note: LFORCING (1) is potentially not tested properly, TODO recheck. --- CMakeLists.txt | 4 ++-- acc/mhd_solver/stencil_kernel.ac | 4 ++-- config/astaroth.conf | 6 +++--- src/utils/modelsolver.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04100bc..682be55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project(astaroth C CXX CUDA) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) ## Project-wide compilation flags -set(COMMON_FLAGS "-mavx -Wall -Wextra -Wdouble-promotion -Wfloat-conversion -Wshadow") +set(COMMON_FLAGS "-mavx -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") set(CMAKE_C_STANDARD 11) @@ -19,7 +19,7 @@ set(CMAKE_CXX_STANDARD 11) find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_... cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}") -set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Wdouble-promotion,-Wfloat-conversion,-Wshadow") +set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}") diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac index 0f37ea5..905eb65 100644 --- a/acc/mhd_solver/stencil_kernel.ac +++ b/acc/mhd_solver/stencil_kernel.ac @@ -5,8 +5,8 @@ #define LMAGNETIC (1) #define LENTROPY (1) #define LTEMPERATURE (0) -#define LFORCING (0) -#define LUPWD (0) +#define LFORCING (1) +#define LUPWD (1) #define LSINK (0) #define AC_THERMAL_CONDUCTIVITY (0.001) // TODO: make an actual config parameter diff --git a/config/astaroth.conf b/config/astaroth.conf index 83e93d9..190948b 100644 --- a/config/astaroth.conf +++ b/config/astaroth.conf @@ -5,9 +5,9 @@ * "Compile-time" params * ============================================================================= */ -AC_nx = 256 -AC_ny = 256 -AC_nz = 256 +AC_nx = 128 +AC_ny = 128 +AC_nz = 128 AC_dsx = 0.04908738521 AC_dsy = 0.04908738521 diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c index e482bd9..d9446eb 100644 --- a/src/utils/modelsolver.c +++ b/src/utils/modelsolver.c @@ -38,8 +38,8 @@ #define LMAGNETIC (1) #define LENTROPY (1) #define LTEMPERATURE (0) -#define LFORCING (0) -#define LUPWD (0) +#define LFORCING (1) +#define LUPWD (1) #define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter typedef AcReal Scalar; From 0d1c5b3911f502c3fe8a7d91b344a81921a2d822 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 24 Jun 2020 15:56:30 +0300 Subject: [PATCH 10/10] Autoformatted --- samples/bwtest/main.c | 47 ++++++++++++-------- src/core/device.cc | 21 ++++----- src/core/kernels/kernels.cu | 25 ++++++----- src/core/kernels/reductions.cuh | 5 ++- src/utils/modelsolver.c | 78 ++++++++++++++++++++++----------- 5 files changed, 108 insertions(+), 68 deletions(-) diff --git a/samples/bwtest/main.c b/samples/bwtest/main.c index 9dd25d9..35d98d5 100644 --- a/samples/bwtest/main.c +++ b/samples/bwtest/main.c @@ -6,15 +6,21 @@ #include -#include #include // CUDA driver API +#include #include "timer_hires.h" // From src/common //#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes #define BLOCK_SIZE (256 * 256 * 3 * 8 * 8) -#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }} +#define errchk(x) \ + { \ + if (!(x)) { \ + fprintf(stderr, "errchk(%s) failed", #x); \ + assert(x); \ + } \ + } /* Findings: @@ -57,17 +63,18 @@ allocDevice(const size_t bytes) static uint8_t* allocDevicePinned(const size_t bytes) { - #define USE_CUDA_DRIVER_PINNING (1) - #if USE_CUDA_DRIVER_PINNING +#define USE_CUDA_DRIVER_PINNING (1) +#if USE_CUDA_DRIVER_PINNING uint8_t* arr = allocDevice(bytes); unsigned int flag = 1; - CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)arr); + CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)arr); errchk(retval == CUDA_SUCCESS); return arr; - #else +#else uint8_t* arr; // Standard (20 GiB/s internode, 85 GiB/s intranode) // const cudaError_t retval = cudaMalloc((void**)&arr, bytes); @@ -77,7 +84,7 @@ allocDevicePinned(const size_t bytes) const cudaError_t retval = cudaMallocHost((void**)&arr, bytes); errchk(retval == cudaSuccess); return arr; - #endif +#endif } /* @@ -267,7 +274,6 @@ send_h2d(uint8_t* src, uint8_t* dst) cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice); } - static void sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst) { @@ -327,10 +333,10 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_ MPI_Barrier(MPI_COMM_WORLD); } - static void -measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst, - uint8_t* hsrc, uint8_t* ddst) +measurebw2(const char* msg, const size_t bytes, + void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst, + uint8_t* hsrc, uint8_t* ddst) { const size_t num_samples = 100; @@ -414,8 +420,8 @@ main(void) measurebw("Bidirectional bandwidth, twoway (Host)", // 2 * BLOCK_SIZE, sendrecv_twoway, src, dst); measurebw("Bidirectional bandwidth, async multiple (Host)", // - 2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); - //measurebw("Bidirectional bandwidth, async multiple parallel (Host)", // + 2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); + // measurebw("Bidirectional bandwidth, async multiple parallel (Host)", // // 2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst); freeHost(src); @@ -434,11 +440,12 @@ main(void) measurebw("Bidirectional bandwidth, twoway (Device)", // 2 * BLOCK_SIZE, sendrecv_twoway, src, dst); measurebw("Bidirectional bandwidth, async multiple (Device)", // - 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); - //measurebw("Bidirectional bandwidth, async multiple parallel (Device)", // + 2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); + // measurebw("Bidirectional bandwidth, async multiple parallel (Device)", // // 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst); measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", // - 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst); + 2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, + dst); freeDevice(src); freeDevice(dst); @@ -456,7 +463,7 @@ main(void) measurebw("Bidirectional bandwidth, twoway (Device, pinned)", // 2 * BLOCK_SIZE, sendrecv_twoway, src, dst); measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", // - 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); + 2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst); freeDevice(src); freeDevice(dst); @@ -472,7 +479,8 @@ main(void) measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst); measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst); - measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst); + measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, + ddst); freeDevice(dsrc); freeDevice(ddst); @@ -490,7 +498,8 @@ main(void) measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst); measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst); - measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst); + measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, + hsrc, ddst); freeDevice(dsrc); freeDevice(ddst); diff --git a/src/core/device.cc b/src/core/device.cc index 181e802..8cda677 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -16,7 +16,7 @@ #define MPI_COMPUTE_ENABLED (1) #define MPI_COMM_ENABLED (1) #define MPI_INCL_CORNERS (0) -#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory +#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost #include // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set) @@ -656,17 +656,18 @@ acCreatePackedData(const int3 dims) const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes)); - #if MPI_USE_CUDA_DRIVER_PINNING - ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes)); +#if MPI_USE_CUDA_DRIVER_PINNING + ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes)); - unsigned int flag = 1; - CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)data.data_pinned); - ERRCHK_ALWAYS(retval == CUDA_SUCCESS); - #else + unsigned int flag = 1; + CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)data.data_pinned); + ERRCHK_ALWAYS(retval == CUDA_SUCCESS); +#else ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes)); - // ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly - // slower than pinned (38 ms vs. 125 ms) - #endif // USE_CUDA_DRIVER_PINNING +// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly +// slower than pinned (38 ms vs. 125 ms) +#endif // USE_CUDA_DRIVER_PINNING return data; } diff --git a/src/core/kernels/kernels.cu b/src/core/kernels/kernels.cu index 3c59486..2878f38 100644 --- a/src/core/kernels/kernels.cu +++ b/src/core/kernels/kernels.cu @@ -75,17 +75,20 @@ exp(const acComplex& val) { return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y)); } -static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b) +static __device__ inline acComplex +operator*(const AcReal& a, const acComplex& b) { return (acComplex){a * b.x, a * b.y}; } -static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a) +static __device__ inline acComplex +operator*(const acComplex& b, const AcReal& a) { return (acComplex){a * b.x, a * b.y}; } -static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b) +static __device__ inline acComplex +operator*(const acComplex& a, const acComplex& b) { return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x}; } @@ -116,7 +119,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, device->streams[stream])); + cudaMemcpyHostToDevice, device->streams[stream])); return AC_SUCCESS; } @@ -141,7 +144,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info; ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, device->streams[stream])); + cudaMemcpyHostToDevice, device->streams[stream])); return AC_SUCCESS; } @@ -165,7 +168,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info; ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, device->streams[stream])); + cudaMemcpyHostToDevice, device->streams[stream])); return AC_SUCCESS; } @@ -179,10 +182,10 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa } if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) { - fprintf( - stderr, - "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. Skipping.\n", - value.x, value.y, value.z, int3param_names[param]); + fprintf(stderr, + "WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. " + "Skipping.\n", + value.x, value.y, value.z, int3param_names[param]); return AC_FAILURE; } @@ -229,7 +232,7 @@ acDeviceLoadDefaultUniforms(const Device device) { cudaSetDevice(device->id); - // clang-format off +// clang-format off // Scalar #define LOAD_DEFAULT_UNIFORM(X) acDeviceLoadScalarUniform(device, STREAM_DEFAULT, X, X##_DEFAULT_VALUE); AC_FOR_USER_REAL_PARAM_TYPES(LOAD_DEFAULT_UNIFORM) diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh index 8877e7e..1f40df3 100644 --- a/src/core/kernels/reductions.cuh +++ b/src/core/kernels/reductions.cuh @@ -92,8 +92,9 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz); assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz); - dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter( - src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]); + dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)], + src1[IDX(src_idx)], + src2[IDX(src_idx)]); } template diff --git a/src/utils/modelsolver.c b/src/utils/modelsolver.c index d9446eb..4c2edcf 100644 --- a/src/utils/modelsolver.c +++ b/src/utils/modelsolver.c @@ -103,11 +103,16 @@ first_derivative(const Scalar* pencil, const Scalar inv_ds) #elif STENCIL_ORDER == 4 const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)}; #elif STENCIL_ORDER == 6 - const Scalar coefficients[] = {0, (Scalar)(3.0 / 4.0), (Scalar)(-3.0 / 20.0), - (Scalar)(1.0 / 60.0)}; + const Scalar coefficients[] = { + 0, + (Scalar)(3.0 / 4.0), + (Scalar)(-3.0 / 20.0), + (Scalar)(1.0 / 60.0), + }; #elif STENCIL_ORDER == 8 - const Scalar coefficients[] = {0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), - (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0)}; + const Scalar coefficients[] = { + 0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0), + }; #endif #define MID (STENCIL_ORDER / 2) @@ -126,15 +131,23 @@ second_derivative(const Scalar* pencil, const Scalar inv_ds) #if STENCIL_ORDER == 2 const Scalar coefficients[] = {-2, 1}; #elif STENCIL_ORDER == 4 - const Scalar coefficients[] = {(Scalar)(-5.0 / 2.0), (Scalar)(4.0 / 3.0), - (Scalar)(-1.0 / 12.0)}; + const Scalar coefficients[] = { + (Scalar)(-5.0 / 2.0), + (Scalar)(4.0 / 3.0), + (Scalar)(-1.0 / 12.0), + }; #elif STENCIL_ORDER == 6 - const Scalar coefficients[] = {(Scalar)(-49.0 / 18.0), (Scalar)(3.0 / 2.0), - (Scalar)(-3.0 / 20.0), (Scalar)(1.0 / 90.0)}; + const Scalar coefficients[] = { + (Scalar)(-49.0 / 18.0), + (Scalar)(3.0 / 2.0), + (Scalar)(-3.0 / 20.0), + (Scalar)(1.0 / 90.0), + }; #elif STENCIL_ORDER == 8 - const Scalar coefficients[] = {(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0), - (Scalar)(-1.0 / 5.0), (Scalar)(8.0 / 315.0), - (Scalar)(-1.0 / 560.0)}; + const Scalar coefficients[] = { + (Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0), (Scalar)(-1.0 / 5.0), + (Scalar)(8.0 / 315.0), (Scalar)(-1.0 / 560.0), + }; #endif #define MID (STENCIL_ORDER / 2) @@ -156,16 +169,27 @@ cross_derivative(const Scalar* pencil_a, const Scalar* pencil_b, const Scalar in const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)}; #elif STENCIL_ORDER == 4 const Scalar coefficients[] = { - 0, (Scalar)(1.0 / 32.0), - (Scalar)(1.0 / 64.0)}; // TODO correct coefficients, these are just placeholders + 0, + (Scalar)(1.0 / 32.0), + (Scalar)(1.0 / 64.0), + }; // TODO correct coefficients, these are just placeholders #elif STENCIL_ORDER == 6 const Scalar fac = ((Scalar)(1. / 720.)); - const Scalar coefficients[] = {0 * fac, (Scalar)(270.0) * fac, (Scalar)(-27.0) * fac, - (Scalar)(2.0) * fac}; + const Scalar coefficients[] = { + 0 * fac, + (Scalar)(270.0) * fac, + (Scalar)(-27.0) * fac, + (Scalar)(2.0) * fac, + }; #elif STENCIL_ORDER == 8 const Scalar fac = ((Scalar)(1. / 20160.)); - const Scalar coefficients[] = {0 * fac, (Scalar)(8064.) * fac, (Scalar)(-1008.) * fac, - (Scalar)(128.) * fac, (Scalar)(-9.) * fac}; + const Scalar coefficients[] = { + 0 * fac, + (Scalar)(8064.) * fac, + (Scalar)(-1008.) * fac, + (Scalar)(128.) * fac, + (Scalar)(-9.) * fac, + }; #endif #define MID (STENCIL_ORDER / 2) @@ -207,14 +231,14 @@ derxy(const int i, const int j, const int k, const Scalar* arr) Scalar pencil_a[STENCIL_ORDER + 1]; //#pragma unroll for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) - pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + offset - STENCIL_ORDER / 2, - k)]; + pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, // + j + offset - STENCIL_ORDER / 2, k)]; Scalar pencil_b[STENCIL_ORDER + 1]; //#pragma unroll for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset) - pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + STENCIL_ORDER / 2 - offset, - k)]; + pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, // + j + STENCIL_ORDER / 2 - offset, k)]; return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy)); } @@ -539,7 +563,8 @@ gradient_of_divergence(const VectorData vec) return (Vector){ hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2], hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2], - hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2]}; + hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2], + }; } // Takes uu gradients and returns S @@ -805,10 +830,11 @@ forcing(int3 globalVertexIdx, Scalar dt) getInt(AC_ny) * getReal(AC_dsy), getInt(AC_nz) * getReal(AC_dsz)}; // source (origin) (void)a; // WARNING: not used - Vector xx = (Vector){(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx), - (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy), - (globalVertexIdx.z - getInt(AC_nz_min)) * - getReal(AC_dsz)}; // sink (current index) + Vector xx = (Vector){ + (globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx), + (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy), + (globalVertexIdx.z - getInt(AC_nz_min)) * getReal(AC_dsz), + }; // sink (current index) const Scalar cs2 = getReal(AC_cs2_sound); const Scalar cs = sqrt(cs2);