diff --git a/CMakeLists.txt b/CMakeLists.txt index 45c3a2b..04100bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ ## CMake settings # V3.9 required for first-class CUDA support # V3.17 required for the FindCUDAToolkit package -cmake_minimum_required(VERSION 3.17) +cmake_minimum_required(VERSION 3.17) find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH) find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH) @@ -10,7 +10,7 @@ project(astaroth C CXX CUDA) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) ## Project-wide compilation flags -set(COMMON_FLAGS "-mavx -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow") +set(COMMON_FLAGS "-mavx -Wall -Wextra -Wdouble-promotion -Wfloat-conversion -Wshadow") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") set(CMAKE_C_STANDARD 11) @@ -19,7 +19,7 @@ set(CMAKE_CXX_STANDARD 11) find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_... cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}") -set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow") +set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Wdouble-promotion,-Wfloat-conversion,-Wshadow") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}") diff --git a/config/astaroth.conf b/config/astaroth.conf index ccefc45..83e93d9 100644 --- a/config/astaroth.conf +++ b/config/astaroth.conf @@ -5,9 +5,9 @@ * "Compile-time" params * ============================================================================= */ -AC_nx = 512 -AC_ny = 512 -AC_nz = 512 +AC_nx = 256 +AC_ny = 256 +AC_nz = 256 AC_dsx = 0.04908738521 AC_dsy = 0.04908738521 @@ -24,11 +24,11 @@ AC_bin_steps = 1000 AC_bin_save_t = 1e666 // Set to 0 if you want to run the simulation from the beginning, or just a new -// simulation. If continuing from a saved step, specify the step number here. -AC_start_step = 0 +// simulation. If continuing from a saved step, specify the step number here. +AC_start_step = 0 // Maximum time in code units. If negative, there is no time limit -AC_max_time = -1.0 +AC_max_time = -1.0 // Hydro AC_cdt = 0.4 @@ -49,7 +49,7 @@ AC_forcing_magnitude = 1e-5 AC_kmin = 0.8 AC_kmax = 1.2 // Switches forcing off and accretion on -AC_switch_accretion = 0 +AC_switch_accretion = 0 // Entropy AC_cp_sound = 1.0 diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc index 5ab4349..dd14129 100644 --- a/samples/benchmark/main.cc +++ b/samples/benchmark/main.cc @@ -39,8 +39,46 @@ typedef enum { NUM_TESTS, } TestType; +#include + +typedef struct { + uint64_t x, y, z; +} uint3_64; + +static uint3_64 +operator+(const uint3_64& a, const uint3_64& b) +{ + return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z}; +} + +static uint3_64 +morton3D(const uint64_t pid) +{ + uint64_t i, j, k; + i = j = k = 0; + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << 3 * bit; + i |= ((pid & (mask << 0)) >> 2 * bit) >> 0; + j |= ((pid & (mask << 1)) >> 2 * bit) >> 1; + k |= ((pid & (mask << 2)) >> 2 * bit) >> 2; + } + + return (uint3_64){i, j, k}; +} + +static uint3_64 +decompose(const uint64_t target) +{ + // This is just so beautifully elegant. Complex and efficient decomposition + // in just one line of code. + uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1}; + + ERRCHK_ALWAYS(p.x * p.y * p.z == target); + return p; +} + int -main(void) +main(int argc, char** argv) { MPI_Init(NULL, NULL); int nprocs, pid; @@ -51,9 +89,30 @@ main(void) AcMeshInfo info; acLoadConfig(AC_DEFAULT_CONFIG, &info); + if (argc > 1) { + if (argc == 4) { + const int nx = atoi(argv[1]); + const int ny = atoi(argv[2]); + const int nz = atoi(argv[3]); + info.int_params[AC_nx] = nx; + info.int_params[AC_ny] = ny; + info.int_params[AC_nz] = nz; + acUpdateBuiltinParams(&info); + printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz); + } + else { + fprintf(stderr, "Could not parse arguments. Usage: ./benchmark .\n"); + exit(EXIT_FAILURE); + } + } + const TestType test = TEST_STRONG_SCALING; - if (test == TEST_WEAK_SCALING) - info.int_params[AC_nz] *= nprocs; + if (test == TEST_WEAK_SCALING) { + uint3_64 decomp = decompose(nprocs); + info.int_params[AC_nx] *= decomp.x; + info.int_params[AC_ny] *= decomp.y; + info.int_params[AC_nz] *= decomp.z; + } /* AcMesh model, candidate; diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c index 8d35ae9..6f160b3 100644 --- a/samples/genbenchmarkscripts/main.c +++ b/samples/genbenchmarkscripts/main.c @@ -36,8 +36,14 @@ main(void) // Profile and run fprintf(fp, "mkdir -p profile_%d\n", nprocs); - fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n", - nprocs); + + const int nx = 1792; + const int ny = nx; + const int nz = nx; + fprintf(fp, + "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d " + "%d\n", + nprocs, nx, ny, nz); fclose(fp); } diff --git a/src/core/device.cc b/src/core/device.cc index f473fc2..1e070cb 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -518,6 +518,78 @@ mod(const int a, const int b) return r < 0 ? r + b : r; } +#define DECOMPOSITION_AXES (3) + +static uint3_64 +morton3D(const uint64_t pid) +{ + uint64_t i, j, k; + i = j = k = 0; + + if (DECOMPOSITION_AXES == 3) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << 3 * bit; + i |= ((pid & (mask << 0)) >> 2 * bit) >> 0; + j |= ((pid & (mask << 1)) >> 2 * bit) >> 1; + k |= ((pid & (mask << 2)) >> 2 * bit) >> 2; + } + } + // Just a quick copy/paste for other decomp dims + else if (DECOMPOSITION_AXES == 2) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << 2 * bit; + i |= ((pid & (mask << 0)) >> 1 * bit) >> 0; + j |= ((pid & (mask << 1)) >> 1 * bit) >> 1; + } + } + else if (DECOMPOSITION_AXES == 1) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << 1 * bit; + i |= ((pid & (mask << 0)) >> 0 * bit) >> 0; + } + } + else { + fprintf(stderr, "Invalid DECOMPOSITION_AXES\n"); + ERRCHK_ALWAYS(0); + } + + return (uint3_64){i, j, k}; +} + +static uint64_t +morton1D(const uint3_64 pid) +{ + uint64_t i = 0; + + if (DECOMPOSITION_AXES == 3) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << bit; + i |= ((pid.x & mask) << 0) << 2 * bit; + i |= ((pid.y & mask) << 1) << 2 * bit; + i |= ((pid.z & mask) << 2) << 2 * bit; + } + } + else if (DECOMPOSITION_AXES == 2) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << bit; + i |= ((pid.x & mask) << 0) << 1 * bit; + i |= ((pid.y & mask) << 1) << 1 * bit; + } + } + else if (DECOMPOSITION_AXES == 1) { + for (int bit = 0; bit <= 21; ++bit) { + const uint64_t mask = 0x1l << bit; + i |= ((pid.x & mask) << 0) << 0 * bit; + } + } + else { + fprintf(stderr, "Invalid DECOMPOSITION_AXES\n"); + ERRCHK_ALWAYS(0); + } + + return i; +} +/* static uint3_64 morton3D(const uint64_t pid) { @@ -545,6 +617,7 @@ morton1D(const uint3_64 pid) } return i; } +*/ static uint3_64 decompose(const uint64_t target) @@ -1277,15 +1350,18 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh) return AC_SUCCESS; } +#define MPI_COMPUTE_ENABLED (1) +#define MPI_COMM_ENABLED (1) + AcResult acGridIntegrate(const Stream stream, const AcReal dt) { ERRCHK(grid.initialized); // acGridSynchronizeStream(stream); - const Device device = grid.device; - const int3 nn = grid.nn; - //CommData corner_data = grid.corner_data; // Do not rm: required for corners + const Device device = grid.device; + const int3 nn = grid.nn; + // CommData corner_data = grid.corner_data; // Do not rm: required for corners CommData edgex_data = grid.edgex_data; CommData edgey_data = grid.edgey_data; CommData edgez_data = grid.edgez_data; @@ -1357,6 +1433,8 @@ acGridIntegrate(const Stream stream, const AcReal dt) }; for (int isubstep = 0; isubstep < 3; ++isubstep) { + +#if MPI_COMM_ENABLED // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners acPackCommData(device, edgex_b0s, &edgex_data); acPackCommData(device, edgey_b0s, &edgey_data); @@ -1364,15 +1442,19 @@ acGridIntegrate(const Stream stream, const AcReal dt) acPackCommData(device, sidexy_b0s, &sidexy_data); acPackCommData(device, sidexz_b0s, &sidexz_data); acPackCommData(device, sideyz_b0s, &sideyz_data); +#endif +#if MPI_COMPUTE_ENABLED //////////// INNER INTEGRATION ////////////// { const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST}; const int3 m2 = nn; acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt); } - //////////////////////////////////////////// +//////////////////////////////////////////// +#endif // MPI_COMPUTE_ENABLED +#if MPI_COMM_ENABLED MPI_Barrier(MPI_COMM_WORLD); #if MPI_GPUDIRECT_DISABLED @@ -1436,6 +1518,8 @@ acGridIntegrate(const Stream stream, const AcReal dt) acSyncCommData(sidexy_data); acSyncCommData(sidexz_data); acSyncCommData(sideyz_data); +#endif // MPI_COMM_ENABLED +#if MPI_COMPUTE_ENABLED { // Front const int3 m1 = (int3){NGHOST, NGHOST, NGHOST}; const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST}; @@ -1466,6 +1550,7 @@ acGridIntegrate(const Stream stream, const AcReal dt) const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST}; acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt); } +#endif // MPI_COMPUTE_ENABLED acDeviceSwapBuffers(device); acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done //////////////////////////////////////////// diff --git a/src/core/kernels/integration.cuh b/src/core/kernels/integration.cuh index 8d66fd2..97326ad 100644 --- a/src/core/kernels/integration.cuh +++ b/src/core/kernels/integration.cuh @@ -41,10 +41,12 @@ static __device__ __forceinline__ AcReal3 rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current, const AcReal3 rate_of_change, const AcReal dt) { - return (AcReal3){ - rk3_integrate(state_previous.x, state_current.x, rate_of_change.x, dt), - rk3_integrate(state_previous.y, state_current.y, rate_of_change.y, dt), - rk3_integrate(state_previous.z, state_current.z, rate_of_change.z, dt)}; + return (AcReal3){rk3_integrate(state_previous.x, state_current.x, rate_of_change.x, + dt), + rk3_integrate(state_previous.y, state_current.y, rate_of_change.y, + dt), + rk3_integrate(state_previous.z, state_current.z, rate_of_change.z, + dt)}; } #define rk3(state_previous, state_current, rate_of_change, dt) \ @@ -132,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr // RK3 dim3 best_dims(0, 0, 0); float best_time = INFINITY; - const int num_iterations = 10; + const int num_iterations = 5; for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) { for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) { @@ -192,9 +194,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr } } #if VERBOSE_PRINTING - printf( - "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n", - best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations); + printf("Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f " + "ms\n", + best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations); #endif /* FILE* fp = fopen("../config/rk3_tbdims.cuh", "w"); @@ -204,6 +206,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr */ rk3_tpb = best_dims; + + // Failed to find valid thread block dimensions + ERRCHK_ALWAYS(rk3_tpb.x * rk3_tpb.y * rk3_tpb.z > 0); return AC_SUCCESS; }