Added the latest setup used for benchmarks

jpekkila
2020-06-04 20:47:03 +03:00
parent 0d80834619
commit 17a4f31451
6 changed files with 182 additions and 27 deletions

View File

@@ -10,7 +10,7 @@ project(astaroth C CXX CUDA)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
## Project-wide compilation flags
-set(COMMON_FLAGS "-mavx -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow")
+set(COMMON_FLAGS "-mavx -Wall -Wextra -Wdouble-promotion -Wfloat-conversion -Wshadow")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
set(CMAKE_C_STANDARD 11)
@@ -19,7 +19,7 @@ set(CMAKE_CXX_STANDARD 11)
find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_...
cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}")
-set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
+set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}")

View File

@@ -5,9 +5,9 @@
* "Compile-time" params * "Compile-time" params
* ============================================================================= * =============================================================================
*/ */
AC_nx = 512 AC_nx = 256
AC_ny = 512 AC_ny = 256
AC_nz = 512 AC_nz = 256
AC_dsx = 0.04908738521 AC_dsx = 0.04908738521
AC_dsy = 0.04908738521 AC_dsy = 0.04908738521

View File

@@ -39,8 +39,46 @@ typedef enum {
    NUM_TESTS,
} TestType;
+#include <stdint.h>
+typedef struct {
+    uint64_t x, y, z;
+} uint3_64;
+static uint3_64
+operator+(const uint3_64& a, const uint3_64& b)
+{
+    return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
+}
+static uint3_64
+morton3D(const uint64_t pid)
+{
+    uint64_t i, j, k;
+    i = j = k = 0;
+    for (int bit = 0; bit <= 21; ++bit) {
+        const uint64_t mask = 0x1l << 3 * bit;
+        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+    }
+    return (uint3_64){i, j, k};
+}
+static uint3_64
+decompose(const uint64_t target)
+{
+    // This is just so beautifully elegant. Complex and efficient decomposition
+    // in just one line of code.
+    uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
+    ERRCHK_ALWAYS(p.x * p.y * p.z == target);
+    return p;
+}
int
-main(void)
+main(int argc, char** argv)
{
    MPI_Init(NULL, NULL);
    int nprocs, pid;
@@ -51,9 +89,30 @@ main(void)
    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info);
+    if (argc > 1) {
+        if (argc == 4) {
+            const int nx = atoi(argv[1]);
+            const int ny = atoi(argv[2]);
+            const int nz = atoi(argv[3]);
+            info.int_params[AC_nx] = nx;
+            info.int_params[AC_ny] = ny;
+            info.int_params[AC_nz] = nz;
+            acUpdateBuiltinParams(&info);
+            printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz);
+        }
+        else {
+            fprintf(stderr, "Could not parse arguments. Usage: ./benchmark <nx> <ny> <nz>.\n");
+            exit(EXIT_FAILURE);
+        }
+    }
    const TestType test = TEST_STRONG_SCALING;
-    if (test == TEST_WEAK_SCALING)
-        info.int_params[AC_nz] *= nprocs;
+    if (test == TEST_WEAK_SCALING) {
+        uint3_64 decomp = decompose(nprocs);
+        info.int_params[AC_nx] *= decomp.x;
+        info.int_params[AC_ny] *= decomp.y;
+        info.int_params[AC_nz] *= decomp.z;
+    }
    /*
    AcMesh model, candidate;
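Note on the weak-scaling change above: with the 256^3 per-process mesh from the updated default config and, say, 64 ranks, decompose(64) gives a (4, 4, 4) process grid, so the global mesh grows to 1024^3; the previous code multiplied only AC_nz by nprocs, which would have produced a 256 x 256 x 16384 domain instead.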

View File

@@ -36,8 +36,14 @@ main(void)
    // Profile and run
    fprintf(fp, "mkdir -p profile_%d\n", nprocs);
-    fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
-            nprocs);
+    const int nx = 1792;
+    const int ny = nx;
+    const int nz = nx;
+    fprintf(fp,
+            "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
+            "%d\n",
+            nprocs, nx, ny, nz);
    fclose(fp);
}
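For illustration: with nprocs = 64 the generated script line reads `srun nvprof --annotate-mpi openmpi -o profile_64/%p.nvprof ./benchmark 1792 1792 1792`. The `%%p` in the format string is an escaped `%p`, which nvprof substitutes with the process ID at run time, so each MPI rank writes its profile to a separate file.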

View File

@@ -518,6 +518,78 @@ mod(const int a, const int b)
    return r < 0 ? r + b : r;
}
+#define DECOMPOSITION_AXES (3)
+static uint3_64
+morton3D(const uint64_t pid)
+{
+    uint64_t i, j, k;
+    i = j = k = 0;
+    if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 3 * bit;
+            i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
+            j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
+            k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
+        }
+    }
+    // Just a quick copy/paste for other decomp dims
+    else if (DECOMPOSITION_AXES == 2) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 2 * bit;
+            i |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
+            j |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
+        }
+    }
+    else if (DECOMPOSITION_AXES == 1) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << 1 * bit;
+            i |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
+        }
+    }
+    else {
+        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        ERRCHK_ALWAYS(0);
+    }
+    return (uint3_64){i, j, k};
+}
+static uint64_t
+morton1D(const uint3_64 pid)
+{
+    uint64_t i = 0;
+    if (DECOMPOSITION_AXES == 3) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.x & mask) << 0) << 2 * bit;
+            i |= ((pid.y & mask) << 1) << 2 * bit;
+            i |= ((pid.z & mask) << 2) << 2 * bit;
+        }
+    }
+    else if (DECOMPOSITION_AXES == 2) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.x & mask) << 0) << 1 * bit;
+            i |= ((pid.y & mask) << 1) << 1 * bit;
+        }
+    }
+    else if (DECOMPOSITION_AXES == 1) {
+        for (int bit = 0; bit <= 21; ++bit) {
+            const uint64_t mask = 0x1l << bit;
+            i |= ((pid.x & mask) << 0) << 0 * bit;
+        }
+    }
+    else {
+        fprintf(stderr, "Invalid DECOMPOSITION_AXES\n");
+        ERRCHK_ALWAYS(0);
+    }
+    return i;
+}
+/*
static uint3_64
morton3D(const uint64_t pid)
{
@@ -545,6 +617,7 @@ morton1D(const uint3_64 pid)
    }
    return i;
}
+*/
static uint3_64
decompose(const uint64_t target)
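As a sanity check, here is a small standalone sketch (not part of the commit) exercising the 3-axis pair above: it asserts that morton1D inverts morton3D and that the Morton-based decomposition (the same morton3D(nprocs - 1) plus one per axis construction used in the benchmark sample) yields the expected process grids, e.g. (2, 2, 2) for 8 ranks and (4, 4, 4) for 64. ERRCHK_ALWAYS is replaced by assert and the mask uses an unsigned 64-bit constant so the sketch builds on its own:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

struct uint3_64 {
    uint64_t x, y, z;
};

// 3-axis interleave: bit b of the rank id lands in bit b of i, j or k
static uint3_64
morton3D(const uint64_t pid)
{
    uint64_t i = 0, j = 0, k = 0;
    for (int bit = 0; bit <= 21; ++bit) {
        const uint64_t mask = uint64_t(1) << 3 * bit;
        i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
        k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
    }
    return {i, j, k};
}

// Inverse mapping: re-interleaves the per-axis bits back into a rank id
static uint64_t
morton1D(const uint3_64 pid)
{
    uint64_t i = 0;
    for (int bit = 0; bit <= 21; ++bit) {
        const uint64_t mask = uint64_t(1) << bit;
        i |= ((pid.x & mask) << 0) << 2 * bit;
        i |= ((pid.y & mask) << 1) << 2 * bit;
        i |= ((pid.z & mask) << 2) << 2 * bit;
    }
    return i;
}

// Same construction as decompose() in the benchmark sample
static uint3_64
decompose(const uint64_t nprocs)
{
    const uint3_64 last = morton3D(nprocs - 1);
    const uint3_64 p    = {last.x + 1, last.y + 1, last.z + 1};
    assert(p.x * p.y * p.z == nprocs);
    return p;
}

int
main(void)
{
    // morton1D must undo morton3D for any rank id that fits in 3 x 22 bits
    for (uint64_t pid = 0; pid < 4096; ++pid)
        assert(morton1D(morton3D(pid)) == pid);

    // Decomposition of power-of-two process counts
    for (const uint64_t nprocs : {1, 2, 8, 64, 512}) {
        const uint3_64 d = decompose(nprocs);
        printf("nprocs = %3llu -> (%llu, %llu, %llu)\n", (unsigned long long)nprocs,
               (unsigned long long)d.x, (unsigned long long)d.y, (unsigned long long)d.z);
    }
    return 0;
}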
@@ -1277,6 +1350,9 @@ acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
    return AC_SUCCESS;
}
+#define MPI_COMPUTE_ENABLED (1)
+#define MPI_COMM_ENABLED (1)
AcResult
acGridIntegrate(const Stream stream, const AcReal dt)
{
@@ -1285,7 +1361,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    const Device device = grid.device;
    const int3 nn = grid.nn;
-    //CommData corner_data = grid.corner_data; // Do not rm: required for corners
+    // CommData corner_data = grid.corner_data; // Do not rm: required for corners
    CommData edgex_data = grid.edgex_data;
    CommData edgey_data = grid.edgey_data;
    CommData edgez_data = grid.edgez_data;
@@ -1357,6 +1433,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
    };
    for (int isubstep = 0; isubstep < 3; ++isubstep) {
+#if MPI_COMM_ENABLED
        // acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
        acPackCommData(device, edgex_b0s, &edgex_data);
        acPackCommData(device, edgey_b0s, &edgey_data);
@@ -1364,15 +1442,19 @@ acGridIntegrate(const Stream stream, const AcReal dt)
        acPackCommData(device, sidexy_b0s, &sidexy_data);
        acPackCommData(device, sidexz_b0s, &sidexz_data);
        acPackCommData(device, sideyz_b0s, &sideyz_data);
+#endif
+#if MPI_COMPUTE_ENABLED
        //////////// INNER INTEGRATION //////////////
        {
            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
            const int3 m2 = nn;
            acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
        }
        ////////////////////////////////////////////
+#endif // MPI_COMPUTE_ENABLED
+#if MPI_COMM_ENABLED
        MPI_Barrier(MPI_COMM_WORLD);
#if MPI_GPUDIRECT_DISABLED
@@ -1436,6 +1518,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
        acSyncCommData(sidexy_data);
        acSyncCommData(sidexz_data);
        acSyncCommData(sideyz_data);
+#endif // MPI_COMM_ENABLED
+#if MPI_COMPUTE_ENABLED
        { // Front
            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
            const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
@@ -1466,6 +1550,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
            const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
            acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
        }
+#endif // MPI_COMPUTE_ENABLED
        acDeviceSwapBuffers(device);
        acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
        ////////////////////////////////////////////
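The two new switches appear intended for isolating the benchmark's phases: building with MPI_COMPUTE_ENABLED set to 0 drops the inner and outer integration substeps and leaves only the packing, exchange and synchronization calls in the loop, while MPI_COMM_ENABLED set to 0 does the opposite, so communication and computation can be timed separately.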

View File

@@ -41,10 +41,12 @@ static __device__ __forceinline__ AcReal3
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
              const AcReal3 rate_of_change, const AcReal dt)
{
-    return (AcReal3){
-        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
-        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
-        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
+    return (AcReal3){rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x,
+                                                dt),
+                     rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y,
+                                                dt),
+                     rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z,
+                                                dt)};
}
#define rk3(state_previous, state_current, rate_of_change, dt) \
@@ -132,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
    // RK3
    dim3 best_dims(0, 0, 0);
    float best_time = INFINITY;
-    const int num_iterations = 10;
+    const int num_iterations = 5;
    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
@@ -192,8 +194,8 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
        }
    }
#if VERBOSE_PRINTING
-    printf(
-        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
+    printf("Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f "
+           "ms\n",
           best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
    /*
@@ -204,6 +206,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
    */
    rk3_tpb = best_dims;
+    // Failed to find valid thread block dimensions
+    ERRCHK_ALWAYS(rk3_tpb.x * rk3_tpb.y * rk3_tpb.z > 0);
    return AC_SUCCESS;
} }
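For context, a minimal standalone sketch (not part of the commit) of the same autotune-by-measurement pattern: candidate launch configurations are timed with CUDA events over a fixed number of iterations, the fastest valid one is kept, and a final check fails loudly if no candidate worked, which is what the added ERRCHK_ALWAYS guards against. The kernel and the power-of-two candidate list are hypothetical stand-ins, not Astaroth code:

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

__global__ void
dummy_kernel(float* data, const int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] = sqrtf(data[i]) + 1.0f;
}

int
main(void)
{
    const int n = 1 << 20;
    float* data;
    cudaMalloc((void**)&data, n * sizeof(float));

    const int num_iterations = 5;
    int best_tpb             = 0;
    float best_time          = INFINITY;

    for (int tpb = 32; tpb <= 1024; tpb *= 2) {
        cudaEvent_t tstart, tstop;
        cudaEventCreate(&tstart);
        cudaEventCreate(&tstop);

        cudaEventRecord(tstart); // Start timing on the default stream
        for (int i = 0; i < num_iterations; ++i)
            dummy_kernel<<<(n + tpb - 1) / tpb, tpb>>>(data, n);
        cudaEventRecord(tstop);
        cudaEventSynchronize(tstop);

        float ms = 0;
        cudaEventElapsedTime(&ms, tstart, tstop);
        if (cudaGetLastError() == cudaSuccess && ms < best_time) {
            best_time = ms;
            best_tpb  = tpb;
        }
        cudaEventDestroy(tstart);
        cudaEventDestroy(tstop);
    }

    // Mirrors the added ERRCHK_ALWAYS: fail hard if no configuration was valid
    if (best_tpb == 0) {
        fprintf(stderr, "No valid thread block size found\n");
        return EXIT_FAILURE;
    }
    printf("Best block size: %d (%f ms per iteration)\n", best_tpb, best_time / num_iterations);
    cudaFree(data);
    return EXIT_SUCCESS;
}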