Added the (hopefully final) basic test case used for the benchmarks
This commit is contained in:
@@ -56,11 +56,12 @@ morton3D(const uint64_t pid)
|
|||||||
{
|
{
|
||||||
uint64_t i, j, k;
|
uint64_t i, j, k;
|
||||||
i = j = k = 0;
|
i = j = k = 0;
|
||||||
|
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
const uint64_t mask = 0x1l << 3 * bit;
|
const uint64_t mask = 0x1l << 3 * bit;
|
||||||
i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
|
k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
|
||||||
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
|
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
|
||||||
k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
|
i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
return (uint3_64){i, j, k};
|
return (uint3_64){i, j, k};
|
||||||
@@ -174,7 +175,7 @@ main(int argc, char** argv)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// Percentiles
|
// Percentiles
|
||||||
const size_t num_iters = 100;
|
const size_t num_iters = 1000;
|
||||||
const double nth_percentile = 0.90;
|
const double nth_percentile = 0.90;
|
||||||
std::vector<double> results; // ms
|
std::vector<double> results; // ms
|
||||||
results.reserve(num_iters);
|
results.reserve(num_iters);
|
||||||
|
@@ -29,6 +29,7 @@ main(void)
|
|||||||
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
|
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
|
||||||
fprintf(fp, "#SBATCH -n %d\n", nprocs);
|
fprintf(fp, "#SBATCH -n %d\n", nprocs);
|
||||||
fprintf(fp, "#SBATCH -N %d\n", nodes);
|
fprintf(fp, "#SBATCH -N %d\n", nodes);
|
||||||
|
fprintf(fp, "#SBATCH --exclusive\n");
|
||||||
|
|
||||||
// Modules
|
// Modules
|
||||||
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
|
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
|
||||||
@@ -37,13 +38,13 @@ main(void)
|
|||||||
// Profile and run
|
// Profile and run
|
||||||
fprintf(fp, "mkdir -p profile_%d\n", nprocs);
|
fprintf(fp, "mkdir -p profile_%d\n", nprocs);
|
||||||
|
|
||||||
const int nx = 1792;
|
const int nx = 256; // max size 1792;
|
||||||
const int ny = nx;
|
const int ny = nx;
|
||||||
const int nz = nx;
|
const int nz = nx;
|
||||||
fprintf(fp,
|
fprintf(fp,
|
||||||
"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
|
//"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
|
||||||
"%d\n",
|
//"%d\n",
|
||||||
nprocs, nx, ny, nz);
|
"srun ./benchmark %d %d %d\n", nx, ny, nz);
|
||||||
|
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
}
|
}
|
||||||
|
@@ -527,6 +527,15 @@ morton3D(const uint64_t pid)
|
|||||||
i = j = k = 0;
|
i = j = k = 0;
|
||||||
|
|
||||||
if (DECOMPOSITION_AXES == 3) {
|
if (DECOMPOSITION_AXES == 3) {
|
||||||
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
|
const uint64_t mask = 0x1l << 3 * bit;
|
||||||
|
k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
|
||||||
|
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
|
||||||
|
i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
else if (DECOMPOSITION_AXES == 3) {
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
const uint64_t mask = 0x1l << 3 * bit;
|
const uint64_t mask = 0x1l << 3 * bit;
|
||||||
i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
|
i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
|
||||||
@@ -534,18 +543,19 @@ morton3D(const uint64_t pid)
|
|||||||
k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
|
k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
// Just a quick copy/paste for other decomp dims
|
// Just a quick copy/paste for other decomp dims
|
||||||
else if (DECOMPOSITION_AXES == 2) {
|
else if (DECOMPOSITION_AXES == 2) {
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
const uint64_t mask = 0x1l << 2 * bit;
|
const uint64_t mask = 0x1l << 2 * bit;
|
||||||
i |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
|
j |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
|
||||||
j |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
|
k |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (DECOMPOSITION_AXES == 1) {
|
else if (DECOMPOSITION_AXES == 1) {
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
const uint64_t mask = 0x1l << 1 * bit;
|
const uint64_t mask = 0x1l << 1 * bit;
|
||||||
i |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
|
k |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@@ -562,24 +572,33 @@ morton1D(const uint3_64 pid)
|
|||||||
uint64_t i = 0;
|
uint64_t i = 0;
|
||||||
|
|
||||||
if (DECOMPOSITION_AXES == 3) {
|
if (DECOMPOSITION_AXES == 3) {
|
||||||
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
|
const uint64_t mask = 0x1l << bit;
|
||||||
|
i |= ((pid.z & mask) << 0) << 2 * bit;
|
||||||
|
i |= ((pid.y & mask) << 1) << 2 * bit;
|
||||||
|
i |= ((pid.x & mask) << 2) << 2 * bit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
else if (DECOMPOSITION_AXES == 3) {
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
const uint64_t mask = 0x1l << bit;
|
const uint64_t mask = 0x1l << bit;
|
||||||
i |= ((pid.x & mask) << 0) << 2 * bit;
|
i |= ((pid.x & mask) << 0) << 2 * bit;
|
||||||
i |= ((pid.y & mask) << 1) << 2 * bit;
|
i |= ((pid.y & mask) << 1) << 2 * bit;
|
||||||
i |= ((pid.z & mask) << 2) << 2 * bit;
|
i |= ((pid.z & mask) << 2) << 2 * bit;
|
||||||
}
|
}
|
||||||
}
|
}*/
|
||||||
else if (DECOMPOSITION_AXES == 2) {
|
else if (DECOMPOSITION_AXES == 2) {
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
const uint64_t mask = 0x1l << bit;
|
const uint64_t mask = 0x1l << bit;
|
||||||
i |= ((pid.x & mask) << 0) << 1 * bit;
|
i |= ((pid.y & mask) << 0) << 1 * bit;
|
||||||
i |= ((pid.y & mask) << 1) << 1 * bit;
|
i |= ((pid.z & mask) << 1) << 1 * bit;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (DECOMPOSITION_AXES == 1) {
|
else if (DECOMPOSITION_AXES == 1) {
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
const uint64_t mask = 0x1l << bit;
|
const uint64_t mask = 0x1l << bit;
|
||||||
i |= ((pid.x & mask) << 0) << 0 * bit;
|
i |= ((pid.z & mask) << 0) << 0 * bit;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@@ -1204,6 +1223,8 @@ typedef struct {
|
|||||||
CommData sidexy_data;
|
CommData sidexy_data;
|
||||||
CommData sidexz_data;
|
CommData sidexz_data;
|
||||||
CommData sideyz_data;
|
CommData sideyz_data;
|
||||||
|
|
||||||
|
// int comm_cart;
|
||||||
} Grid;
|
} Grid;
|
||||||
|
|
||||||
static Grid grid = {};
|
static Grid grid = {};
|
||||||
@@ -1444,16 +1465,6 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
acPackCommData(device, sideyz_b0s, &sideyz_data);
|
acPackCommData(device, sideyz_b0s, &sideyz_data);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if MPI_COMPUTE_ENABLED
|
|
||||||
//////////// INNER INTEGRATION //////////////
|
|
||||||
{
|
|
||||||
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
|
|
||||||
const int3 m2 = nn;
|
|
||||||
acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
|
|
||||||
}
|
|
||||||
////////////////////////////////////////////
|
|
||||||
#endif // MPI_COMPUTE_ENABLED
|
|
||||||
|
|
||||||
#if MPI_COMM_ENABLED
|
#if MPI_COMM_ENABLED
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
|
||||||
@@ -1474,7 +1485,19 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
acTransferCommData(device, sidexy_b0s, &sidexy_data);
|
acTransferCommData(device, sidexy_b0s, &sidexy_data);
|
||||||
acTransferCommData(device, sidexz_b0s, &sidexz_data);
|
acTransferCommData(device, sidexz_b0s, &sidexz_data);
|
||||||
acTransferCommData(device, sideyz_b0s, &sideyz_data);
|
acTransferCommData(device, sideyz_b0s, &sideyz_data);
|
||||||
|
#endif // MPI_COMM_ENABLED
|
||||||
|
|
||||||
|
#if MPI_COMPUTE_ENABLED
|
||||||
|
//////////// INNER INTEGRATION //////////////
|
||||||
|
{
|
||||||
|
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
|
||||||
|
const int3 m2 = nn;
|
||||||
|
acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
|
||||||
|
}
|
||||||
|
////////////////////////////////////////////
|
||||||
|
#endif // MPI_COMPUTE_ENABLED
|
||||||
|
|
||||||
|
#if MPI_COMM_ENABLED
|
||||||
// acTransferCommDataWait(corner_data); // Do not rm: required for corners
|
// acTransferCommDataWait(corner_data); // Do not rm: required for corners
|
||||||
acTransferCommDataWait(edgex_data);
|
acTransferCommDataWait(edgex_data);
|
||||||
acTransferCommDataWait(edgey_data);
|
acTransferCommDataWait(edgey_data);
|
||||||
|
@@ -134,7 +134,7 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
|
|||||||
// RK3
|
// RK3
|
||||||
dim3 best_dims(0, 0, 0);
|
dim3 best_dims(0, 0, 0);
|
||||||
float best_time = INFINITY;
|
float best_time = INFINITY;
|
||||||
const int num_iterations = 5;
|
const int num_iterations = 10;
|
||||||
|
|
||||||
for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
|
for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
|
||||||
for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
|
for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
|
||||||
|
Reference in New Issue
Block a user