Merge branch 'master' into alt_bcond_2020_09
This commit is contained in:
@@ -45,6 +45,7 @@ uniform Scalar AC_zorig;
|
|||||||
uniform Scalar AC_unit_density;
|
uniform Scalar AC_unit_density;
|
||||||
uniform Scalar AC_unit_velocity;
|
uniform Scalar AC_unit_velocity;
|
||||||
uniform Scalar AC_unit_length;
|
uniform Scalar AC_unit_length;
|
||||||
|
uniform Scalar AC_unit_magnetic;
|
||||||
// properties of gravitating star
|
// properties of gravitating star
|
||||||
uniform Scalar AC_star_pos_x;
|
uniform Scalar AC_star_pos_x;
|
||||||
uniform Scalar AC_star_pos_y;
|
uniform Scalar AC_star_pos_y;
|
||||||
|
@@ -716,7 +716,7 @@ external acdevicesynchronizestream
|
|||||||
fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_SCALARRAY_HANDLES = %d\n\n", enumcounter);
|
fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_SCALARRAY_HANDLES = %d\n\n", enumcounter);
|
||||||
|
|
||||||
// Streams
|
// Streams
|
||||||
const size_t nstreams = 20;
|
const size_t nstreams = 32;
|
||||||
for (size_t i = 0; i < nstreams; ++i) {
|
for (size_t i = 0; i < nstreams; ++i) {
|
||||||
fprintf(DSLHEADER, "#define STREAM_%lu (%lu)\n", i, i);
|
fprintf(DSLHEADER, "#define STREAM_%lu (%lu)\n", i, i);
|
||||||
fprintf(FHEADER, "integer(c_int), parameter :: STREAM_%lu = %lu\n", i, i);
|
fprintf(FHEADER, "integer(c_int), parameter :: STREAM_%lu = %lu\n", i, i);
|
||||||
|
@@ -298,6 +298,9 @@ Resets all devices on the current grid.
|
|||||||
*/
|
*/
|
||||||
AcResult acGridQuit(void);
|
AcResult acGridQuit(void);
|
||||||
|
|
||||||
|
/** Randomizes the local mesh */
|
||||||
|
AcResult acGridRandomize(void);
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
AcResult acGridSynchronizeStream(const Stream stream);
|
AcResult acGridSynchronizeStream(const Stream stream);
|
||||||
|
|
||||||
@@ -633,6 +636,9 @@ AcResult acUpdateBuiltinParams(AcMeshInfo* config);
|
|||||||
/** Creates a mesh stored in host memory */
|
/** Creates a mesh stored in host memory */
|
||||||
AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);
|
AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);
|
||||||
|
|
||||||
|
/** Randomizes a host mesh */
|
||||||
|
AcResult acMeshRandomize(AcMesh* mesh);
|
||||||
|
|
||||||
/** Destroys a mesh stored in host memory */
|
/** Destroys a mesh stored in host memory */
|
||||||
AcResult acMeshDestroy(AcMesh* mesh);
|
AcResult acMeshDestroy(AcMesh* mesh);
|
||||||
|
|
||||||
|
@@ -50,9 +50,6 @@ AcResult acVertexBufferSet(const VertexBufferHandle handle, const AcReal value,
|
|||||||
/** */
|
/** */
|
||||||
AcResult acMeshSet(const AcReal value, AcMesh* mesh);
|
AcResult acMeshSet(const AcReal value, AcMesh* mesh);
|
||||||
|
|
||||||
/** */
|
|
||||||
AcResult acMeshRandomize(AcMesh* mesh);
|
|
||||||
|
|
||||||
/** */
|
/** */
|
||||||
AcResult acMeshApplyPeriodicBounds(AcMesh* mesh);
|
AcResult acMeshApplyPeriodicBounds(AcMesh* mesh);
|
||||||
|
|
||||||
|
@@ -107,7 +107,7 @@ main(int argc, char** argv)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const TestType test = TEST_STRONG_SCALING;
|
const TestType test = TEST_WEAK_SCALING;
|
||||||
if (test == TEST_WEAK_SCALING) {
|
if (test == TEST_WEAK_SCALING) {
|
||||||
uint3_64 decomp = decompose(nprocs);
|
uint3_64 decomp = decompose(nprocs);
|
||||||
info.int_params[AC_nx] *= decomp.x;
|
info.int_params[AC_nx] *= decomp.x;
|
||||||
@@ -126,10 +126,15 @@ main(int argc, char** argv)
|
|||||||
|
|
||||||
// GPU alloc & compute
|
// GPU alloc & compute
|
||||||
acGridInit(info);
|
acGridInit(info);
|
||||||
|
acGridRandomize();
|
||||||
|
|
||||||
|
/*
|
||||||
AcMesh model;
|
AcMesh model;
|
||||||
acMeshCreate(info, &model);
|
acMeshCreate(info, &model);
|
||||||
acMeshRandomize(&model);
|
acMeshRandomize(&model);
|
||||||
acGridLoadMesh(STREAM_DEFAULT, model);
|
acGridLoadMesh(STREAM_DEFAULT, model);
|
||||||
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
acGridLoadMesh(STREAM_DEFAULT, model);
|
acGridLoadMesh(STREAM_DEFAULT, model);
|
||||||
|
|
||||||
@@ -154,7 +159,7 @@ main(int argc, char** argv)
|
|||||||
}*/
|
}*/
|
||||||
|
|
||||||
// Percentiles
|
// Percentiles
|
||||||
const size_t num_iters = 1000;
|
const size_t num_iters = 100;
|
||||||
const double nth_percentile = 0.90;
|
const double nth_percentile = 0.90;
|
||||||
std::vector<double> results; // ms
|
std::vector<double> results; // ms
|
||||||
results.reserve(num_iters);
|
results.reserve(num_iters);
|
||||||
|
@@ -17,32 +17,38 @@ main(void)
|
|||||||
|
|
||||||
// Boilerplate
|
// Boilerplate
|
||||||
fprintf(fp, "#!/bin/bash\n");
|
fprintf(fp, "#!/bin/bash\n");
|
||||||
fprintf(fp, "#BATCH --job-name=astaroth\n");
|
fprintf(fp, "#BATCH --job-name=astaroth\n"); // OK
|
||||||
fprintf(fp, "#SBATCH --account=project_2000403\n");
|
fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK
|
||||||
fprintf(fp, "#SBATCH --time=03:00:00\n");
|
fprintf(fp, "#SBATCH --time=04:00:00\n"); // OK
|
||||||
fprintf(fp, "#SBATCH --mem=32000\n");
|
fprintf(fp, "#SBATCH --mem=0\n"); // OK
|
||||||
fprintf(fp, "#SBATCH --partition=gpu\n");
|
fprintf(fp, "#SBATCH --partition=gpu\n"); // OK
|
||||||
|
fprintf(fp, "#SBATCH --exclusive\n"); // OK
|
||||||
|
fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // OK
|
||||||
fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs);
|
fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs);
|
||||||
|
// HACK: exclude misconfigured nodes on Puhti
|
||||||
|
fprintf(fp, "#SBATCH -x "
|
||||||
|
"r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n");
|
||||||
// fprintf(fp, "#SBATCH --cpus-per-task=10\n");
|
// fprintf(fp, "#SBATCH --cpus-per-task=10\n");
|
||||||
|
|
||||||
// nprocs, nodes, gpus
|
// nprocs, nodes, gpus
|
||||||
const int max_gpus_per_node = 4;
|
const int max_gpus_per_node = 4;
|
||||||
const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
|
const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
|
||||||
const int nodes = (int)ceil((double)nprocs / max_gpus_per_node);
|
const int nodes = (int)ceil((double)nprocs / max_gpus_per_node);
|
||||||
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
|
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK
|
||||||
fprintf(fp, "#SBATCH -n %d\n", nprocs);
|
fprintf(fp, "#SBATCH -n %d\n", nprocs); // OK
|
||||||
fprintf(fp, "#SBATCH -N %d\n", nodes);
|
fprintf(fp, "#SBATCH -N %d\n", nodes); // OK
|
||||||
// fprintf(fp, "#SBATCH --exclusive\n");
|
// fprintf(fp, "#SBATCH --exclusive\n");
|
||||||
if (nprocs >= 4)
|
// if (nprocs >= 4)
|
||||||
fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
|
// fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
|
||||||
|
|
||||||
// Modules
|
// Modules
|
||||||
// OpenMPI
|
// OpenMPI
|
||||||
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n");
|
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n");
|
||||||
//fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
|
// fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); //
|
||||||
//fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
|
// https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
|
||||||
// if (nprocs >= 32)
|
// if (nprocs >= 32)
|
||||||
// fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
|
// fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); //
|
||||||
|
// https://www.open-mpi.org/fa
|
||||||
|
|
||||||
// HPCX
|
// HPCX
|
||||||
// fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
|
// fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
|
||||||
@@ -52,7 +58,7 @@ main(void)
|
|||||||
// fprintf(fp, "mkdir -p profile_%d\n", nprocs);
|
// fprintf(fp, "mkdir -p profile_%d\n", nprocs);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
const int nx = 256; // max size 1792;
|
const int nx = 256; // max size 2048;
|
||||||
const int ny = nx;
|
const int ny = nx;
|
||||||
const int nz = nx;
|
const int nz = nx;
|
||||||
|
|
||||||
@@ -67,11 +73,11 @@ main(void)
|
|||||||
"benchmark_decomp_1D", "benchmark_decomp_2D", "benchmark_decomp_3D",
|
"benchmark_decomp_1D", "benchmark_decomp_2D", "benchmark_decomp_3D",
|
||||||
"benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm",
|
"benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm",
|
||||||
"benchmark_meshsize_256", "benchmark_meshsize_512", "benchmark_meshsize_1024",
|
"benchmark_meshsize_256", "benchmark_meshsize_512", "benchmark_meshsize_1024",
|
||||||
"benchmark_meshsize_1792", "benchmark_stencilord_2", "benchmark_stencilord_4",
|
"benchmark_meshsize_2048", "benchmark_stencilord_2", "benchmark_stencilord_4",
|
||||||
"benchmark_stencilord_6", "benchmark_stencilord_8", "benchmark_timings_control",
|
"benchmark_stencilord_6", "benchmark_stencilord_8", "benchmark_timings_control",
|
||||||
"benchmark_timings_comp", "benchmark_timings_comm", "benchmark_timings_default",
|
"benchmark_timings_comp", "benchmark_timings_comm", "benchmark_timings_default",
|
||||||
"benchmark_timings_corners", "benchmark_weak_128", "benchmark_weak_256",
|
"benchmark_timings_corners", "benchmark_weak_128", "benchmark_weak_256",
|
||||||
"benchmark_weak_448",
|
"benchmark_weak_512",
|
||||||
};
|
};
|
||||||
for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) {
|
for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) {
|
||||||
int nn = 256;
|
int nn = 256;
|
||||||
@@ -79,14 +85,32 @@ main(void)
|
|||||||
nn = 512;
|
nn = 512;
|
||||||
else if (strcmp(files[i], "benchmark_meshsize_1024") == 0)
|
else if (strcmp(files[i], "benchmark_meshsize_1024") == 0)
|
||||||
nn = 1024;
|
nn = 1024;
|
||||||
else if (strcmp(files[i], "benchmark_meshsize_1792") == 0)
|
else if (strcmp(files[i], "benchmark_meshsize_2048") == 0)
|
||||||
nn = 1792;
|
nn = 2048;
|
||||||
else if (strcmp(files[i], "benchmark_weak_128") == 0)
|
else if (strcmp(files[i], "benchmark_weak_128") == 0)
|
||||||
nn = 128;
|
nn = 128;
|
||||||
else if (strcmp(files[i], "benchmark_weak_448") == 0)
|
else if (strcmp(files[i], "benchmark_weak_512") == 0)
|
||||||
nn = 448;
|
nn = 512;
|
||||||
|
|
||||||
fprintf(fp, "$(cd %s && srun ./benchmark %d %d %d && cd ..)\n", files[i], nn, nn, nn);
|
// W/ Fredriks tunings
|
||||||
|
// (may cause Assertion `status == UCS_OK' failed errors)
|
||||||
|
// fprintf(fp,
|
||||||
|
// "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
|
||||||
|
// "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n",
|
||||||
|
// files[i], nn, nn, nn);
|
||||||
|
if (nodes >= 2) {
|
||||||
|
fprintf(fp,
|
||||||
|
"$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
|
||||||
|
"UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm "
|
||||||
|
"-f core.* && cd ..)\n",
|
||||||
|
files[i], nn, nn, nn);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fprintf(fp,
|
||||||
|
"$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* "
|
||||||
|
"&& cd ..)\n",
|
||||||
|
files[i], nn, nn, nn);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
|
@@ -43,7 +43,13 @@
|
|||||||
|
|
||||||
// NEED TO BE DEFINED HERE. IS NOT NOTICED BY compile_acc call.
|
// NEED TO BE DEFINED HERE. IS NOT NOTICED BY compile_acc call.
|
||||||
#define LFORCING (0)
|
#define LFORCING (0)
|
||||||
|
|
||||||
|
#ifdef VTXBUF_ACCRETION
|
||||||
|
#define LSINK (1)
|
||||||
|
#else
|
||||||
#define LSINK (0)
|
#define LSINK (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef BFIELDX
|
#ifdef BFIELDX
|
||||||
#define LBFIELD (1)
|
#define LBFIELD (1)
|
||||||
#else
|
#else
|
||||||
@@ -322,6 +328,7 @@ run_simulation(const char* config_path)
|
|||||||
// acmesh_init_to(INIT_TYPE_SIMPLE_CORE, mesh); //Initial condition for a collapse test
|
// acmesh_init_to(INIT_TYPE_SIMPLE_CORE, mesh); //Initial condition for a collapse test
|
||||||
|
|
||||||
#if LSINK
|
#if LSINK
|
||||||
|
printf("WARNING! Sink particle is under development. USE AT YOUR OWN RISK!")
|
||||||
vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh);
|
vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -388,18 +395,10 @@ run_simulation(const char* config_path)
|
|||||||
/* Step the simulation */
|
/* Step the simulation */
|
||||||
AcReal accreted_mass = 0.0;
|
AcReal accreted_mass = 0.0;
|
||||||
AcReal sink_mass = 0.0;
|
AcReal sink_mass = 0.0;
|
||||||
|
AcReal uu_freefall = 0.0;
|
||||||
AcReal dt_typical = 0.0;
|
AcReal dt_typical = 0.0;
|
||||||
int dtcounter = 0;
|
int dtcounter = 0;
|
||||||
for (int i = start_step + 1; i < max_steps; ++i) {
|
for (int i = start_step + 1; i < max_steps; ++i) {
|
||||||
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
|
||||||
#if LBFIELD
|
|
||||||
const AcReal vAmax = acReduceVecScal(RTYPE_ALFVEN_MAX, BFIELDX, BFIELDY, BFIELDZ, VTXBUF_LNRHO);
|
|
||||||
const AcReal uref = max(umax, vAmax);
|
|
||||||
const AcReal dt = host_timestep(uref, vAmax, mesh_info);
|
|
||||||
#else
|
|
||||||
const AcReal dt = host_timestep(umax, 0.0l, mesh_info);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if LSINK
|
#if LSINK
|
||||||
|
|
||||||
const AcReal sum_mass = acReduceScal(RTYPE_SUM, VTXBUF_ACCRETION);
|
const AcReal sum_mass = acReduceScal(RTYPE_SUM, VTXBUF_ACCRETION);
|
||||||
@@ -407,7 +406,7 @@ run_simulation(const char* config_path)
|
|||||||
sink_mass = 0.0;
|
sink_mass = 0.0;
|
||||||
sink_mass = mesh_info.real_params[AC_M_sink_init] + accreted_mass;
|
sink_mass = mesh_info.real_params[AC_M_sink_init] + accreted_mass;
|
||||||
acLoadDeviceConstant(AC_M_sink, sink_mass);
|
acLoadDeviceConstant(AC_M_sink, sink_mass);
|
||||||
vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh);
|
vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh); //TODO THIS IS A BUG! WILL ONLY SET HOST BUFFER 0!
|
||||||
|
|
||||||
int on_off_switch;
|
int on_off_switch;
|
||||||
if (i < 1) {
|
if (i < 1) {
|
||||||
@@ -417,11 +416,26 @@ run_simulation(const char* config_path)
|
|||||||
on_off_switch = 1;
|
on_off_switch = 1;
|
||||||
}
|
}
|
||||||
acLoadDeviceConstant(AC_switch_accretion, on_off_switch);
|
acLoadDeviceConstant(AC_switch_accretion, on_off_switch);
|
||||||
|
|
||||||
|
//Adjust courant condition for free fall velocity
|
||||||
|
const AcReal RR = mesh_info.real_params[AC_soft]*mesh_info.real_params[AC_soft];
|
||||||
|
const AcReal SQ2GM = sqrt(AcReal(2.0)*mesh_info.real_params[AC_G_const]*sink_mass);
|
||||||
|
uu_freefall = fabs(SQ2GM / sqrt(RR));
|
||||||
#else
|
#else
|
||||||
accreted_mass = -1.0;
|
accreted_mass = -1.0;
|
||||||
sink_mass = -1.0;
|
sink_mass = -1.0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ);
|
||||||
|
#if LBFIELD
|
||||||
|
const AcReal vAmax = acReduceVecScal(RTYPE_ALFVEN_MAX, BFIELDX, BFIELDY, BFIELDZ, VTXBUF_LNRHO);
|
||||||
|
const AcReal uref = max(max(umax,uu_freefall), vAmax);
|
||||||
|
const AcReal dt = host_timestep(uref, vAmax, mesh_info);
|
||||||
|
#else
|
||||||
|
const AcReal uref = max(umax,uu_freefall);
|
||||||
|
const AcReal dt = host_timestep(uref, 0.0l, mesh_info);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if LFORCING
|
#if LFORCING
|
||||||
const ForcingParams forcing_params = generateForcingParams(mesh_info);
|
const ForcingParams forcing_params = generateForcingParams(mesh_info);
|
||||||
loadForcingParamsToDevice(forcing_params);
|
loadForcingParamsToDevice(forcing_params);
|
||||||
|
64
scripts/buildtestcases.sh
Executable file
64
scripts/buildtestcases.sh
Executable file
@@ -0,0 +1,64 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Modules (!!!)
|
||||||
|
module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl
|
||||||
|
#module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl
|
||||||
|
#export UCX_MEMTYPE_CACHE=n # Workaround for bug in hpcx-mpi/2.5.0
|
||||||
|
|
||||||
|
load_default_case() {
|
||||||
|
# Pinned or RDMA
|
||||||
|
sed -i 's/#define MPI_USE_PINNED ([0-9]*)/#define MPI_USE_PINNED (0)/' src/core/device.cc
|
||||||
|
|
||||||
|
# Stencil order
|
||||||
|
sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' acc/stdlib/stdderiv.h
|
||||||
|
sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' include/astaroth.h
|
||||||
|
|
||||||
|
# Timings
|
||||||
|
sed -i 's/MPI_COMPUTE_ENABLED (.)/MPI_COMPUTE_ENABLED (1)/' src/core/device.cc
|
||||||
|
sed -i 's/MPI_COMM_ENABLED (.)/MPI_COMM_ENABLED (1)/' src/core/device.cc
|
||||||
|
sed -i 's/MPI_INCL_CORNERS (.)/MPI_INCL_CORNERS (0)/' src/core/device.cc
|
||||||
|
|
||||||
|
# Decomposition
|
||||||
|
sed -i 's/MPI_DECOMPOSITION_AXES (.)/MPI_DECOMPOSITION_AXES (3)/' src/core/device.cc
|
||||||
|
|
||||||
|
# Strong/Weak
|
||||||
|
sed -i 's/const TestType test = .*;/const TestType test = TEST_STRONG_SCALING;/' samples/benchmark/main.cc
|
||||||
|
|
||||||
|
# Num iters
|
||||||
|
sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 1000;/' samples/benchmark/main.cc
|
||||||
|
}
|
||||||
|
|
||||||
|
# $1 test name
|
||||||
|
# $2 grid size
|
||||||
|
create_case() {
|
||||||
|
DIR="benchmark_$1"
|
||||||
|
mkdir -p $DIR
|
||||||
|
cd $DIR
|
||||||
|
/users/pekkila/cmake/build/bin/cmake .. && make -j
|
||||||
|
cd ..
|
||||||
|
}
|
||||||
|
|
||||||
|
# Mesh size
|
||||||
|
load_default_case
|
||||||
|
create_case "meshsize_256"
|
||||||
|
sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 100;/' samples/benchmark/main.cc
|
||||||
|
create_case "meshsize_512"
|
||||||
|
create_case "meshsize_1024"
|
||||||
|
create_case "meshsize_2048"
|
||||||
|
|
||||||
|
# Weak scaling
|
||||||
|
load_default_case
|
||||||
|
sed -i 's/const TestType test = .*;/const TestType test = TEST_WEAK_SCALING;/' samples/benchmark/main.cc
|
||||||
|
create_case "weak_128"
|
||||||
|
create_case "weak_256"
|
||||||
|
sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 100;/' samples/benchmark/main.cc
|
||||||
|
create_case "weak_512"
|
||||||
|
|
||||||
|
# Run batch jobs
|
||||||
|
sbatch benchmark_meshsize_256/benchmark_1.sh
|
||||||
|
sbatch benchmark_meshsize_256/benchmark_2.sh
|
||||||
|
sbatch benchmark_meshsize_256/benchmark_4.sh
|
||||||
|
sbatch benchmark_meshsize_256/benchmark_8.sh
|
||||||
|
sbatch benchmark_meshsize_256/benchmark_16.sh
|
||||||
|
sbatch benchmark_meshsize_256/benchmark_32.sh
|
||||||
|
sbatch benchmark_meshsize_256/benchmark_64.sh
|
41
scripts/postprocess_benchmarks.sh
Executable file
41
scripts/postprocess_benchmarks.sh
Executable file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
OUTPUT=results.csv
|
||||||
|
rm -i $OUTPUT
|
||||||
|
|
||||||
|
# $1 input dir
|
||||||
|
process_input() {
|
||||||
|
echo $1
|
||||||
|
#cat $1/*.csv | sort -n
|
||||||
|
cat $1/*.csv | sort -k1n -k3n | awk '!a[$1]++'
|
||||||
|
echo ""
|
||||||
|
} >> $OUTPUT
|
||||||
|
|
||||||
|
process_input "benchmark_decomp_1D"
|
||||||
|
process_input "benchmark_decomp_2D"
|
||||||
|
process_input "benchmark_decomp_3D"
|
||||||
|
process_input "benchmark_decomp_1D_comm"
|
||||||
|
process_input "benchmark_decomp_2D_comm"
|
||||||
|
process_input "benchmark_decomp_3D_comm"
|
||||||
|
|
||||||
|
process_input "benchmark_meshsize_256"
|
||||||
|
process_input "benchmark_meshsize_512"
|
||||||
|
process_input "benchmark_meshsize_1024"
|
||||||
|
process_input "benchmark_meshsize_2048"
|
||||||
|
|
||||||
|
process_input "benchmark_stencilord_2"
|
||||||
|
process_input "benchmark_stencilord_4"
|
||||||
|
process_input "benchmark_stencilord_6"
|
||||||
|
process_input "benchmark_stencilord_8"
|
||||||
|
|
||||||
|
process_input "benchmark_timings_control"
|
||||||
|
process_input "benchmark_timings_comp"
|
||||||
|
process_input "benchmark_timings_comm"
|
||||||
|
process_input "benchmark_timings_default"
|
||||||
|
process_input "benchmark_timings_corners"
|
||||||
|
|
||||||
|
process_input "benchmark_weak_128"
|
||||||
|
process_input "benchmark_weak_256"
|
||||||
|
process_input "benchmark_weak_512"
|
||||||
|
|
||||||
|
cat $OUTPUT
|
@@ -234,6 +234,23 @@ acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
|
|||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static AcReal
|
||||||
|
randf(void)
|
||||||
|
{
|
||||||
|
return (AcReal)rand() / (AcReal)RAND_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
|
AcResult
|
||||||
|
acMeshRandomize(AcMesh* mesh)
|
||||||
|
{
|
||||||
|
const int n = acVertexBufferSize(mesh->info);
|
||||||
|
for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
|
||||||
|
for (int i = 0; i < n; ++i)
|
||||||
|
mesh->vertex_buffer[w][i] = randf();
|
||||||
|
|
||||||
|
return AC_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
acMeshDestroy(AcMesh* mesh)
|
acMeshDestroy(AcMesh* mesh)
|
||||||
{
|
{
|
||||||
|
@@ -14,7 +14,7 @@
|
|||||||
#define MPI_DECOMPOSITION_AXES (3)
|
#define MPI_DECOMPOSITION_AXES (3)
|
||||||
#define MPI_COMPUTE_ENABLED (1)
|
#define MPI_COMPUTE_ENABLED (1)
|
||||||
#define MPI_COMM_ENABLED (1)
|
#define MPI_COMM_ENABLED (1)
|
||||||
#define MPI_INCL_CORNERS (1)
|
#define MPI_INCL_CORNERS (0)
|
||||||
#define MPI_USE_PINNED (0) // Do inter-node comm with pinned memory
|
#define MPI_USE_PINNED (0) // Do inter-node comm with pinned memory
|
||||||
#define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
|
#define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
|
||||||
|
|
||||||
@@ -742,7 +742,7 @@ acCreatePackedDataHost(const int3 dims)
|
|||||||
data.dims = dims;
|
data.dims = dims;
|
||||||
|
|
||||||
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
|
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
|
||||||
data.data = (AcReal*)malloc(bytes);
|
data.data = (AcRealPacked*)malloc(bytes);
|
||||||
ERRCHK_ALWAYS(data.data);
|
ERRCHK_ALWAYS(data.data);
|
||||||
|
|
||||||
return data;
|
return data;
|
||||||
@@ -1153,8 +1153,13 @@ acTransferCommData(const Device device, //
|
|||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
|
|
||||||
MPI_Datatype datatype = MPI_FLOAT;
|
MPI_Datatype datatype = MPI_FLOAT;
|
||||||
if (sizeof(AcReal) == 8)
|
if (sizeof(data->srcs[0].data[0]) == 2) {
|
||||||
|
datatype = MPI_SHORT; // TODO CONFIRM THAT IS CORRECTLY CAST TO HALF
|
||||||
|
} else if (sizeof(data->srcs[0].data[0]) == 4) {
|
||||||
|
datatype = MPI_FLOAT;
|
||||||
|
} else {
|
||||||
datatype = MPI_DOUBLE;
|
datatype = MPI_DOUBLE;
|
||||||
|
}
|
||||||
|
|
||||||
int nprocs, pid;
|
int nprocs, pid;
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
@@ -1258,6 +1263,20 @@ acGridSynchronizeStream(const Stream stream)
|
|||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AcResult
|
||||||
|
acGridRandomize(void)
|
||||||
|
{
|
||||||
|
ERRCHK(grid.initialized);
|
||||||
|
|
||||||
|
AcMesh host;
|
||||||
|
acMeshCreate(grid.submesh.info, &host);
|
||||||
|
acMeshRandomize(&host);
|
||||||
|
acDeviceLoadMesh(grid.device, STREAM_DEFAULT, host);
|
||||||
|
acMeshDestroy(&host);
|
||||||
|
|
||||||
|
return AC_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
acGridInit(const AcMeshInfo info)
|
acGridInit(const AcMeshInfo info)
|
||||||
{
|
{
|
||||||
|
@@ -8,11 +8,13 @@
|
|||||||
#define MPI_GPUDIRECT_DISABLED (0)
|
#define MPI_GPUDIRECT_DISABLED (0)
|
||||||
#endif // AC_MPI_ENABLED
|
#endif // AC_MPI_ENABLED
|
||||||
|
|
||||||
|
typedef AcReal AcRealPacked;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int3 dims;
|
int3 dims;
|
||||||
AcReal* data;
|
AcRealPacked* data;
|
||||||
|
|
||||||
AcReal* data_pinned;
|
AcRealPacked* data_pinned;
|
||||||
bool pinned = false; // Set if data was received to pinned memory
|
bool pinned = false; // Set if data was received to pinned memory
|
||||||
} PackedData;
|
} PackedData;
|
||||||
|
|
||||||
|
@@ -38,23 +38,6 @@ acMeshSet(const AcReal value, AcMesh* mesh)
|
|||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AcReal
|
|
||||||
randf(void)
|
|
||||||
{
|
|
||||||
return (AcReal)rand() / (AcReal)RAND_MAX;
|
|
||||||
}
|
|
||||||
|
|
||||||
AcResult
|
|
||||||
acMeshRandomize(AcMesh* mesh)
|
|
||||||
{
|
|
||||||
const int n = acVertexBufferSize(mesh->info);
|
|
||||||
for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
|
|
||||||
for (int i = 0; i < n; ++i)
|
|
||||||
mesh->vertex_buffer[w][i] = randf();
|
|
||||||
|
|
||||||
return AC_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
acMeshApplyPeriodicBounds(AcMesh* mesh)
|
acMeshApplyPeriodicBounds(AcMesh* mesh)
|
||||||
{
|
{
|
||||||
|
@@ -53,10 +53,10 @@ acGetError(const AcReal model, const AcReal candidate)
|
|||||||
const long double e = floorl(logl(fabsl(error.model)) / logl(2));
|
const long double e = floorl(logl(fabsl(error.model)) / logl(2));
|
||||||
|
|
||||||
const long double ulp = powl(base, e - (p - 1));
|
const long double ulp = powl(base, e - (p - 1));
|
||||||
const long double machine_epsilon = 0.5 * powl(base, -(p - 1));
|
const long double machine_epsilon = 0.5l * powl(base, -(p - 1));
|
||||||
error.abs_error = fabsl(model - candidate);
|
error.abs_error = fabsl((long double)model - (long double)candidate);
|
||||||
error.ulp_error = error.abs_error / ulp;
|
error.ulp_error = error.abs_error / ulp;
|
||||||
error.rel_error = fabsl(1.0l - candidate / model) / machine_epsilon;
|
error.rel_error = fabsl(1.0l - (long double)candidate / (long double)model) / machine_epsilon;
|
||||||
}
|
}
|
||||||
|
|
||||||
error.maximum_magnitude = error.minimum_magnitude = 0;
|
error.maximum_magnitude = error.minimum_magnitude = 0;
|
||||||
|
Reference in New Issue
Block a user