diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac
index c26a5f7..4b0f9bc 100644
--- a/acc/mhd_solver/stencil_kernel.ac
+++ b/acc/mhd_solver/stencil_kernel.ac
@@ -45,6 +45,7 @@ uniform Scalar AC_zorig;
 uniform Scalar AC_unit_density;
 uniform Scalar AC_unit_velocity;
 uniform Scalar AC_unit_length;
+uniform Scalar AC_unit_magnetic;
 // properties of gravitating star
 uniform Scalar AC_star_pos_x;
 uniform Scalar AC_star_pos_y;
diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c
index 017591a..b1de527 100644
--- a/acc/src/code_generator.c
+++ b/acc/src/code_generator.c
@@ -716,7 +716,7 @@ external acdevicesynchronizestream
     fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_SCALARRAY_HANDLES = %d\n\n", enumcounter);

     // Streams
-    const size_t nstreams = 20;
+    const size_t nstreams = 32;
     for (size_t i = 0; i < nstreams; ++i) {
         fprintf(DSLHEADER, "#define STREAM_%lu (%lu)\n", i, i);
         fprintf(FHEADER, "integer(c_int), parameter :: STREAM_%lu = %lu\n", i, i);
diff --git a/include/astaroth.h b/include/astaroth.h
index 512ecd1..07b4d61 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -298,6 +298,9 @@ Resets all devices on the current grid.
 */
 AcResult acGridQuit(void);

+/** Randomizes the local mesh */
+AcResult acGridRandomize(void);
+
 /** */
 AcResult acGridSynchronizeStream(const Stream stream);

@@ -633,6 +636,9 @@ AcResult acUpdateBuiltinParams(AcMeshInfo* config);

 /** Creates a mesh stored in host memory */
 AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);

+/** Randomizes a host mesh */
+AcResult acMeshRandomize(AcMesh* mesh);
+
 /** Destroys a mesh stored in host memory */
 AcResult acMeshDestroy(AcMesh* mesh);
diff --git a/include/astaroth_utils.h b/include/astaroth_utils.h
index f742f73..4289c1b 100644
--- a/include/astaroth_utils.h
+++ b/include/astaroth_utils.h
@@ -50,9 +50,6 @@ AcResult acVertexBufferSet(const VertexBufferHandle handle, const AcReal value,
 /** */
 AcResult acMeshSet(const AcReal value, AcMesh* mesh);

-/** */
-AcResult acMeshRandomize(AcMesh* mesh);
-
 /** */
 AcResult acMeshApplyPeriodicBounds(AcMesh* mesh);

diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc
index f205b04..76223bb 100644
--- a/samples/benchmark/main.cc
+++ b/samples/benchmark/main.cc
@@ -107,7 +107,7 @@ main(int argc, char** argv)
         }
     }

-    const TestType test = TEST_STRONG_SCALING;
+    const TestType test = TEST_WEAK_SCALING;
     if (test == TEST_WEAK_SCALING) {
         uint3_64 decomp = decompose(nprocs);
         info.int_params[AC_nx] *= decomp.x;
@@ -126,10 +126,15 @@ main(int argc, char** argv)

     // GPU alloc & compute
     acGridInit(info);
+    acGridRandomize();
+
+    /*
     AcMesh model;
     acMeshCreate(info, &model);
     acMeshRandomize(&model);
     acGridLoadMesh(STREAM_DEFAULT, model);
+    */
+
     /*
     acGridLoadMesh(STREAM_DEFAULT, model);

@@ -154,7 +159,7 @@ main(int argc, char** argv)
     }*/

     // Percentiles
-    const size_t num_iters = 1000;
+    const size_t num_iters = 100;
     const double nth_percentile = 0.90;
     std::vector results; // ms
     results.reserve(num_iters);
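For reference, the acGridRandomize() entry point declared in include/astaroth.h above, and implemented in src/core/device.cc further down in this patch, replaces the host-side acMeshCreate/acMeshRandomize/acGridLoadMesh sequence that samples/benchmark/main.cc now comments out. The following is a minimal usage sketch, not code from the patch; it assumes MPI has already been initialized and that `info` holds a fully populated AcMeshInfo.

    /* Sketch only: error checking omitted, `info` is assumed to be set up by the caller. */
    #include <mpi.h>
    #include "astaroth.h"

    static void
    init_random_grid(const AcMeshInfo info)
    {
        acGridInit(info);  // decompose the mesh and allocate the per-device submeshes
        acGridRandomize(); // randomize the local submesh (host-side randomize + load internally)

        /* Old host-side path, no longer needed:
        AcMesh model;
        acMeshCreate(info, &model);
        acMeshRandomize(&model);
        acGridLoadMesh(STREAM_DEFAULT, model);
        acMeshDestroy(&model);
        */

        acGridSynchronizeStream(STREAM_DEFAULT); // make sure the data is in place before timing
    }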
diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c
index 7be0872..d7b953b 100644
--- a/samples/genbenchmarkscripts/main.c
+++ b/samples/genbenchmarkscripts/main.c
@@ -17,42 +17,48 @@ main(void)
         // Boilerplate
         fprintf(fp, "#!/bin/bash\n");
-        fprintf(fp, "#BATCH --job-name=astaroth\n");
-        fprintf(fp, "#SBATCH --account=project_2000403\n");
-        fprintf(fp, "#SBATCH --time=03:00:00\n");
-        fprintf(fp, "#SBATCH --mem=32000\n");
-        fprintf(fp, "#SBATCH --partition=gpu\n");
+        fprintf(fp, "#BATCH --job-name=astaroth\n");        // OK
+        fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK
+        fprintf(fp, "#SBATCH --time=04:00:00\n");           // OK
+        fprintf(fp, "#SBATCH --mem=0\n");                   // OK
+        fprintf(fp, "#SBATCH --partition=gpu\n");           // OK
+        fprintf(fp, "#SBATCH --exclusive\n");               // OK
+        fprintf(fp, "#SBATCH --cpus-per-task=10\n");        // OK
         fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs);
+        // HACK: exclude misconfigured nodes on Puhti
+        fprintf(fp, "#SBATCH -x "
+                    "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n");
         // fprintf(fp, "#SBATCH --cpus-per-task=10\n");

         // nprocs, nodes, gpus
         const int max_gpus_per_node = 4;
         const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
         const int nodes = (int)ceil((double)nprocs / max_gpus_per_node);
-        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
-        fprintf(fp, "#SBATCH -n %d\n", nprocs);
-        fprintf(fp, "#SBATCH -N %d\n", nodes);
+        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK
+        fprintf(fp, "#SBATCH -n %d\n", nprocs);                     // OK
+        fprintf(fp, "#SBATCH -N %d\n", nodes);                      // OK
         // fprintf(fp, "#SBATCH --exclusive\n");
-        if (nprocs >= 4)
-            fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
+        // if (nprocs >= 4)
+        //     fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");

         // Modules
         // OpenMPI
         fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n");
-        //fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
-        //fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
-        //if (nprocs >= 32)
-        //    fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa
+        // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); //
+        // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n");
+        // if (nprocs >= 32)
+        //     fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); //
+        // https://www.open-mpi.org/fa

         // HPCX
-        //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
-        //fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0
+        // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
+        // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0

         // Profile and run
         // fprintf(fp, "mkdir -p profile_%d\n", nprocs);
         /*
-        const int nx = 256; // max size 1792;
+        const int nx = 256; // max size 2048;
         const int ny = nx;
         const int nz = nx;
@@ -67,11 +73,11 @@ main(void)
            "benchmark_decomp_1D",      "benchmark_decomp_2D",      "benchmark_decomp_3D",
            "benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm",
            "benchmark_meshsize_256",   "benchmark_meshsize_512",   "benchmark_meshsize_1024",
-           "benchmark_meshsize_1792",  "benchmark_stencilord_2",   "benchmark_stencilord_4",
+           "benchmark_meshsize_2048",  "benchmark_stencilord_2",   "benchmark_stencilord_4",
            "benchmark_stencilord_6",   "benchmark_stencilord_8",   "benchmark_timings_control",
            "benchmark_timings_comp",   "benchmark_timings_comm",   "benchmark_timings_default",
            "benchmark_timings_corners", "benchmark_weak_128",      "benchmark_weak_256",
-           "benchmark_weak_448",
+           "benchmark_weak_512",
        };
        for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) {
            int nn = 256;
@@ -79,14 +85,32 @@ main(void)
                nn = 512;
            else if (strcmp(files[i], "benchmark_meshsize_1024") == 0)
                nn = 1024;
-           else if (strcmp(files[i], "benchmark_meshsize_1792") == 0)
-               nn = 1792;
+           else if (strcmp(files[i], "benchmark_meshsize_2048") == 0)
+               nn = 2048;
            else if (strcmp(files[i], "benchmark_weak_128") == 0)
                nn = 128;
-           else if (strcmp(files[i], "benchmark_weak_448") == 0)
-               nn = 448;
+           else if (strcmp(files[i], "benchmark_weak_512") == 0)
+               nn = 512;

-           fprintf(fp, "$(cd %s && srun ./benchmark %d %d %d && cd ..)\n", files[i], nn, nn, nn);
+           // W/ Fredriks tunings
+           // (may cause Assertion `status == UCS_OK' failed errors)
+           // fprintf(fp,
+           //         "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
+           //         "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n",
+           //         files[i], nn, nn, nn);
+           if (nodes >= 2) {
+               fprintf(fp,
+                       "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy "
+                       "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm "
+                       "-f core.* && cd ..)\n",
+                       files[i], nn, nn, nn);
+           }
+           else {
+               fprintf(fp,
+                       "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* "
+                       "&& cd ..)\n",
+                       files[i], nn, nn, nn);
+           }
        }

        fclose(fp);
"benchmark_meshsize_2048") == 0) + nn = 2048; else if (strcmp(files[i], "benchmark_weak_128") == 0) nn = 128; - else if (strcmp(files[i], "benchmark_weak_448") == 0) - nn = 448; + else if (strcmp(files[i], "benchmark_weak_512") == 0) + nn = 512; - fprintf(fp, "$(cd %s && srun ./benchmark %d %d %d && cd ..)\n", files[i], nn, nn, nn); + // W/ Fredriks tunings + // (may cause Assertion `status == UCS_OK' failed errors) + // fprintf(fp, + // "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " + // "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n", + // files[i], nn, nn, nn); + if (nodes >= 2) { + fprintf(fp, + "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " + "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm " + "-f core.* && cd ..)\n", + files[i], nn, nn, nn); + } + else { + fprintf(fp, + "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* " + "&& cd ..)\n", + files[i], nn, nn, nn); + } } fclose(fp); diff --git a/samples/standalone/simulation.cc b/samples/standalone/simulation.cc index 1aa5bf2..8fd0d36 100644 --- a/samples/standalone/simulation.cc +++ b/samples/standalone/simulation.cc @@ -43,7 +43,13 @@ // NEED TO BE DEFINED HERE. IS NOT NOTICED BY compile_acc call. #define LFORCING (0) + +#ifdef VTXBUF_ACCRETION +#define LSINK (1) +#else #define LSINK (0) +#endif + #ifdef BFIELDX #define LBFIELD (1) #else @@ -322,6 +328,7 @@ run_simulation(const char* config_path) // acmesh_init_to(INIT_TYPE_SIMPLE_CORE, mesh); //Initial condition for a collapse test #if LSINK + printf("WARNING! Sink particle is under development. USE AT YOUR OWN RISK!") vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh); #endif @@ -388,18 +395,10 @@ run_simulation(const char* config_path) /* Step the simulation */ AcReal accreted_mass = 0.0; AcReal sink_mass = 0.0; + AcReal uu_freefall = 0.0; AcReal dt_typical = 0.0; int dtcounter = 0; for (int i = start_step + 1; i < max_steps; ++i) { - const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); -#if LBFIELD - const AcReal vAmax = acReduceVecScal(RTYPE_ALFVEN_MAX, BFIELDX, BFIELDY, BFIELDZ, VTXBUF_LNRHO); - const AcReal uref = max(umax, vAmax); - const AcReal dt = host_timestep(uref, vAmax, mesh_info); -#else - const AcReal dt = host_timestep(umax, 0.0l, mesh_info); -#endif - #if LSINK const AcReal sum_mass = acReduceScal(RTYPE_SUM, VTXBUF_ACCRETION); @@ -407,7 +406,7 @@ run_simulation(const char* config_path) sink_mass = 0.0; sink_mass = mesh_info.real_params[AC_M_sink_init] + accreted_mass; acLoadDeviceConstant(AC_M_sink, sink_mass); - vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh); + vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh); //TODO THIS IS A BUG! WILL ONLY SET HOST BUFFER 0! 
diff --git a/scripts/buildtestcases.sh b/scripts/buildtestcases.sh
new file mode 100755
index 0000000..89a6b02
--- /dev/null
+++ b/scripts/buildtestcases.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Modules (!!!)
+module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl
+#module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl
+#export UCX_MEMTYPE_CACHE=n # Workaround for bug in hpcx-mpi/2.5.0
+
+load_default_case() {
+    # Pinned or RDMA
+    sed -i 's/#define MPI_USE_PINNED ([0-9]*)/#define MPI_USE_PINNED (0)/' src/core/device.cc
+
+    # Stencil order
+    sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' acc/stdlib/stdderiv.h
+    sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' include/astaroth.h
+
+    # Timings
+    sed -i 's/MPI_COMPUTE_ENABLED (.)/MPI_COMPUTE_ENABLED (1)/' src/core/device.cc
+    sed -i 's/MPI_COMM_ENABLED (.)/MPI_COMM_ENABLED (1)/' src/core/device.cc
+    sed -i 's/MPI_INCL_CORNERS (.)/MPI_INCL_CORNERS (0)/' src/core/device.cc
+
+    # Decomposition
+    sed -i 's/MPI_DECOMPOSITION_AXES (.)/MPI_DECOMPOSITION_AXES (3)/' src/core/device.cc
+
+    # Strong/Weak
+    sed -i 's/const TestType test = .*;/const TestType test = TEST_STRONG_SCALING;/' samples/benchmark/main.cc
+
+    # Num iters
+    sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 1000;/' samples/benchmark/main.cc
+}
+
+# $1 test name
+# $2 grid size
+create_case() {
+    DIR="benchmark_$1"
+    mkdir -p $DIR
+    cd $DIR
+    /users/pekkila/cmake/build/bin/cmake .. && make -j
+    cd ..
+}
+
+# Mesh size
+load_default_case
+create_case "meshsize_256"
+sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 100;/' samples/benchmark/main.cc
+create_case "meshsize_512"
+create_case "meshsize_1024"
+create_case "meshsize_2048"
+
+# Weak scaling
+load_default_case
+sed -i 's/const TestType test = .*;/const TestType test = TEST_WEAK_SCALING;/' samples/benchmark/main.cc
+create_case "weak_128"
+create_case "weak_256"
+sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 100;/' samples/benchmark/main.cc
+create_case "weak_512"
+
+# Run batch jobs
+sbatch benchmark_meshsize_256/benchmark_1.sh
+sbatch benchmark_meshsize_256/benchmark_2.sh
+sbatch benchmark_meshsize_256/benchmark_4.sh
+sbatch benchmark_meshsize_256/benchmark_8.sh
+sbatch benchmark_meshsize_256/benchmark_16.sh
+sbatch benchmark_meshsize_256/benchmark_32.sh
+sbatch benchmark_meshsize_256/benchmark_64.sh
diff --git a/scripts/postprocess_benchmarks.sh b/scripts/postprocess_benchmarks.sh
new file mode 100755
index 0000000..7a60884
--- /dev/null
+++ b/scripts/postprocess_benchmarks.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+OUTPUT=results.csv
+rm -i $OUTPUT
+
+# $1 input dir
+process_input() {
+    echo $1
+    #cat $1/*.csv | sort -n
+    cat $1/*.csv | sort -k1n -k3n | awk '!a[$1]++'
+    echo ""
+} >> $OUTPUT
+
+process_input "benchmark_decomp_1D"
+process_input "benchmark_decomp_2D"
+process_input "benchmark_decomp_3D"
+process_input "benchmark_decomp_1D_comm"
+process_input "benchmark_decomp_2D_comm"
+process_input "benchmark_decomp_3D_comm"
+
+process_input "benchmark_meshsize_256"
+process_input "benchmark_meshsize_512"
+process_input "benchmark_meshsize_1024"
+process_input "benchmark_meshsize_2048"
+
+process_input "benchmark_stencilord_2"
+process_input "benchmark_stencilord_4"
+process_input "benchmark_stencilord_6"
+process_input "benchmark_stencilord_8"
+
+process_input "benchmark_timings_control"
+process_input "benchmark_timings_comp"
+process_input "benchmark_timings_comm"
+process_input "benchmark_timings_default"
+process_input "benchmark_timings_corners"
+
+process_input "benchmark_weak_128"
+process_input "benchmark_weak_256"
+process_input "benchmark_weak_512"
+
+cat $OUTPUT
diff --git a/src/core/astaroth.cc b/src/core/astaroth.cc
index 086e512..2123224 100644
--- a/src/core/astaroth.cc
+++ b/src/core/astaroth.cc
@@ -234,6 +234,23 @@ acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
     return AC_SUCCESS;
 }

+static AcReal
+randf(void)
+{
+    return (AcReal)rand() / (AcReal)RAND_MAX;
+}
+
+AcResult
+acMeshRandomize(AcMesh* mesh)
+{
+    const int n = acVertexBufferSize(mesh->info);
+    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
+        for (int i = 0; i < n; ++i)
+            mesh->vertex_buffer[w][i] = randf();
+
+    return AC_SUCCESS;
+}
+
 AcResult
 acMeshDestroy(AcMesh* mesh)
 {
diff --git a/src/core/device.cc b/src/core/device.cc
index 0d2127e..c1bb43f 100644
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -14,7 +14,7 @@
 #define MPI_DECOMPOSITION_AXES (3)
 #define MPI_COMPUTE_ENABLED (1)
 #define MPI_COMM_ENABLED (1)
-#define MPI_INCL_CORNERS (1)
+#define MPI_INCL_CORNERS (0)
 #define MPI_USE_PINNED (0)              // Do inter-node comm with pinned memory
 #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost

@@ -742,7 +742,7 @@ acCreatePackedDataHost(const int3 dims)
     data.dims = dims;

     const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
-    data.data = (AcReal*)malloc(bytes);
+    data.data = (AcRealPacked*)malloc(bytes);
     ERRCHK_ALWAYS(data.data);

     return data;
@@ -1153,8 +1153,13 @@ acTransferCommData(const Device device, //
     cudaSetDevice(device->id);

     MPI_Datatype datatype = MPI_FLOAT;
-    if (sizeof(AcReal) == 8)
+    if (sizeof(data->srcs[0].data[0]) == 2) {
+        datatype = MPI_SHORT; // TODO CONFIRM THAT IS CORRECTLY CAST TO HALF
+    } else if (sizeof(data->srcs[0].data[0]) == 4) {
+        datatype = MPI_FLOAT;
+    } else {
         datatype = MPI_DOUBLE;
+    }

     int nprocs, pid;
     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -1258,6 +1263,20 @@ acGridSynchronizeStream(const Stream stream)
     return AC_SUCCESS;
 }

+AcResult
+acGridRandomize(void)
+{
+    ERRCHK(grid.initialized);
+
+    AcMesh host;
+    acMeshCreate(grid.submesh.info, &host);
+    acMeshRandomize(&host);
+    acDeviceLoadMesh(grid.device, STREAM_DEFAULT, host);
+    acMeshDestroy(&host);
+
+    return AC_SUCCESS;
+}
+
 AcResult
 acGridInit(const AcMeshInfo info)
 {
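The size-based datatype switch in acTransferCommData() above carries a TODO about 16-bit payloads: MPI_SHORT matches the size, but it is a signed integer type, so it is only safe while the packed buffers are purely sent and received, never reduced or converted by MPI. One commonly used alternative is to ship the elements as opaque 16-bit or byte data. The helper below is an illustration under that assumption, not part of the patch; the local typedef is a stand-in for the AcRealPacked typedef added to kernels.h.

    #include <mpi.h>

    typedef float AcRealPacked; /* stand-in for the typedef introduced in src/core/kernels/kernels.h */

    /* Map the packed element size to an MPI datatype without implying a value conversion. */
    static MPI_Datatype
    packed_mpi_datatype(void)
    {
        switch (sizeof(AcRealPacked)) {
        case 2: return MPI_UINT16_T; /* opaque 16-bit payload, e.g. half precision */
        case 4: return MPI_FLOAT;
        case 8: return MPI_DOUBLE;
        default: return MPI_BYTE;    /* then scale send/recv counts by sizeof(AcRealPacked) */
        }
    }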
diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index a9e05f9..b04eaf5 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -8,11 +8,13 @@
 #define MPI_GPUDIRECT_DISABLED (0)
 #endif // AC_MPI_ENABLED

+typedef AcReal AcRealPacked;
+
 typedef struct {
     int3 dims;
-    AcReal* data;
+    AcRealPacked* data;

-    AcReal* data_pinned;
+    AcRealPacked* data_pinned;
     bool pinned = false; // Set if data was received to pinned memory
 } PackedData;

diff --git a/src/utils/memory.c b/src/utils/memory.c
index 334d106..f52fa12 100644
--- a/src/utils/memory.c
+++ b/src/utils/memory.c
@@ -38,23 +38,6 @@ acMeshSet(const AcReal value, AcMesh* mesh)
     return AC_SUCCESS;
 }

-static AcReal
-randf(void)
-{
-    return (AcReal)rand() / (AcReal)RAND_MAX;
-}
-
-AcResult
-acMeshRandomize(AcMesh* mesh)
-{
-    const int n = acVertexBufferSize(mesh->info);
-    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
-        for (int i = 0; i < n; ++i)
-            mesh->vertex_buffer[w][i] = randf();
-
-    return AC_SUCCESS;
-}
-
 AcResult
 acMeshApplyPeriodicBounds(AcMesh* mesh)
 {
diff --git a/src/utils/verification.c b/src/utils/verification.c
index e57498c..3f1bdf6 100644
--- a/src/utils/verification.c
+++ b/src/utils/verification.c
@@ -53,10 +53,10 @@ acGetError(const AcReal model, const AcReal candidate)
         const long double e   = floorl(logl(fabsl(error.model)) / logl(2));
         const long double ulp = powl(base, e - (p - 1));
-        const long double machine_epsilon = 0.5 * powl(base, -(p - 1));
-        error.abs_error = fabsl(model - candidate);
+        const long double machine_epsilon = 0.5l * powl(base, -(p - 1));
+        error.abs_error = fabsl((long double)model - (long double)candidate);
         error.ulp_error = error.abs_error / ulp;
-        error.rel_error = fabsl(1.0l - candidate / model) / machine_epsilon;
+        error.rel_error = fabsl(1.0l - (long double)candidate / (long double)model) / machine_epsilon;
     }

     error.maximum_magnitude = error.minimum_magnitude = 0;
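For context on the error measures touched in verification.c: with base 2 and precision p = 53 (IEEE-754 double), the machine_epsilon used there is 0.5 * 2^-(p-1) = 2^-53 ≈ 1.1e-16, and one ULP of a value with binary exponent e is 2^(e-(p-1)). The self-contained sketch below reproduces the same absolute/ULP/relative error arithmetic for one pair of values; it mirrors only the branch shown above and is illustrative, not a replacement for acGetError().

    /* Illustrative only: same arithmetic as the patched branch, for IEEE-754 double. */
    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
        const long double model     = 1.0L;
        const long double candidate = 1.0L + 3e-16L;

        const int p                   = 53;
        const long double base        = 2.0L;
        const long double e           = floorl(logl(fabsl(model)) / logl(base));
        const long double ulp         = powl(base, e - (p - 1));
        const long double machine_eps = 0.5L * powl(base, -(p - 1));

        const long double abs_error = fabsl(model - candidate);
        const long double ulp_error = abs_error / ulp;
        const long double rel_error = fabsl(1.0L - candidate / model) / machine_eps;

        printf("abs %Lg, ulp %Lg, rel %Lg (in units of machine epsilon)\n",
               abs_error, ulp_error, rel_error);
        return 0;
    }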