From c1f2a6c3401912698c638f62f4bb677f373cb74a Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 28 Oct 2020 12:55:32 +0200 Subject: [PATCH 01/12] Setup for benchmarks --- CMakeLists.txt | 10 +- samples/benchmark/main.cc | 2 +- .../genbenchmarkscripts/CMakeLists.txt | 8 ++ .../genbenchmarkscripts/main.c | 120 ++++++++++++++++++ samples/genbenchmarkscripts/main.c | 72 +++++++---- scripts/buildtestcases.sh | 56 ++++++++ scripts/postprocess_benchmarks.sh | 41 ++++++ 7 files changed, 279 insertions(+), 30 deletions(-) create mode 100644 samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt create mode 100644 samples/genbenchmarkscripts/genbenchmarkscripts/main.c create mode 100755 scripts/buildtestcases.sh create mode 100755 scripts/postprocess_benchmarks.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 66b8001..8bfc25e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ project(astaroth C CXX CUDA) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) ## Project-wide compilation flags -set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface +set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") set(CMAKE_C_STANDARD 11) @@ -38,10 +38,10 @@ endif() message(STATUS "Build type: " ${CMAKE_BUILD_TYPE}) ## Options -option(DOUBLE_PRECISION "Generates double precision code." OFF) +option(DOUBLE_PRECISION "Generates double precision code." ON) option(BUILD_SAMPLES "Builds projects in samples subdirectory." ON) -option(MPI_ENABLED "Enables additional functions for MPI communciation." OFF) -option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON) +option(MPI_ENABLED "Enables additional functions for MPI communciation." ON) +option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF) option(VERBOSE "Enables various status and warning messages" OFF) ## Options (DEPRECATED) @@ -110,7 +110,7 @@ if (BUILD_SAMPLES) add_subdirectory(samples/cpptest) add_subdirectory(samples/mpitest) add_subdirectory(samples/benchmark) - #add_subdirectory(samples/genbenchmarkscripts) + add_subdirectory(samples/genbenchmarkscripts) #add_subdirectory(samples/mpi_reduce_bench) add_subdirectory(samples/fortrantest) diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc index f205b04..3c5e504 100644 --- a/samples/benchmark/main.cc +++ b/samples/benchmark/main.cc @@ -154,7 +154,7 @@ main(int argc, char** argv) }*/ // Percentiles - const size_t num_iters = 1000; + const size_t num_iters = 100; const double nth_percentile = 0.90; std::vector results; // ms results.reserve(num_iters); diff --git a/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt b/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt new file mode 100644 index 0000000..6115fde --- /dev/null +++ b/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt @@ -0,0 +1,8 @@ +add_executable(genbenchmarkscripts main.c) + +add_custom_command( + TARGET genbenchmarkscripts POST_BUILD + COMMAND genbenchmarkscripts + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMENT "Generating benchmark scripts" +) diff --git a/samples/genbenchmarkscripts/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/genbenchmarkscripts/main.c new file mode 100644 index 0000000..d7b953b --- /dev/null +++ b/samples/genbenchmarkscripts/genbenchmarkscripts/main.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include + +int +main(void) +{ + const int max_nprocs = 64; + for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) { + char filename[4096]; + sprintf(filename, "benchmark_%d.sh", nprocs); + + FILE* fp = fopen(filename, "w"); + assert(fp); + + // Boilerplate + fprintf(fp, "#!/bin/bash\n"); + fprintf(fp, "#BATCH --job-name=astaroth\n"); // OK + fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK + fprintf(fp, "#SBATCH --time=04:00:00\n"); // OK + fprintf(fp, "#SBATCH --mem=0\n"); // OK + fprintf(fp, "#SBATCH --partition=gpu\n"); // OK + fprintf(fp, "#SBATCH --exclusive\n"); // OK + fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // OK + fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs); + // HACK: exclude misconfigured nodes on Puhti + fprintf(fp, "#SBATCH -x " + "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n"); + // fprintf(fp, "#SBATCH --cpus-per-task=10\n"); + + // nprocs, nodes, gpus + const int max_gpus_per_node = 4; + const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node; + const int nodes = (int)ceil((double)nprocs / max_gpus_per_node); + fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK + fprintf(fp, "#SBATCH -n %d\n", nprocs); // OK + fprintf(fp, "#SBATCH -N %d\n", nodes); // OK + // fprintf(fp, "#SBATCH --exclusive\n"); + // if (nprocs >= 4) + // fprintf(fp, "#SBATCH --ntasks-per-socket=2\n"); + + // Modules + // OpenMPI + fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n"); + // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // + // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n"); + // if (nprocs >= 32) + // fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // + // https://www.open-mpi.org/fa + + // HPCX + // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); + // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0 + + // Profile and run + // fprintf(fp, "mkdir -p profile_%d\n", nprocs); + + /* + const int nx = 256; // max size 2048; + const int ny = nx; + const int nz = nx; + + fprintf(fp, + //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d " + //"%d\n", + "srun ./benchmark %d %d %d\n", nx, ny, nz); + */ + // fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz); + + const char* files[] = { + "benchmark_decomp_1D", "benchmark_decomp_2D", "benchmark_decomp_3D", + "benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm", + "benchmark_meshsize_256", "benchmark_meshsize_512", "benchmark_meshsize_1024", + "benchmark_meshsize_2048", "benchmark_stencilord_2", "benchmark_stencilord_4", + "benchmark_stencilord_6", "benchmark_stencilord_8", "benchmark_timings_control", + "benchmark_timings_comp", "benchmark_timings_comm", "benchmark_timings_default", + "benchmark_timings_corners", "benchmark_weak_128", "benchmark_weak_256", + "benchmark_weak_512", + }; + for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) { + int nn = 256; + if (strcmp(files[i], "benchmark_meshsize_512") == 0) + nn = 512; + else if (strcmp(files[i], "benchmark_meshsize_1024") == 0) + nn = 1024; + else if (strcmp(files[i], "benchmark_meshsize_2048") == 0) + nn = 2048; + else if (strcmp(files[i], "benchmark_weak_128") == 0) + nn = 128; + else if (strcmp(files[i], "benchmark_weak_512") == 0) + nn = 512; + + // W/ Fredriks tunings + // (may cause Assertion `status == UCS_OK' failed errors) + // fprintf(fp, + // "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " + // "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n", + // files[i], nn, nn, nn); + if (nodes >= 2) { + fprintf(fp, + "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " + "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm " + "-f core.* && cd ..)\n", + files[i], nn, nn, nn); + } + else { + fprintf(fp, + "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* " + "&& cd ..)\n", + files[i], nn, nn, nn); + } + } + + fclose(fp); + } + + return EXIT_SUCCESS; +} diff --git a/samples/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/main.c index 7be0872..d7b953b 100644 --- a/samples/genbenchmarkscripts/main.c +++ b/samples/genbenchmarkscripts/main.c @@ -17,42 +17,48 @@ main(void) // Boilerplate fprintf(fp, "#!/bin/bash\n"); - fprintf(fp, "#BATCH --job-name=astaroth\n"); - fprintf(fp, "#SBATCH --account=project_2000403\n"); - fprintf(fp, "#SBATCH --time=03:00:00\n"); - fprintf(fp, "#SBATCH --mem=32000\n"); - fprintf(fp, "#SBATCH --partition=gpu\n"); + fprintf(fp, "#BATCH --job-name=astaroth\n"); // OK + fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK + fprintf(fp, "#SBATCH --time=04:00:00\n"); // OK + fprintf(fp, "#SBATCH --mem=0\n"); // OK + fprintf(fp, "#SBATCH --partition=gpu\n"); // OK + fprintf(fp, "#SBATCH --exclusive\n"); // OK + fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // OK fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs); + // HACK: exclude misconfigured nodes on Puhti + fprintf(fp, "#SBATCH -x " + "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n"); // fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // nprocs, nodes, gpus const int max_gpus_per_node = 4; const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node; const int nodes = (int)ceil((double)nprocs / max_gpus_per_node); - fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); - fprintf(fp, "#SBATCH -n %d\n", nprocs); - fprintf(fp, "#SBATCH -N %d\n", nodes); + fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK + fprintf(fp, "#SBATCH -n %d\n", nprocs); // OK + fprintf(fp, "#SBATCH -N %d\n", nodes); // OK // fprintf(fp, "#SBATCH --exclusive\n"); - if (nprocs >= 4) - fprintf(fp, "#SBATCH --ntasks-per-socket=2\n"); + // if (nprocs >= 4) + // fprintf(fp, "#SBATCH --ntasks-per-socket=2\n"); // Modules // OpenMPI fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n"); - //fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa - //fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n"); - //if (nprocs >= 32) - // fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // https://www.open-mpi.org/fa + // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // + // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n"); + // if (nprocs >= 32) + // fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // + // https://www.open-mpi.org/fa // HPCX - //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); - //fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0 + // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); + // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0 // Profile and run // fprintf(fp, "mkdir -p profile_%d\n", nprocs); /* - const int nx = 256; // max size 1792; + const int nx = 256; // max size 2048; const int ny = nx; const int nz = nx; @@ -67,11 +73,11 @@ main(void) "benchmark_decomp_1D", "benchmark_decomp_2D", "benchmark_decomp_3D", "benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm", "benchmark_meshsize_256", "benchmark_meshsize_512", "benchmark_meshsize_1024", - "benchmark_meshsize_1792", "benchmark_stencilord_2", "benchmark_stencilord_4", + "benchmark_meshsize_2048", "benchmark_stencilord_2", "benchmark_stencilord_4", "benchmark_stencilord_6", "benchmark_stencilord_8", "benchmark_timings_control", "benchmark_timings_comp", "benchmark_timings_comm", "benchmark_timings_default", "benchmark_timings_corners", "benchmark_weak_128", "benchmark_weak_256", - "benchmark_weak_448", + "benchmark_weak_512", }; for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) { int nn = 256; @@ -79,14 +85,32 @@ main(void) nn = 512; else if (strcmp(files[i], "benchmark_meshsize_1024") == 0) nn = 1024; - else if (strcmp(files[i], "benchmark_meshsize_1792") == 0) - nn = 1792; + else if (strcmp(files[i], "benchmark_meshsize_2048") == 0) + nn = 2048; else if (strcmp(files[i], "benchmark_weak_128") == 0) nn = 128; - else if (strcmp(files[i], "benchmark_weak_448") == 0) - nn = 448; + else if (strcmp(files[i], "benchmark_weak_512") == 0) + nn = 512; - fprintf(fp, "$(cd %s && srun ./benchmark %d %d %d && cd ..)\n", files[i], nn, nn, nn); + // W/ Fredriks tunings + // (may cause Assertion `status == UCS_OK' failed errors) + // fprintf(fp, + // "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " + // "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n", + // files[i], nn, nn, nn); + if (nodes >= 2) { + fprintf(fp, + "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " + "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm " + "-f core.* && cd ..)\n", + files[i], nn, nn, nn); + } + else { + fprintf(fp, + "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* " + "&& cd ..)\n", + files[i], nn, nn, nn); + } } fclose(fp); diff --git a/scripts/buildtestcases.sh b/scripts/buildtestcases.sh new file mode 100755 index 0000000..7157656 --- /dev/null +++ b/scripts/buildtestcases.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Modules (!!!) +module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl +#module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl +#export UCX_MEMTYPE_CACHE=n # Workaround for bug in hpcx-mpi/2.5.0 + +load_default_case() { + # Pinned or RDMA + sed -i 's/#define MPI_USE_PINNED ([0-9]*)/#define MPI_USE_PINNED (0)/' src/core/device.cc + + # Stencil order + sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' acc/stdlib/stdderiv.h + sed -i 's/#define STENCIL_ORDER ([0-9]*)/#define STENCIL_ORDER (6)/' include/astaroth.h + + # Timings + sed -i 's/MPI_COMPUTE_ENABLED (.)/MPI_COMPUTE_ENABLED (1)/' src/core/device.cc + sed -i 's/MPI_COMM_ENABLED (.)/MPI_COMM_ENABLED (1)/' src/core/device.cc + sed -i 's/MPI_INCL_CORNERS (.)/MPI_INCL_CORNERS (0)/' src/core/device.cc + + # Decomposition + sed -i 's/MPI_DECOMPOSITION_AXES (.)/MPI_DECOMPOSITION_AXES (3)/' src/core/device.cc + + # Strong/Weak + sed -i 's/const TestType test = .*;/const TestType test = TEST_STRONG_SCALING;/' samples/benchmark/main.cc + + # Num iters + sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 1000;/' samples/benchmark/main.cc +} + +# $1 test name +# $2 grid size +create_case() { + DIR="benchmark_$1" + mkdir -p $DIR + cd $DIR + /users/pekkila/cmake/build/bin/cmake .. && make -j + cd .. +} + +# Mesh size +load_default_case +create_case "meshsize_256" +sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 100;/' samples/benchmark/main.cc +create_case "meshsize_512" +create_case "meshsize_1024" +create_case "meshsize_2048" + +# Run batch jobs +sbatch benchmark_meshsize_256/benchmark_1.sh +sbatch benchmark_meshsize_256/benchmark_2.sh +sbatch benchmark_meshsize_256/benchmark_4.sh +sbatch benchmark_meshsize_256/benchmark_8.sh +sbatch benchmark_meshsize_256/benchmark_16.sh +sbatch benchmark_meshsize_256/benchmark_32.sh +sbatch benchmark_meshsize_256/benchmark_64.sh diff --git a/scripts/postprocess_benchmarks.sh b/scripts/postprocess_benchmarks.sh new file mode 100755 index 0000000..7a60884 --- /dev/null +++ b/scripts/postprocess_benchmarks.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +OUTPUT=results.csv +rm -i $OUTPUT + +# $1 input dir +process_input() { + echo $1 + #cat $1/*.csv | sort -n + cat $1/*.csv | sort -k1n -k3n | awk '!a[$1]++' + echo "" +} >> $OUTPUT + +process_input "benchmark_decomp_1D" +process_input "benchmark_decomp_2D" +process_input "benchmark_decomp_3D" +process_input "benchmark_decomp_1D_comm" +process_input "benchmark_decomp_2D_comm" +process_input "benchmark_decomp_3D_comm" + +process_input "benchmark_meshsize_256" +process_input "benchmark_meshsize_512" +process_input "benchmark_meshsize_1024" +process_input "benchmark_meshsize_2048" + +process_input "benchmark_stencilord_2" +process_input "benchmark_stencilord_4" +process_input "benchmark_stencilord_6" +process_input "benchmark_stencilord_8" + +process_input "benchmark_timings_control" +process_input "benchmark_timings_comp" +process_input "benchmark_timings_comm" +process_input "benchmark_timings_default" +process_input "benchmark_timings_corners" + +process_input "benchmark_weak_128" +process_input "benchmark_weak_256" +process_input "benchmark_weak_512" + +cat $OUTPUT From 0a2827593c3479f1b117476663da70f6e6263972 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 28 Oct 2020 12:56:34 +0200 Subject: [PATCH 02/12] Added very experimental implementation for mixed precision. Comm is done with f32 and comp with f64. --- src/core/device.cc | 11 ++++++++--- src/core/kernels/kernels.h | 6 ++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/core/device.cc b/src/core/device.cc index a6ec793..0ee987f 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -14,7 +14,7 @@ #define MPI_DECOMPOSITION_AXES (3) #define MPI_COMPUTE_ENABLED (1) #define MPI_COMM_ENABLED (1) -#define MPI_INCL_CORNERS (1) +#define MPI_INCL_CORNERS (0) #define MPI_USE_PINNED (0) // Do inter-node comm with pinned memory #define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost @@ -721,7 +721,7 @@ acCreatePackedDataHost(const int3 dims) data.dims = dims; const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES; - data.data = (AcReal*)malloc(bytes); + data.data = (AcRealPacked*)malloc(bytes); ERRCHK_ALWAYS(data.data); return data; @@ -1132,8 +1132,13 @@ acTransferCommData(const Device device, // cudaSetDevice(device->id); MPI_Datatype datatype = MPI_FLOAT; - if (sizeof(AcReal) == 8) + if (sizeof(data->srcs[0].data[0]) == 2) { + datatype = MPI_SHORT; // TODO CONFIRM THAT IS CORRECTLY CAST TO HALF + } else if (sizeof(data->srcs[0].data[0]) == 4) { + datatype = MPI_FLOAT; + } else { datatype = MPI_DOUBLE; + } int nprocs, pid; MPI_Comm_size(MPI_COMM_WORLD, &nprocs); diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h index fc9c745..513c5e4 100644 --- a/src/core/kernels/kernels.h +++ b/src/core/kernels/kernels.h @@ -8,11 +8,13 @@ #define MPI_GPUDIRECT_DISABLED (0) #endif // AC_MPI_ENABLED +typedef float AcRealPacked; + typedef struct { int3 dims; - AcReal* data; + AcRealPacked* data; - AcReal* data_pinned; + AcRealPacked* data_pinned; bool pinned = false; // Set if data was received to pinned memory } PackedData; From ae0d4de23c3ddc01d0439f419a0150204550f1c3 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Thu, 29 Oct 2020 16:33:41 +0200 Subject: [PATCH 03/12] The root host mesh is no longer allocated during benchmarking as this caused out-of-memory errors in weak scaling tests --- include/astaroth.h | 3 +++ samples/benchmark/main.cc | 7 ++++++- src/core/CMakeLists.txt | 2 +- src/core/device.cc | 16 ++++++++++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/astaroth.h b/include/astaroth.h index d32b367..039ab82 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -284,6 +284,9 @@ Resets all devices on the current grid. */ AcResult acGridQuit(void); +/** Randomizes the local mesh */ +AcResult acGridRandomize(void); + /** */ AcResult acGridSynchronizeStream(const Stream stream); diff --git a/samples/benchmark/main.cc b/samples/benchmark/main.cc index 3c5e504..76223bb 100644 --- a/samples/benchmark/main.cc +++ b/samples/benchmark/main.cc @@ -107,7 +107,7 @@ main(int argc, char** argv) } } - const TestType test = TEST_STRONG_SCALING; + const TestType test = TEST_WEAK_SCALING; if (test == TEST_WEAK_SCALING) { uint3_64 decomp = decompose(nprocs); info.int_params[AC_nx] *= decomp.x; @@ -126,10 +126,15 @@ main(int argc, char** argv) // GPU alloc & compute acGridInit(info); + acGridRandomize(); + + /* AcMesh model; acMeshCreate(info, &model); acMeshRandomize(&model); acGridLoadMesh(STREAM_DEFAULT, model); + */ + /* acGridLoadMesh(STREAM_DEFAULT, model); diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index ca96366..15cd4fa 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -2,7 +2,7 @@ find_package(CUDAToolkit) ## Astaroth Core add_library(astaroth_core STATIC device.cc node.cc astaroth.cc astaroth_fortran.cc) -target_link_libraries(astaroth_core astaroth_kernels CUDA::cudart CUDA::cuda_driver) +target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver) ## Options if (MPI_ENABLED) diff --git a/src/core/device.cc b/src/core/device.cc index 0ee987f..4af17fc 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -1242,6 +1242,22 @@ acGridSynchronizeStream(const Stream stream) return AC_SUCCESS; } + +#include "astaroth_utils.h" // HACK TO RANDOMIZE +AcResult +acGridRandomize(void) +{ + ERRCHK(grid.initialized); + + AcMesh host; + acMeshCreate(grid.submesh.info, &host); + acMeshRandomize(&host); + acDeviceLoadMesh(grid.device, STREAM_DEFAULT, host); + acMeshDestroy(&host); + + return AC_SUCCESS; +} + AcResult acGridInit(const AcMeshInfo info) { From bf7eb83084f3e13b590de1da344ac4fa238d7350 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Thu, 29 Oct 2020 16:34:48 +0200 Subject: [PATCH 04/12] Updated benchmark script --- scripts/buildtestcases.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/buildtestcases.sh b/scripts/buildtestcases.sh index 7157656..89a6b02 100755 --- a/scripts/buildtestcases.sh +++ b/scripts/buildtestcases.sh @@ -46,6 +46,14 @@ create_case "meshsize_512" create_case "meshsize_1024" create_case "meshsize_2048" +# Weak scaling +load_default_case +sed -i 's/const TestType test = .*;/const TestType test = TEST_WEAK_SCALING;/' samples/benchmark/main.cc +create_case "weak_128" +create_case "weak_256" +sed -i 's/const size_t num_iters = .*;/const size_t num_iters = 100;/' samples/benchmark/main.cc +create_case "weak_512" + # Run batch jobs sbatch benchmark_meshsize_256/benchmark_1.sh sbatch benchmark_meshsize_256/benchmark_2.sh From 00b7b537cea6f3c1a3f8c9c433f352156a37daac Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 2 Nov 2020 10:58:18 +0200 Subject: [PATCH 05/12] Modifications for master merge: reverted CMakeLists.txt to the original, disabled mixed precision by default --- CMakeLists.txt | 10 +++++----- src/core/kernels/kernels.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8bfc25e..66b8001 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ project(astaroth C CXX CUDA) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) ## Project-wide compilation flags -set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface +set(COMMON_FLAGS "-mavx -DOMPI_SKIP_MPICXX -Wall -Wextra -Werror -Wdouble-promotion -Wfloat-conversion -Wshadow") # -DOMPI_SKIP_MPICXX is to force OpenMPI to use the C interface set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}") set(CMAKE_C_STANDARD 11) @@ -38,10 +38,10 @@ endif() message(STATUS "Build type: " ${CMAKE_BUILD_TYPE}) ## Options -option(DOUBLE_PRECISION "Generates double precision code." ON) +option(DOUBLE_PRECISION "Generates double precision code." OFF) option(BUILD_SAMPLES "Builds projects in samples subdirectory." ON) -option(MPI_ENABLED "Enables additional functions for MPI communciation." ON) -option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF) +option(MPI_ENABLED "Enables additional functions for MPI communciation." OFF) +option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON) option(VERBOSE "Enables various status and warning messages" OFF) ## Options (DEPRECATED) @@ -110,7 +110,7 @@ if (BUILD_SAMPLES) add_subdirectory(samples/cpptest) add_subdirectory(samples/mpitest) add_subdirectory(samples/benchmark) - add_subdirectory(samples/genbenchmarkscripts) + #add_subdirectory(samples/genbenchmarkscripts) #add_subdirectory(samples/mpi_reduce_bench) add_subdirectory(samples/fortrantest) diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h index 513c5e4..282f1c4 100644 --- a/src/core/kernels/kernels.h +++ b/src/core/kernels/kernels.h @@ -8,7 +8,7 @@ #define MPI_GPUDIRECT_DISABLED (0) #endif // AC_MPI_ENABLED -typedef float AcRealPacked; +typedef AcReal AcRealPacked; typedef struct { int3 dims; From d48a478254c1917f17841055aca8cc5a91157fb1 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 2 Nov 2020 16:39:01 +0200 Subject: [PATCH 06/12] Removed duplicate genbenchmarkscripts --- .../genbenchmarkscripts/CMakeLists.txt | 8 -- .../genbenchmarkscripts/main.c | 120 ------------------ 2 files changed, 128 deletions(-) delete mode 100644 samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt delete mode 100644 samples/genbenchmarkscripts/genbenchmarkscripts/main.c diff --git a/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt b/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt deleted file mode 100644 index 6115fde..0000000 --- a/samples/genbenchmarkscripts/genbenchmarkscripts/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_executable(genbenchmarkscripts main.c) - -add_custom_command( - TARGET genbenchmarkscripts POST_BUILD - COMMAND genbenchmarkscripts - WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - COMMENT "Generating benchmark scripts" -) diff --git a/samples/genbenchmarkscripts/genbenchmarkscripts/main.c b/samples/genbenchmarkscripts/genbenchmarkscripts/main.c deleted file mode 100644 index d7b953b..0000000 --- a/samples/genbenchmarkscripts/genbenchmarkscripts/main.c +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include -#include -#include - -int -main(void) -{ - const int max_nprocs = 64; - for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) { - char filename[4096]; - sprintf(filename, "benchmark_%d.sh", nprocs); - - FILE* fp = fopen(filename, "w"); - assert(fp); - - // Boilerplate - fprintf(fp, "#!/bin/bash\n"); - fprintf(fp, "#BATCH --job-name=astaroth\n"); // OK - fprintf(fp, "#SBATCH --account=project_2000403\n"); // OK - fprintf(fp, "#SBATCH --time=04:00:00\n"); // OK - fprintf(fp, "#SBATCH --mem=0\n"); // OK - fprintf(fp, "#SBATCH --partition=gpu\n"); // OK - fprintf(fp, "#SBATCH --exclusive\n"); // OK - fprintf(fp, "#SBATCH --cpus-per-task=10\n"); // OK - fprintf(fp, "#SBATCH --output=benchmark-%d-%%j.out\n", nprocs); - // HACK: exclude misconfigured nodes on Puhti - fprintf(fp, "#SBATCH -x " - "r04g[05-06],r02g02,r14g04,r04g07,r16g07,r18g[02-03],r15g08,r17g06,r13g04\n"); - // fprintf(fp, "#SBATCH --cpus-per-task=10\n"); - - // nprocs, nodes, gpus - const int max_gpus_per_node = 4; - const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node; - const int nodes = (int)ceil((double)nprocs / max_gpus_per_node); - fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node); // OK - fprintf(fp, "#SBATCH -n %d\n", nprocs); // OK - fprintf(fp, "#SBATCH -N %d\n", nodes); // OK - // fprintf(fp, "#SBATCH --exclusive\n"); - // if (nprocs >= 4) - // fprintf(fp, "#SBATCH --ntasks-per-socket=2\n"); - - // Modules - // OpenMPI - fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi/4.0.3-cuda nccl\n"); - // fprintf(fp, "export UCX_TLS=rc,sm,cuda_copy,gdr_copy,cuda_ipc\n"); // - // https://www.open-mpi.org/fa fprintf(fp, "export PSM2_CUDA=1\nexport PSM2_GPUDIRECT=1\n"); - // if (nprocs >= 32) - // fprintf(fp, "export UCX_TLS=ud_x,cuda_copy,gdr_copy,cuda_ipc\n"); // - // https://www.open-mpi.org/fa - - // HPCX - // fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n"); - // fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n"); // Workaround for bug in hpcx-mpi/2.5.0 - - // Profile and run - // fprintf(fp, "mkdir -p profile_%d\n", nprocs); - - /* - const int nx = 256; // max size 2048; - const int ny = nx; - const int nz = nx; - - fprintf(fp, - //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d " - //"%d\n", - "srun ./benchmark %d %d %d\n", nx, ny, nz); - */ - // fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz); - - const char* files[] = { - "benchmark_decomp_1D", "benchmark_decomp_2D", "benchmark_decomp_3D", - "benchmark_decomp_1D_comm", "benchmark_decomp_2D_comm", "benchmark_decomp_3D_comm", - "benchmark_meshsize_256", "benchmark_meshsize_512", "benchmark_meshsize_1024", - "benchmark_meshsize_2048", "benchmark_stencilord_2", "benchmark_stencilord_4", - "benchmark_stencilord_6", "benchmark_stencilord_8", "benchmark_timings_control", - "benchmark_timings_comp", "benchmark_timings_comm", "benchmark_timings_default", - "benchmark_timings_corners", "benchmark_weak_128", "benchmark_weak_256", - "benchmark_weak_512", - }; - for (size_t i = 0; i < sizeof(files) / sizeof(files[0]); ++i) { - int nn = 256; - if (strcmp(files[i], "benchmark_meshsize_512") == 0) - nn = 512; - else if (strcmp(files[i], "benchmark_meshsize_1024") == 0) - nn = 1024; - else if (strcmp(files[i], "benchmark_meshsize_2048") == 0) - nn = 2048; - else if (strcmp(files[i], "benchmark_weak_128") == 0) - nn = 128; - else if (strcmp(files[i], "benchmark_weak_512") == 0) - nn = 512; - - // W/ Fredriks tunings - // (may cause Assertion `status == UCS_OK' failed errors) - // fprintf(fp, - // "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " - // "UCX_MAX_RNDV_RAILS=1 srun ./benchmark %d %d %d && cd ..)\n", - // files[i], nn, nn, nn); - if (nodes >= 2) { - fprintf(fp, - "$(cd %s && UCX_RNDV_THRESH=16384 UCX_RNDV_SCHEME=get_zcopy " - "UCX_MAX_RNDV_RAILS=1 srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm " - "-f core.* && cd ..)\n", - files[i], nn, nn, nn); - } - else { - fprintf(fp, - "$(cd %s && srun --kill-on-bad-exit=0 ./benchmark %d %d %d && rm -f core.* " - "&& cd ..)\n", - files[i], nn, nn, nn); - } - } - - fclose(fp); - } - - return EXIT_SUCCESS; -} From 349093768d8ab0af52d38341e7ab08ba8bf0f937 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Mon, 2 Nov 2020 17:14:26 +0200 Subject: [PATCH 07/12] Added acMeshRandomize to astaroth.h to keep core and utils separate --- include/astaroth.h | 3 +++ include/astaroth_utils.h | 3 --- src/core/CMakeLists.txt | 2 +- src/core/astaroth.cc | 17 +++++++++++++++++ src/core/device.cc | 2 -- src/utils/memory.c | 17 ----------------- 6 files changed, 21 insertions(+), 23 deletions(-) diff --git a/include/astaroth.h b/include/astaroth.h index 039ab82..21e45b9 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -590,6 +590,9 @@ AcResult acUpdateBuiltinParams(AcMeshInfo* config); /** Creates a mesh stored in host memory */ AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh); +/** Randomizes a host mesh */ +AcResult acMeshRandomize(AcMesh* mesh); + /** Destroys a mesh stored in host memory */ AcResult acMeshDestroy(AcMesh* mesh); diff --git a/include/astaroth_utils.h b/include/astaroth_utils.h index f742f73..4289c1b 100644 --- a/include/astaroth_utils.h +++ b/include/astaroth_utils.h @@ -50,9 +50,6 @@ AcResult acVertexBufferSet(const VertexBufferHandle handle, const AcReal value, /** */ AcResult acMeshSet(const AcReal value, AcMesh* mesh); -/** */ -AcResult acMeshRandomize(AcMesh* mesh); - /** */ AcResult acMeshApplyPeriodicBounds(AcMesh* mesh); diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 15cd4fa..ca96366 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -2,7 +2,7 @@ find_package(CUDAToolkit) ## Astaroth Core add_library(astaroth_core STATIC device.cc node.cc astaroth.cc astaroth_fortran.cc) -target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver) +target_link_libraries(astaroth_core astaroth_kernels CUDA::cudart CUDA::cuda_driver) ## Options if (MPI_ENABLED) diff --git a/src/core/astaroth.cc b/src/core/astaroth.cc index 1c8d0f7..c6d9a19 100644 --- a/src/core/astaroth.cc +++ b/src/core/astaroth.cc @@ -221,6 +221,23 @@ acMeshCreate(const AcMeshInfo info, AcMesh* mesh) return AC_SUCCESS; } +static AcReal +randf(void) +{ + return (AcReal)rand() / (AcReal)RAND_MAX; +} + +AcResult +acMeshRandomize(AcMesh* mesh) +{ + const int n = acVertexBufferSize(mesh->info); + for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) + for (int i = 0; i < n; ++i) + mesh->vertex_buffer[w][i] = randf(); + + return AC_SUCCESS; +} + AcResult acMeshDestroy(AcMesh* mesh) { diff --git a/src/core/device.cc b/src/core/device.cc index 4af17fc..24d6e38 100644 --- a/src/core/device.cc +++ b/src/core/device.cc @@ -1242,8 +1242,6 @@ acGridSynchronizeStream(const Stream stream) return AC_SUCCESS; } - -#include "astaroth_utils.h" // HACK TO RANDOMIZE AcResult acGridRandomize(void) { diff --git a/src/utils/memory.c b/src/utils/memory.c index 334d106..f52fa12 100644 --- a/src/utils/memory.c +++ b/src/utils/memory.c @@ -38,23 +38,6 @@ acMeshSet(const AcReal value, AcMesh* mesh) return AC_SUCCESS; } -static AcReal -randf(void) -{ - return (AcReal)rand() / (AcReal)RAND_MAX; -} - -AcResult -acMeshRandomize(AcMesh* mesh) -{ - const int n = acVertexBufferSize(mesh->info); - for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) - for (int i = 0; i < n; ++i) - mesh->vertex_buffer[w][i] = randf(); - - return AC_SUCCESS; -} - AcResult acMeshApplyPeriodicBounds(AcMesh* mesh) { From dff560561ecd9ed4e553aeaab57d57486c22b47d Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 3 Nov 2020 10:28:58 +0200 Subject: [PATCH 08/12] Explicit casting during error checking --- src/utils/verification.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/utils/verification.c b/src/utils/verification.c index e57498c..3f1bdf6 100644 --- a/src/utils/verification.c +++ b/src/utils/verification.c @@ -53,10 +53,10 @@ acGetError(const AcReal model, const AcReal candidate) const long double e = floorl(logl(fabsl(error.model)) / logl(2)); const long double ulp = powl(base, e - (p - 1)); - const long double machine_epsilon = 0.5 * powl(base, -(p - 1)); - error.abs_error = fabsl(model - candidate); + const long double machine_epsilon = 0.5l * powl(base, -(p - 1)); + error.abs_error = fabsl((long double)model - (long double)candidate); error.ulp_error = error.abs_error / ulp; - error.rel_error = fabsl(1.0l - candidate / model) / machine_epsilon; + error.rel_error = fabsl(1.0l - (long double)candidate / (long double)model) / machine_epsilon; } error.maximum_magnitude = error.minimum_magnitude = 0; From f61223c02b21bd417189081ee97e7f88ea85a22a Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 11 Nov 2020 13:18:29 +0200 Subject: [PATCH 09/12] The number of default streams is now 32 --- acc/src/code_generator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acc/src/code_generator.c b/acc/src/code_generator.c index 017591a..b1de527 100644 --- a/acc/src/code_generator.c +++ b/acc/src/code_generator.c @@ -716,7 +716,7 @@ external acdevicesynchronizestream fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_SCALARRAY_HANDLES = %d\n\n", enumcounter); // Streams - const size_t nstreams = 20; + const size_t nstreams = 32; for (size_t i = 0; i < nstreams; ++i) { fprintf(DSLHEADER, "#define STREAM_%lu (%lu)\n", i, i); fprintf(FHEADER, "integer(c_int), parameter :: STREAM_%lu = %lu\n", i, i); From a463fd492f64c8386cc9ab4aa032a6dad25c3936 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Thu, 19 Nov 2020 14:31:10 +0800 Subject: [PATCH 10/12] Synched simulation.cc with existing work. --- samples/standalone/simulation.cc | 33 ++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/samples/standalone/simulation.cc b/samples/standalone/simulation.cc index 0bf193e..178ff17 100644 --- a/samples/standalone/simulation.cc +++ b/samples/standalone/simulation.cc @@ -43,7 +43,13 @@ // NEED TO BE DEFINED HERE. IS NOT NOTICED BY compile_acc call. #define LFORCING (0) + +#ifdef VTXBUF_ACCRETION +#define LSINK (1) +#else #define LSINK (0) +#endif + #ifdef BFIELDX #define LBFIELD (1) #else @@ -322,6 +328,7 @@ run_simulation(const char* config_path) // acmesh_init_to(INIT_TYPE_SIMPLE_CORE, mesh); //Initial condition for a collapse test #if LSINK + printf("WARNING! Sink particle is under development. USE AT YOUR OWN RISK!") vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh); #endif @@ -387,18 +394,10 @@ run_simulation(const char* config_path) /* Step the simulation */ AcReal accreted_mass = 0.0; AcReal sink_mass = 0.0; + AcReal uu_freefall = 0.0; AcReal dt_typical = 0.0; int dtcounter = 0; for (int i = start_step + 1; i < max_steps; ++i) { - const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); -#if LBFIELD - const AcReal vAmax = acReduceVecScal(RTYPE_ALFVEN_MAX, BFIELDX, BFIELDY, BFIELDZ, VTXBUF_LNRHO); - const AcReal uref = max(umax, vAmax); - const AcReal dt = host_timestep(uref, vAmax, mesh_info); -#else - const AcReal dt = host_timestep(umax, 0.0l, mesh_info); -#endif - #if LSINK const AcReal sum_mass = acReduceScal(RTYPE_SUM, VTXBUF_ACCRETION); @@ -406,7 +405,7 @@ run_simulation(const char* config_path) sink_mass = 0.0; sink_mass = mesh_info.real_params[AC_M_sink_init] + accreted_mass; acLoadDeviceConstant(AC_M_sink, sink_mass); - vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh); + vertex_buffer_set(VTXBUF_ACCRETION, 0.0, mesh); //TODO THIS IS A BUG! WILL ONLY SET HOST BUFFER 0! int on_off_switch; if (i < 1) { @@ -416,11 +415,25 @@ run_simulation(const char* config_path) on_off_switch = 1; } acLoadDeviceConstant(AC_switch_accretion, on_off_switch); + + //Adjust courant condition for free fall velocity + const AcReal RR = mesh_info.real_params[AC_soft]*mesh_info.real_params[AC_soft]; + const AcReal SQ2GM = sqrt(AcReal(2.0)*mesh_info.real_params[AC_G_const]*sink_mass); + uu_freefall = fabs(SQ2GM / sqrt(RR)); #else accreted_mass = -1.0; sink_mass = -1.0; #endif + const AcReal umax = acReduceVec(RTYPE_MAX, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); +#if LBFIELD + const AcReal vAmax = acReduceVecScal(RTYPE_ALFVEN_MAX, BFIELDX, BFIELDY, BFIELDZ, VTXBUF_LNRHO); + const AcReal uref = max(max(umax,uu_freefall), vAmax); + const AcReal dt = host_timestep(uref, vAmax, mesh_info); +#else + const AcReal dt = host_timestep(umax, 0.0l, mesh_info); +#endif + #if LFORCING const ForcingParams forcing_params = generateForcingParams(mesh_info); loadForcingParamsToDevice(forcing_params); From 204f0753437c9e91bf1ab4fe8c618a421788cbe1 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Thu, 19 Nov 2020 15:14:28 +0800 Subject: [PATCH 11/12] AC_unit_magnetic in dsl --- acc/mhd_solver/stencil_kernel.ac | 1 + 1 file changed, 1 insertion(+) diff --git a/acc/mhd_solver/stencil_kernel.ac b/acc/mhd_solver/stencil_kernel.ac index e0efa94..db83a3f 100644 --- a/acc/mhd_solver/stencil_kernel.ac +++ b/acc/mhd_solver/stencil_kernel.ac @@ -39,6 +39,7 @@ uniform Scalar AC_zorig; uniform Scalar AC_unit_density; uniform Scalar AC_unit_velocity; uniform Scalar AC_unit_length; +uniform Scalar AC_unit_magnetic; // properties of gravitating star uniform Scalar AC_star_pos_x; uniform Scalar AC_star_pos_y; From e3eb7822132465d5c06a0956d4ef6c6bf8428359 Mon Sep 17 00:00:00 2001 From: Miikka Vaisala Date: Fri, 20 Nov 2020 11:11:54 +0800 Subject: [PATCH 12/12] Sorry for the compilation problem. Corrected. --- samples/standalone/simulation.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/standalone/simulation.cc b/samples/standalone/simulation.cc index 178ff17..4e0e176 100644 --- a/samples/standalone/simulation.cc +++ b/samples/standalone/simulation.cc @@ -431,7 +431,8 @@ run_simulation(const char* config_path) const AcReal uref = max(max(umax,uu_freefall), vAmax); const AcReal dt = host_timestep(uref, vAmax, mesh_info); #else - const AcReal dt = host_timestep(umax, 0.0l, mesh_info); + const AcReal uref = max(umax,uu_freefall); + const AcReal dt = host_timestep(uref, 0.0l, mesh_info); #endif #if LFORCING