Merged mpi-to-master-merge-candidate-2020-06-01 here
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
## CMake settings
|
## CMake settings
|
||||||
# V3.9 required for first-class CUDA support
|
# V3.9 required for first-class CUDA support
|
||||||
# V3.17 required for the FindCUDAToolkit package
|
# V3.17 required for the FindCUDAToolkit package
|
||||||
cmake_minimum_required(VERSION 3.17)
|
cmake_minimum_required(VERSION 3.17)
|
||||||
find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH)
|
find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH)
|
||||||
find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH)
|
find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH)
|
||||||
|
|
||||||
@@ -30,11 +30,11 @@ endif()
|
|||||||
message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
|
message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
|
||||||
|
|
||||||
## Options
|
## Options
|
||||||
option(DOUBLE_PRECISION "Generates double precision code." OFF)
|
option(DOUBLE_PRECISION "Generates double precision code." ON)
|
||||||
option(BUILD_SAMPLES "Builds projects in samples subdirectory." OFF)
|
option(BUILD_SAMPLES "Builds projects in samples subdirectory." ON)
|
||||||
option(BUILD_STANDALONE "Builds standalone Astaroth." ON)
|
option(BUILD_STANDALONE "Builds standalone Astaroth." OFF)
|
||||||
option(MPI_ENABLED "Enables additional functions for MPI communciation." OFF)
|
option(MPI_ENABLED "Enables additional functions for MPI communciation." ON)
|
||||||
option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." ON)
|
option(MULTIGPU_ENABLED "Enables multi-GPU on a single node. Uses peer-to-peer communication instead of MPI. Affects Legacy & Node layers only." OFF)
|
||||||
|
|
||||||
## Options (DEPRECATED)
|
## Options (DEPRECATED)
|
||||||
# option(BUILD_DEBUG "Builds the program with extensive error checking" OFF)
|
# option(BUILD_DEBUG "Builds the program with extensive error checking" OFF)
|
||||||
@@ -96,11 +96,6 @@ if (BUILD_SAMPLES)
|
|||||||
add_subdirectory(samples/cpptest)
|
add_subdirectory(samples/cpptest)
|
||||||
add_subdirectory(samples/mpitest)
|
add_subdirectory(samples/mpitest)
|
||||||
add_subdirectory(samples/benchmark)
|
add_subdirectory(samples/benchmark)
|
||||||
add_subdirectory(samples/mpi_reduce_bench)
|
|
||||||
add_custom_target(copy-mpibench-script ALL
|
|
||||||
COMMAND ${CMAKE_COMMAND} -E copy
|
|
||||||
${CMAKE_SOURCE_DIR}/samples/mpi_reduce_bench/mpibench.sh
|
|
||||||
${CMAKE_CURRENT_BINARY_DIR}/mpibench.sh)
|
|
||||||
add_subdirectory(samples/bwtest)
|
add_subdirectory(samples/bwtest)
|
||||||
add_subdirectory(samples/genbenchmarkscripts)
|
add_subdirectory(samples/genbenchmarkscripts)
|
||||||
endif()
|
endif()
|
||||||
|
@@ -24,11 +24,11 @@ AC_bin_steps = 1000
|
|||||||
AC_bin_save_t = 1e666
|
AC_bin_save_t = 1e666
|
||||||
|
|
||||||
// Set to 0 if you want to run the simulation from the beginning, or just a new
|
// Set to 0 if you want to run the simulation from the beginning, or just a new
|
||||||
// simulation. If continuing from a saved step, specify the step number here.
|
// simulation. If continuing from a saved step, specify the step number here.
|
||||||
AC_start_step = 0
|
AC_start_step = 0
|
||||||
|
|
||||||
// Maximum time in code units. If negative, there is no time limit
|
// Maximum time in code units. If negative, there is no time limit
|
||||||
AC_max_time = -1.0
|
AC_max_time = -1.0
|
||||||
|
|
||||||
// Hydro
|
// Hydro
|
||||||
AC_cdt = 0.4
|
AC_cdt = 0.4
|
||||||
@@ -49,7 +49,7 @@ AC_forcing_magnitude = 1e-5
|
|||||||
AC_kmin = 0.8
|
AC_kmin = 0.8
|
||||||
AC_kmax = 1.2
|
AC_kmax = 1.2
|
||||||
// Switches forcing off and accretion on
|
// Switches forcing off and accretion on
|
||||||
AC_switch_accretion = 0
|
AC_switch_accretion = 0
|
||||||
|
|
||||||
// Entropy
|
// Entropy
|
||||||
AC_cp_sound = 1.0
|
AC_cp_sound = 1.0
|
||||||
|
@@ -39,8 +39,47 @@ typedef enum {
|
|||||||
NUM_TESTS,
|
NUM_TESTS,
|
||||||
} TestType;
|
} TestType;
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
uint64_t x, y, z;
|
||||||
|
} uint3_64;
|
||||||
|
|
||||||
|
static uint3_64
|
||||||
|
operator+(const uint3_64& a, const uint3_64& b)
|
||||||
|
{
|
||||||
|
return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint3_64
|
||||||
|
morton3D(const uint64_t pid)
|
||||||
|
{
|
||||||
|
uint64_t i, j, k;
|
||||||
|
i = j = k = 0;
|
||||||
|
|
||||||
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
|
const uint64_t mask = 0x1l << 3 * bit;
|
||||||
|
k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
|
||||||
|
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
|
||||||
|
i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (uint3_64){i, j, k};
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint3_64
|
||||||
|
decompose(const uint64_t target)
|
||||||
|
{
|
||||||
|
// This is just so beautifully elegant. Complex and efficient decomposition
|
||||||
|
// in just one line of code.
|
||||||
|
uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
|
||||||
|
|
||||||
|
ERRCHK_ALWAYS(p.x * p.y * p.z == target);
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
main(void)
|
main(int argc, char** argv)
|
||||||
{
|
{
|
||||||
MPI_Init(NULL, NULL);
|
MPI_Init(NULL, NULL);
|
||||||
int nprocs, pid;
|
int nprocs, pid;
|
||||||
@@ -51,9 +90,30 @@ main(void)
|
|||||||
AcMeshInfo info;
|
AcMeshInfo info;
|
||||||
acLoadConfig(AC_DEFAULT_CONFIG, &info);
|
acLoadConfig(AC_DEFAULT_CONFIG, &info);
|
||||||
|
|
||||||
|
if (argc > 1) {
|
||||||
|
if (argc == 4) {
|
||||||
|
const int nx = atoi(argv[1]);
|
||||||
|
const int ny = atoi(argv[2]);
|
||||||
|
const int nz = atoi(argv[3]);
|
||||||
|
info.int_params[AC_nx] = nx;
|
||||||
|
info.int_params[AC_ny] = ny;
|
||||||
|
info.int_params[AC_nz] = nz;
|
||||||
|
acUpdateBuiltinParams(&info);
|
||||||
|
printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fprintf(stderr, "Could not parse arguments. Usage: ./benchmark <nx> <ny> <nz>.\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const TestType test = TEST_STRONG_SCALING;
|
const TestType test = TEST_STRONG_SCALING;
|
||||||
if (test == TEST_WEAK_SCALING)
|
if (test == TEST_WEAK_SCALING) {
|
||||||
info.int_params[AC_nz] *= nprocs;
|
uint3_64 decomp = decompose(nprocs);
|
||||||
|
info.int_params[AC_nx] *= decomp.x;
|
||||||
|
info.int_params[AC_ny] *= decomp.y;
|
||||||
|
info.int_params[AC_nz] *= decomp.z;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
AcMesh model, candidate;
|
AcMesh model, candidate;
|
||||||
@@ -115,7 +175,7 @@ main(void)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// Percentiles
|
// Percentiles
|
||||||
const size_t num_iters = 100;
|
const size_t num_iters = 1000;
|
||||||
const double nth_percentile = 0.90;
|
const double nth_percentile = 0.90;
|
||||||
std::vector<double> results; // ms
|
std::vector<double> results; // ms
|
||||||
results.reserve(num_iters);
|
results.reserve(num_iters);
|
||||||
|
@@ -5,5 +5,5 @@ find_package(OpenMP)
|
|||||||
find_package(CUDAToolkit)
|
find_package(CUDAToolkit)
|
||||||
|
|
||||||
add_executable(bwtest main.c)
|
add_executable(bwtest main.c)
|
||||||
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static)
|
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver)
|
||||||
target_compile_options(bwtest PRIVATE -O3)
|
target_compile_options(bwtest PRIVATE -O3)
|
||||||
|
@@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
|
|
||||||
|
#include <cuda.h> // CUDA driver API
|
||||||
#include <cuda_runtime_api.h>
|
#include <cuda_runtime_api.h>
|
||||||
|
|
||||||
#include "timer_hires.h" // From src/common
|
#include "timer_hires.h" // From src/common
|
||||||
@@ -13,7 +14,13 @@
|
|||||||
//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
|
//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
|
||||||
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
|
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
|
||||||
|
|
||||||
#define errchk(x) { if (!(x)) { fprintf(stderr, "errchk(%s) failed", #x); assert(x); }}
|
#define errchk(x) \
|
||||||
|
{ \
|
||||||
|
if (!(x)) { \
|
||||||
|
fprintf(stderr, "errchk(%s) failed", #x); \
|
||||||
|
assert(x); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Findings:
|
Findings:
|
||||||
@@ -56,6 +63,18 @@ allocDevice(const size_t bytes)
|
|||||||
static uint8_t*
|
static uint8_t*
|
||||||
allocDevicePinned(const size_t bytes)
|
allocDevicePinned(const size_t bytes)
|
||||||
{
|
{
|
||||||
|
#define USE_CUDA_DRIVER_PINNING (1)
|
||||||
|
#if USE_CUDA_DRIVER_PINNING
|
||||||
|
uint8_t* arr = allocDevice(bytes);
|
||||||
|
|
||||||
|
unsigned int flag = 1;
|
||||||
|
CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
|
||||||
|
(CUdeviceptr)arr);
|
||||||
|
|
||||||
|
errchk(retval == CUDA_SUCCESS);
|
||||||
|
return arr;
|
||||||
|
|
||||||
|
#else
|
||||||
uint8_t* arr;
|
uint8_t* arr;
|
||||||
// Standard (20 GiB/s internode, 85 GiB/s intranode)
|
// Standard (20 GiB/s internode, 85 GiB/s intranode)
|
||||||
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
|
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
|
||||||
@@ -65,8 +84,24 @@ allocDevicePinned(const size_t bytes)
|
|||||||
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
|
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
|
||||||
errchk(retval == cudaSuccess);
|
errchk(retval == cudaSuccess);
|
||||||
return arr;
|
return arr;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
static uint8_t*
|
||||||
|
allocDevicePinned(const size_t bytes)
|
||||||
|
{
|
||||||
|
uint8_t* arr;
|
||||||
|
// Standard (20 GiB/s internode, 85 GiB/s intranode)
|
||||||
|
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
|
||||||
|
// Unified mem (5 GiB/s internode, 6 GiB/s intranode)
|
||||||
|
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
|
||||||
|
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
|
||||||
|
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
|
||||||
|
errchk(retval == cudaSuccess);
|
||||||
|
return arr;
|
||||||
|
}*/
|
||||||
|
|
||||||
static void
|
static void
|
||||||
freeDevice(uint8_t* arr)
|
freeDevice(uint8_t* arr)
|
||||||
{
|
{
|
||||||
@@ -239,7 +274,6 @@ send_h2d(uint8_t* src, uint8_t* dst)
|
|||||||
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
|
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
|
sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
|
||||||
{
|
{
|
||||||
@@ -299,10 +333,10 @@ measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_
|
|||||||
MPI_Barrier(MPI_COMM_WORLD);
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
measurebw2(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
|
measurebw2(const char* msg, const size_t bytes,
|
||||||
uint8_t* hsrc, uint8_t* ddst)
|
void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
|
||||||
|
uint8_t* hsrc, uint8_t* ddst)
|
||||||
{
|
{
|
||||||
const size_t num_samples = 100;
|
const size_t num_samples = 100;
|
||||||
|
|
||||||
@@ -386,8 +420,8 @@ main(void)
|
|||||||
measurebw("Bidirectional bandwidth, twoway (Host)", //
|
measurebw("Bidirectional bandwidth, twoway (Host)", //
|
||||||
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
|
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
|
||||||
measurebw("Bidirectional bandwidth, async multiple (Host)", //
|
measurebw("Bidirectional bandwidth, async multiple (Host)", //
|
||||||
2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
||||||
//measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
|
// measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
|
||||||
// 2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
|
// 2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
|
||||||
|
|
||||||
freeHost(src);
|
freeHost(src);
|
||||||
@@ -406,11 +440,12 @@ main(void)
|
|||||||
measurebw("Bidirectional bandwidth, twoway (Device)", //
|
measurebw("Bidirectional bandwidth, twoway (Device)", //
|
||||||
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
|
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
|
||||||
measurebw("Bidirectional bandwidth, async multiple (Device)", //
|
measurebw("Bidirectional bandwidth, async multiple (Device)", //
|
||||||
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
||||||
//measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
|
// measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
|
||||||
// 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
|
// 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
|
||||||
measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
|
measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
|
||||||
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
|
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
|
||||||
|
dst);
|
||||||
|
|
||||||
freeDevice(src);
|
freeDevice(src);
|
||||||
freeDevice(dst);
|
freeDevice(dst);
|
||||||
@@ -428,7 +463,7 @@ main(void)
|
|||||||
measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
|
measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
|
||||||
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
|
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
|
||||||
measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
|
measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
|
||||||
2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
|
||||||
|
|
||||||
freeDevice(src);
|
freeDevice(src);
|
||||||
freeDevice(dst);
|
freeDevice(dst);
|
||||||
@@ -444,7 +479,8 @@ main(void)
|
|||||||
measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
|
measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
|
||||||
measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
|
measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
|
||||||
|
|
||||||
measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
|
measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
|
||||||
|
ddst);
|
||||||
|
|
||||||
freeDevice(dsrc);
|
freeDevice(dsrc);
|
||||||
freeDevice(ddst);
|
freeDevice(ddst);
|
||||||
@@ -462,7 +498,8 @@ main(void)
|
|||||||
measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
|
measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
|
||||||
measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
|
measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
|
||||||
|
|
||||||
measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc, ddst);
|
measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
|
||||||
|
hsrc, ddst);
|
||||||
|
|
||||||
freeDevice(dsrc);
|
freeDevice(dsrc);
|
||||||
freeDevice(ddst);
|
freeDevice(ddst);
|
||||||
|
@@ -29,6 +29,7 @@ main(void)
|
|||||||
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
|
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
|
||||||
fprintf(fp, "#SBATCH -n %d\n", nprocs);
|
fprintf(fp, "#SBATCH -n %d\n", nprocs);
|
||||||
fprintf(fp, "#SBATCH -N %d\n", nodes);
|
fprintf(fp, "#SBATCH -N %d\n", nodes);
|
||||||
|
fprintf(fp, "#SBATCH --exclusive\n");
|
||||||
|
|
||||||
// Modules
|
// Modules
|
||||||
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
|
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
|
||||||
@@ -36,8 +37,14 @@ main(void)
|
|||||||
|
|
||||||
// Profile and run
|
// Profile and run
|
||||||
fprintf(fp, "mkdir -p profile_%d\n", nprocs);
|
fprintf(fp, "mkdir -p profile_%d\n", nprocs);
|
||||||
fprintf(fp, "srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark\n",
|
|
||||||
nprocs);
|
const int nx = 256; // max size 1792;
|
||||||
|
const int ny = nx;
|
||||||
|
const int nz = nx;
|
||||||
|
fprintf(fp,
|
||||||
|
//"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
|
||||||
|
//"%d\n",
|
||||||
|
"srun ./benchmark %d %d %d\n", nx, ny, nz);
|
||||||
|
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
}
|
}
|
||||||
|
@@ -2,7 +2,7 @@ find_package(CUDAToolkit)
|
|||||||
|
|
||||||
## Astaroth Core
|
## Astaroth Core
|
||||||
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
|
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
|
||||||
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart)
|
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels CUDA::cudart CUDA::cuda_driver)
|
||||||
|
|
||||||
## Options
|
## Options
|
||||||
if (MPI_ENABLED)
|
if (MPI_ENABLED)
|
||||||
|
@@ -10,7 +10,16 @@
|
|||||||
#include "kernels/kernels.h"
|
#include "kernels/kernels.h"
|
||||||
|
|
||||||
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
|
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
|
||||||
#define MPI_GPUDIRECT_DISABLED (0)
|
|
||||||
|
#define MPI_GPUDIRECT_DISABLED (0) // Buffer through host memory, deprecated
|
||||||
|
#define MPI_DECOMPOSITION_AXES (3)
|
||||||
|
#define MPI_COMPUTE_ENABLED (1)
|
||||||
|
#define MPI_COMM_ENABLED (1)
|
||||||
|
#define MPI_INCL_CORNERS (0)
|
||||||
|
#define MPI_USE_PINNED (1) // Do inter-node comm with pinned memory
|
||||||
|
#define MPI_USE_CUDA_DRIVER_PINNING (0) // Pin with cuPointerSetAttribute, otherwise cudaMallocHost
|
||||||
|
|
||||||
|
#include <cuda.h> // CUDA driver API (needed if MPI_USE_CUDA_DRIVER_PINNING is set)
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
acDevicePrintInfo(const Device device)
|
acDevicePrintInfo(const Device device)
|
||||||
@@ -523,11 +532,32 @@ morton3D(const uint64_t pid)
|
|||||||
{
|
{
|
||||||
uint64_t i, j, k;
|
uint64_t i, j, k;
|
||||||
i = j = k = 0;
|
i = j = k = 0;
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
|
||||||
const uint64_t mask = 0x1l << 3 * bit;
|
if (MPI_DECOMPOSITION_AXES == 3) {
|
||||||
i |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
|
const uint64_t mask = 0x1l << 3 * bit;
|
||||||
k |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
|
k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
|
||||||
|
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
|
||||||
|
i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Just a quick copy/paste for other decomp dims
|
||||||
|
else if (MPI_DECOMPOSITION_AXES == 2) {
|
||||||
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
|
const uint64_t mask = 0x1l << 2 * bit;
|
||||||
|
j |= ((pid & (mask << 0)) >> 1 * bit) >> 0;
|
||||||
|
k |= ((pid & (mask << 1)) >> 1 * bit) >> 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (MPI_DECOMPOSITION_AXES == 1) {
|
||||||
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
|
const uint64_t mask = 0x1l << 1 * bit;
|
||||||
|
k |= ((pid & (mask << 0)) >> 0 * bit) >> 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n");
|
||||||
|
ERRCHK_ALWAYS(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
return (uint3_64){i, j, k};
|
return (uint3_64){i, j, k};
|
||||||
@@ -537,12 +567,33 @@ static uint64_t
|
|||||||
morton1D(const uint3_64 pid)
|
morton1D(const uint3_64 pid)
|
||||||
{
|
{
|
||||||
uint64_t i = 0;
|
uint64_t i = 0;
|
||||||
for (int bit = 0; bit <= 21; ++bit) {
|
|
||||||
const uint64_t mask = 0x1l << bit;
|
if (MPI_DECOMPOSITION_AXES == 3) {
|
||||||
i |= ((pid.x & mask) << 0) << 2 * bit;
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
i |= ((pid.y & mask) << 1) << 2 * bit;
|
const uint64_t mask = 0x1l << bit;
|
||||||
i |= ((pid.z & mask) << 2) << 2 * bit;
|
i |= ((pid.z & mask) << 0) << 2 * bit;
|
||||||
|
i |= ((pid.y & mask) << 1) << 2 * bit;
|
||||||
|
i |= ((pid.x & mask) << 2) << 2 * bit;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
else if (MPI_DECOMPOSITION_AXES == 2) {
|
||||||
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
|
const uint64_t mask = 0x1l << bit;
|
||||||
|
i |= ((pid.y & mask) << 0) << 1 * bit;
|
||||||
|
i |= ((pid.z & mask) << 1) << 1 * bit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (MPI_DECOMPOSITION_AXES == 1) {
|
||||||
|
for (int bit = 0; bit <= 21; ++bit) {
|
||||||
|
const uint64_t mask = 0x1l << bit;
|
||||||
|
i |= ((pid.z & mask) << 0) << 0 * bit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fprintf(stderr, "Invalid MPI_DECOMPOSITION_AXES\n");
|
||||||
|
ERRCHK_ALWAYS(0);
|
||||||
|
}
|
||||||
|
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -605,9 +656,18 @@ acCreatePackedData(const int3 dims)
|
|||||||
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
|
const size_t bytes = dims.x * dims.y * dims.z * sizeof(data.data[0]) * NUM_VTXBUF_HANDLES;
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data, bytes));
|
||||||
|
|
||||||
|
#if MPI_USE_CUDA_DRIVER_PINNING
|
||||||
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&data.data_pinned, bytes));
|
||||||
|
|
||||||
|
unsigned int flag = 1;
|
||||||
|
CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
|
||||||
|
(CUdeviceptr)data.data_pinned);
|
||||||
|
ERRCHK_ALWAYS(retval == CUDA_SUCCESS);
|
||||||
|
#else
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
|
ERRCHK_CUDA_ALWAYS(cudaMallocHost((void**)&data.data_pinned, bytes));
|
||||||
// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
|
// ERRCHK_CUDA_ALWAYS(cudaMallocManaged((void**)&data.data_pinned, bytes)); // Significantly
|
||||||
// slower than pinned (38 ms vs. 125 ms)
|
// slower than pinned (38 ms vs. 125 ms)
|
||||||
|
#endif // USE_CUDA_DRIVER_PINNING
|
||||||
|
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
@@ -1070,7 +1130,7 @@ acTransferCommData(const Device device, //
|
|||||||
const int npid = getPid(pid3d + neighbor, decomp);
|
const int npid = getPid(pid3d + neighbor, decomp);
|
||||||
|
|
||||||
PackedData* dst = &data->dsts[b0_idx];
|
PackedData* dst = &data->dsts[b0_idx];
|
||||||
if (onTheSameNode(pid, npid)) {
|
if (onTheSameNode(pid, npid) || !MPI_USE_PINNED) {
|
||||||
MPI_Irecv(dst->data, count, datatype, npid, b0_idx, //
|
MPI_Irecv(dst->data, count, datatype, npid, b0_idx, //
|
||||||
MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
|
MPI_COMM_WORLD, &data->recv_reqs[b0_idx]);
|
||||||
dst->pinned = false;
|
dst->pinned = false;
|
||||||
@@ -1092,7 +1152,7 @@ acTransferCommData(const Device device, //
|
|||||||
const int npid = getPid(pid3d - neighbor, decomp);
|
const int npid = getPid(pid3d - neighbor, decomp);
|
||||||
|
|
||||||
PackedData* src = &data->srcs[b0_idx];
|
PackedData* src = &data->srcs[b0_idx];
|
||||||
if (onTheSameNode(pid, npid)) {
|
if (onTheSameNode(pid, npid) || !MPI_USE_PINNED) {
|
||||||
cudaStreamSynchronize(data->streams[b0_idx]);
|
cudaStreamSynchronize(data->streams[b0_idx]);
|
||||||
MPI_Isend(src->data, count, datatype, npid, b0_idx, //
|
MPI_Isend(src->data, count, datatype, npid, b0_idx, //
|
||||||
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
|
MPI_COMM_WORLD, &data->send_reqs[b0_idx]);
|
||||||
@@ -1131,6 +1191,8 @@ typedef struct {
|
|||||||
CommData sidexy_data;
|
CommData sidexy_data;
|
||||||
CommData sidexz_data;
|
CommData sidexz_data;
|
||||||
CommData sideyz_data;
|
CommData sideyz_data;
|
||||||
|
|
||||||
|
// int comm_cart;
|
||||||
} Grid;
|
} Grid;
|
||||||
|
|
||||||
static Grid grid = {};
|
static Grid grid = {};
|
||||||
@@ -1281,11 +1343,13 @@ AcResult
|
|||||||
acGridIntegrate(const Stream stream, const AcReal dt)
|
acGridIntegrate(const Stream stream, const AcReal dt)
|
||||||
{
|
{
|
||||||
ERRCHK(grid.initialized);
|
ERRCHK(grid.initialized);
|
||||||
// acGridSynchronizeStream(stream);
|
acGridSynchronizeStream(stream);
|
||||||
|
|
||||||
const Device device = grid.device;
|
const Device device = grid.device;
|
||||||
const int3 nn = grid.nn;
|
const int3 nn = grid.nn;
|
||||||
// CommData corner_data = grid.corner_data; // Do not rm: required for corners
|
#if MPI_INCL_CORNERS
|
||||||
|
CommData corner_data = grid.corner_data; // Do not rm: required for corners
|
||||||
|
#endif // MPI_INCL_CORNERS
|
||||||
CommData edgex_data = grid.edgex_data;
|
CommData edgex_data = grid.edgex_data;
|
||||||
CommData edgey_data = grid.edgey_data;
|
CommData edgey_data = grid.edgey_data;
|
||||||
CommData edgez_data = grid.edgez_data;
|
CommData edgez_data = grid.edgez_data;
|
||||||
@@ -1293,10 +1357,8 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
CommData sidexz_data = grid.sidexz_data;
|
CommData sidexz_data = grid.sidexz_data;
|
||||||
CommData sideyz_data = grid.sideyz_data;
|
CommData sideyz_data = grid.sideyz_data;
|
||||||
|
|
||||||
acDeviceSynchronizeStream(device, stream);
|
// Corners
|
||||||
|
#if MPI_INCL_CORNERS
|
||||||
// Corners
|
|
||||||
/*
|
|
||||||
// Do not rm: required for corners
|
// Do not rm: required for corners
|
||||||
const int3 corner_b0s[] = {
|
const int3 corner_b0s[] = {
|
||||||
(int3){0, 0, 0},
|
(int3){0, 0, 0},
|
||||||
@@ -1309,7 +1371,7 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
|
(int3){0, NGHOST + nn.y, NGHOST + nn.z},
|
||||||
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
|
(int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z},
|
||||||
};
|
};
|
||||||
*/
|
#endif // MPI_INCL_CORNERS
|
||||||
|
|
||||||
// Edges X
|
// Edges X
|
||||||
const int3 edgex_b0s[] = {
|
const int3 edgex_b0s[] = {
|
||||||
@@ -1357,85 +1419,99 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
};
|
};
|
||||||
|
|
||||||
for (int isubstep = 0; isubstep < 3; ++isubstep) {
|
for (int isubstep = 0; isubstep < 3; ++isubstep) {
|
||||||
// acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
|
acDeviceSynchronizeStream(device, STREAM_ALL);
|
||||||
acPackCommData(device, edgex_b0s, &edgex_data);
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
acPackCommData(device, edgey_b0s, &edgey_data);
|
|
||||||
acPackCommData(device, edgez_b0s, &edgez_data);
|
#if MPI_COMPUTE_ENABLED
|
||||||
acPackCommData(device, sidexy_b0s, &sidexy_data);
|
acPackCommData(device, sidexy_b0s, &sidexy_data);
|
||||||
acPackCommData(device, sidexz_b0s, &sidexz_data);
|
acPackCommData(device, sidexz_b0s, &sidexz_data);
|
||||||
acPackCommData(device, sideyz_b0s, &sideyz_data);
|
acPackCommData(device, sideyz_b0s, &sideyz_data);
|
||||||
|
#endif // MPI_COMPUTE_ENABLED
|
||||||
|
|
||||||
|
#if MPI_COMM_ENABLED
|
||||||
|
acTransferCommData(device, sidexy_b0s, &sidexy_data);
|
||||||
|
acTransferCommData(device, sidexz_b0s, &sidexz_data);
|
||||||
|
acTransferCommData(device, sideyz_b0s, &sideyz_data);
|
||||||
|
#endif // MPI_COMM_ENABLED
|
||||||
|
|
||||||
|
#if MPI_COMPUTE_ENABLED
|
||||||
//////////// INNER INTEGRATION //////////////
|
//////////// INNER INTEGRATION //////////////
|
||||||
{
|
{
|
||||||
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
|
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
|
||||||
const int3 m2 = nn;
|
const int3 m2 = nn;
|
||||||
acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
|
acDeviceIntegrateSubstep(device, STREAM_16, isubstep, m1, m2, dt);
|
||||||
}
|
}
|
||||||
////////////////////////////////////////////
|
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
acPackCommData(device, edgex_b0s, &edgex_data);
|
||||||
|
acPackCommData(device, edgey_b0s, &edgey_data);
|
||||||
|
acPackCommData(device, edgez_b0s, &edgez_data);
|
||||||
|
#endif // MPI_COMPUTE_ENABLED
|
||||||
|
|
||||||
#if MPI_GPUDIRECT_DISABLED
|
#if MPI_COMM_ENABLED
|
||||||
// acTransferCommDataToHost(device, &corner_data); // Do not rm: required for corners
|
acTransferCommDataWait(sidexy_data);
|
||||||
acTransferCommDataToHost(device, &edgex_data);
|
acUnpinCommData(device, &sidexy_data);
|
||||||
acTransferCommDataToHost(device, &edgey_data);
|
acTransferCommDataWait(sidexz_data);
|
||||||
acTransferCommDataToHost(device, &edgez_data);
|
acUnpinCommData(device, &sidexz_data);
|
||||||
acTransferCommDataToHost(device, &sidexy_data);
|
acTransferCommDataWait(sideyz_data);
|
||||||
acTransferCommDataToHost(device, &sidexz_data);
|
acUnpinCommData(device, &sideyz_data);
|
||||||
acTransferCommDataToHost(device, &sideyz_data);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
|
|
||||||
acTransferCommData(device, edgex_b0s, &edgex_data);
|
acTransferCommData(device, edgex_b0s, &edgex_data);
|
||||||
acTransferCommData(device, edgey_b0s, &edgey_data);
|
acTransferCommData(device, edgey_b0s, &edgey_data);
|
||||||
acTransferCommData(device, edgez_b0s, &edgez_data);
|
acTransferCommData(device, edgez_b0s, &edgez_data);
|
||||||
acTransferCommData(device, sidexy_b0s, &sidexy_data);
|
#endif // MPI_COMM_ENABLED
|
||||||
acTransferCommData(device, sidexz_b0s, &sidexz_data);
|
|
||||||
acTransferCommData(device, sideyz_b0s, &sideyz_data);
|
|
||||||
|
|
||||||
// acTransferCommDataWait(corner_data); // Do not rm: required for corners
|
#if MPI_COMPUTE_ENABLED
|
||||||
acTransferCommDataWait(edgex_data);
|
#if MPI_INCL_CORNERS
|
||||||
acTransferCommDataWait(edgey_data);
|
acPackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
|
||||||
acTransferCommDataWait(edgez_data);
|
#endif // MPI_INCL_CORNERS
|
||||||
acTransferCommDataWait(sidexy_data);
|
|
||||||
acTransferCommDataWait(sidexz_data);
|
|
||||||
acTransferCommDataWait(sideyz_data);
|
|
||||||
|
|
||||||
#if MPI_GPUDIRECT_DISABLED
|
|
||||||
// acTransferCommDataToDevice(device, &corner_data); // Do not rm: required for corners
|
|
||||||
acTransferCommDataToDevice(device, &edgex_data);
|
|
||||||
acTransferCommDataToDevice(device, &edgey_data);
|
|
||||||
acTransferCommDataToDevice(device, &edgez_data);
|
|
||||||
acTransferCommDataToDevice(device, &sidexy_data);
|
|
||||||
acTransferCommDataToDevice(device, &sidexz_data);
|
|
||||||
acTransferCommDataToDevice(device, &sideyz_data);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// acUnpinCommData(device, &corner_data); // Do not rm: required for corners
|
|
||||||
acUnpinCommData(device, &edgex_data);
|
|
||||||
acUnpinCommData(device, &edgey_data);
|
|
||||||
acUnpinCommData(device, &edgez_data);
|
|
||||||
acUnpinCommData(device, &sidexy_data);
|
|
||||||
acUnpinCommData(device, &sidexz_data);
|
|
||||||
acUnpinCommData(device, &sideyz_data);
|
|
||||||
|
|
||||||
// acUnpackCommData(device, corner_b0s, &corner_data);
|
|
||||||
acUnpackCommData(device, edgex_b0s, &edgex_data);
|
|
||||||
acUnpackCommData(device, edgey_b0s, &edgey_data);
|
|
||||||
acUnpackCommData(device, edgez_b0s, &edgez_data);
|
|
||||||
acUnpackCommData(device, sidexy_b0s, &sidexy_data);
|
acUnpackCommData(device, sidexy_b0s, &sidexy_data);
|
||||||
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
|
acUnpackCommData(device, sidexz_b0s, &sidexz_data);
|
||||||
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
|
acUnpackCommData(device, sideyz_b0s, &sideyz_data);
|
||||||
//////////// OUTER INTEGRATION //////////////
|
#endif // MPI_COMPUTE_ENABLED
|
||||||
|
|
||||||
|
#if MPI_COMM_ENABLED
|
||||||
|
acTransferCommDataWait(edgex_data);
|
||||||
|
acUnpinCommData(device, &edgex_data);
|
||||||
|
acTransferCommDataWait(edgey_data);
|
||||||
|
acUnpinCommData(device, &edgey_data);
|
||||||
|
acTransferCommDataWait(edgez_data);
|
||||||
|
acUnpinCommData(device, &edgez_data);
|
||||||
|
|
||||||
|
#if MPI_INCL_CORNERS
|
||||||
|
acTransferCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
|
||||||
|
#endif // MPI_INCL_CORNERS
|
||||||
|
#endif // MPI_COMM_ENABLED
|
||||||
|
|
||||||
|
#if MPI_COMPUTE_ENABLED
|
||||||
|
acUnpackCommData(device, edgex_b0s, &edgex_data);
|
||||||
|
acUnpackCommData(device, edgey_b0s, &edgey_data);
|
||||||
|
acUnpackCommData(device, edgez_b0s, &edgez_data);
|
||||||
|
#endif // MPI_COMPUTE_ENABLED
|
||||||
|
|
||||||
|
#if MPI_COMM_ENABLED
|
||||||
|
#if MPI_INCL_CORNERS
|
||||||
|
acTransferCommDataWait(corner_data); // Do not rm: required for corners
|
||||||
|
acUnpinCommData(device, &corner_data); // Do not rm: required for corners
|
||||||
|
#endif // MPI_INCL_CORNERS
|
||||||
|
#endif // MPI_COMM_ENABLED
|
||||||
|
#if MPI_COMPUTE_ENABLED
|
||||||
|
#if MPI_INCL_CORNERS
|
||||||
|
acUnpackCommData(device, corner_b0s, &corner_data); // Do not rm: required for corners
|
||||||
|
#endif // MPI_INCL_CORNERS
|
||||||
|
#endif // MPI_COMPUTE_ENABLED
|
||||||
|
|
||||||
// Wait for unpacking
|
// Wait for unpacking
|
||||||
// acSyncCommData(corner_data); // Do not rm: required for corners
|
|
||||||
acSyncCommData(edgex_data);
|
|
||||||
acSyncCommData(edgey_data);
|
|
||||||
acSyncCommData(edgez_data);
|
|
||||||
acSyncCommData(sidexy_data);
|
acSyncCommData(sidexy_data);
|
||||||
acSyncCommData(sidexz_data);
|
acSyncCommData(sidexz_data);
|
||||||
acSyncCommData(sideyz_data);
|
acSyncCommData(sideyz_data);
|
||||||
|
acSyncCommData(edgex_data);
|
||||||
|
acSyncCommData(edgey_data);
|
||||||
|
acSyncCommData(edgez_data);
|
||||||
|
#if MPI_INCL_CORNERS
|
||||||
|
acSyncCommData(corner_data); // Do not rm: required for corners
|
||||||
|
#endif // MPI_INCL_CORNERS
|
||||||
|
|
||||||
|
#if MPI_COMPUTE_ENABLED
|
||||||
{ // Front
|
{ // Front
|
||||||
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
|
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
|
||||||
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
|
const int3 m2 = m1 + (int3){nn.x, nn.y, NGHOST};
|
||||||
@@ -1466,11 +1542,13 @@ acGridIntegrate(const Stream stream, const AcReal dt)
|
|||||||
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
|
const int3 m2 = m1 + (int3){NGHOST, nn.y - 2 * NGHOST, nn.z - 2 * NGHOST};
|
||||||
acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
|
acDeviceIntegrateSubstep(device, STREAM_5, isubstep, m1, m2, dt);
|
||||||
}
|
}
|
||||||
|
#endif // MPI_COMPUTE_ENABLED
|
||||||
acDeviceSwapBuffers(device);
|
acDeviceSwapBuffers(device);
|
||||||
acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
|
|
||||||
////////////////////////////////////////////
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Does not have to be STREAM_ALL, only the streams used with
|
||||||
|
// acDeviceIntegrateSubstep (less likely to break this way though)
|
||||||
|
acDeviceSynchronizeStream(device, STREAM_ALL); // Wait until inner and outer done
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1620,82 +1698,4 @@ acGridPeriodicBoundconds(const Stream stream)
|
|||||||
acSyncCommData(sideyz_data);
|
acSyncCommData(sideyz_data);
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AcResult
|
|
||||||
acMPIReduceScal(const AcReal local_result, const ReductionType rtype, AcReal* result)
|
|
||||||
{
|
|
||||||
|
|
||||||
MPI_Op op;
|
|
||||||
if (rtype == RTYPE_MAX) {
|
|
||||||
op = MPI_MAX;
|
|
||||||
}
|
|
||||||
else if (rtype == RTYPE_MIN) {
|
|
||||||
op = MPI_MIN;
|
|
||||||
}
|
|
||||||
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
|
|
||||||
op = MPI_SUM;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
ERROR("Unrecognised rtype");
|
|
||||||
}
|
|
||||||
|
|
||||||
#if AC_DOUBLE_PRECISION == 1
|
|
||||||
MPI_Datatype datatype = MPI_DOUBLE;
|
|
||||||
#else
|
|
||||||
MPI_Datatype datatype = MPI_FLOAT;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int rank;
|
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
|
||||||
|
|
||||||
int world_size;
|
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
|
|
||||||
|
|
||||||
AcReal mpi_res;
|
|
||||||
MPI_Reduce(&local_result, &mpi_res, 1, datatype, op, 0, MPI_COMM_WORLD);
|
|
||||||
if (rank == 0){
|
|
||||||
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
|
|
||||||
const AcReal inv_n = AcReal(1.) / (grid.nn.x * grid.decomposition.x * grid.nn.y *
|
|
||||||
grid.decomposition.y * grid.nn.z * grid.decomposition.z);
|
|
||||||
mpi_res = sqrt(inv_n * mpi_res);
|
|
||||||
}
|
|
||||||
*result = mpi_res;
|
|
||||||
}
|
|
||||||
return AC_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
AcResult
|
|
||||||
acGridReduceScal(const Stream stream, const ReductionType rtype,
|
|
||||||
const VertexBufferHandle vtxbuf_handle, AcReal* result)
|
|
||||||
{
|
|
||||||
ERRCHK(grid.initialized);
|
|
||||||
|
|
||||||
const Device device = grid.device;
|
|
||||||
|
|
||||||
acGridSynchronizeStream(STREAM_ALL);
|
|
||||||
//MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
AcReal local_result;
|
|
||||||
acDeviceReduceScal(device, stream, rtype, vtxbuf_handle, &local_result);
|
|
||||||
|
|
||||||
return acMPIReduceScal(local_result, rtype, result);
|
|
||||||
}
|
|
||||||
|
|
||||||
AcResult
|
|
||||||
acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf0,
|
|
||||||
const VertexBufferHandle vtxbuf1, const VertexBufferHandle vtxbuf2, AcReal* result)
|
|
||||||
{
|
|
||||||
ERRCHK(grid.initialized);
|
|
||||||
|
|
||||||
const Device device = grid.device;
|
|
||||||
|
|
||||||
acGridSynchronizeStream(STREAM_ALL);
|
|
||||||
//MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
AcReal local_result;
|
|
||||||
acDeviceReduceVec(device, stream, rtype, vtxbuf0, vtxbuf1, vtxbuf2, &local_result);
|
|
||||||
|
|
||||||
return acMPIReduceScal(local_result, rtype, result);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // AC_MPI_ENABLED
|
#endif // AC_MPI_ENABLED
|
||||||
|
@@ -41,10 +41,12 @@ static __device__ __forceinline__ AcReal3
|
|||||||
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
|
rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
|
||||||
const AcReal3 rate_of_change, const AcReal dt)
|
const AcReal3 rate_of_change, const AcReal dt)
|
||||||
{
|
{
|
||||||
return (AcReal3){
|
return (AcReal3){rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x,
|
||||||
rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
|
dt),
|
||||||
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
|
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y,
|
||||||
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
|
dt),
|
||||||
|
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z,
|
||||||
|
dt)};
|
||||||
}
|
}
|
||||||
|
|
||||||
#define rk3(state_previous, state_current, rate_of_change, dt) \
|
#define rk3(state_previous, state_current, rate_of_change, dt) \
|
||||||
@@ -192,9 +194,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if VERBOSE_PRINTING
|
#if VERBOSE_PRINTING
|
||||||
printf(
|
printf("Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f "
|
||||||
"Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
|
"ms\n",
|
||||||
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
|
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
|
FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
|
||||||
@@ -204,6 +206,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
rk3_tpb = best_dims;
|
rk3_tpb = best_dims;
|
||||||
|
|
||||||
|
// Failed to find valid thread block dimensions
|
||||||
|
ERRCHK_ALWAYS(rk3_tpb.x * rk3_tpb.y * rk3_tpb.z > 0);
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -75,17 +75,20 @@ exp(const acComplex& val)
|
|||||||
{
|
{
|
||||||
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
|
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
|
||||||
}
|
}
|
||||||
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
|
static __device__ inline acComplex
|
||||||
|
operator*(const AcReal& a, const acComplex& b)
|
||||||
{
|
{
|
||||||
return (acComplex){a * b.x, a * b.y};
|
return (acComplex){a * b.x, a * b.y};
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
|
static __device__ inline acComplex
|
||||||
|
operator*(const acComplex& b, const AcReal& a)
|
||||||
{
|
{
|
||||||
return (acComplex){a * b.x, a * b.y};
|
return (acComplex){a * b.x, a * b.y};
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
|
static __device__ inline acComplex
|
||||||
|
operator*(const acComplex& a, const acComplex& b)
|
||||||
{
|
{
|
||||||
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
|
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
|
||||||
}
|
}
|
||||||
@@ -116,7 +119,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal
|
|||||||
|
|
||||||
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
|
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
|
||||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
|
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
|
||||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -141,7 +144,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal
|
|||||||
|
|
||||||
const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
|
const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
|
||||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
|
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
|
||||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -165,7 +168,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara
|
|||||||
|
|
||||||
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
|
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
|
||||||
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
|
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
|
||||||
cudaMemcpyHostToDevice, device->streams[stream]));
|
cudaMemcpyHostToDevice, device->streams[stream]));
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -179,10 +182,10 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) {
|
if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) {
|
||||||
fprintf(
|
fprintf(stderr,
|
||||||
stderr,
|
"WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. "
|
||||||
"WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. Skipping.\n",
|
"Skipping.\n",
|
||||||
value.x, value.y, value.z, int3param_names[param]);
|
value.x, value.y, value.z, int3param_names[param]);
|
||||||
return AC_FAILURE;
|
return AC_FAILURE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -229,7 +232,7 @@ acDeviceLoadDefaultUniforms(const Device device)
|
|||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
|
|
||||||
// clang-format off
|
// clang-format off
|
||||||
// Scalar
|
// Scalar
|
||||||
#define LOAD_DEFAULT_UNIFORM(X) acDeviceLoadScalarUniform(device, STREAM_DEFAULT, X, X##_DEFAULT_VALUE);
|
#define LOAD_DEFAULT_UNIFORM(X) acDeviceLoadScalarUniform(device, STREAM_DEFAULT, X, X##_DEFAULT_VALUE);
|
||||||
AC_FOR_USER_REAL_PARAM_TYPES(LOAD_DEFAULT_UNIFORM)
|
AC_FOR_USER_REAL_PARAM_TYPES(LOAD_DEFAULT_UNIFORM)
|
||||||
|
@@ -92,8 +92,9 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr
|
|||||||
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
|
||||||
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
|
||||||
|
|
||||||
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
|
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)],
|
||||||
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
|
src1[IDX(src_idx)],
|
||||||
|
src2[IDX(src_idx)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <ReduceFunc reduce>
|
template <ReduceFunc reduce>
|
||||||
|
@@ -38,7 +38,7 @@
|
|||||||
#define LMAGNETIC (1)
|
#define LMAGNETIC (1)
|
||||||
#define LENTROPY (1)
|
#define LENTROPY (1)
|
||||||
#define LTEMPERATURE (0)
|
#define LTEMPERATURE (0)
|
||||||
#define LFORCING (0)
|
#define LFORCING (1)
|
||||||
#define LUPWD (1)
|
#define LUPWD (1)
|
||||||
#define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter
|
#define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter
|
||||||
|
|
||||||
@@ -103,11 +103,16 @@ first_derivative(const Scalar* pencil, const Scalar inv_ds)
|
|||||||
#elif STENCIL_ORDER == 4
|
#elif STENCIL_ORDER == 4
|
||||||
const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)};
|
const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)};
|
||||||
#elif STENCIL_ORDER == 6
|
#elif STENCIL_ORDER == 6
|
||||||
const Scalar coefficients[] = {0, (Scalar)(3.0 / 4.0), (Scalar)(-3.0 / 20.0),
|
const Scalar coefficients[] = {
|
||||||
(Scalar)(1.0 / 60.0)};
|
0,
|
||||||
|
(Scalar)(3.0 / 4.0),
|
||||||
|
(Scalar)(-3.0 / 20.0),
|
||||||
|
(Scalar)(1.0 / 60.0),
|
||||||
|
};
|
||||||
#elif STENCIL_ORDER == 8
|
#elif STENCIL_ORDER == 8
|
||||||
const Scalar coefficients[] = {0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0),
|
const Scalar coefficients[] = {
|
||||||
(Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0)};
|
0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0),
|
||||||
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MID (STENCIL_ORDER / 2)
|
#define MID (STENCIL_ORDER / 2)
|
||||||
@@ -126,15 +131,23 @@ second_derivative(const Scalar* pencil, const Scalar inv_ds)
|
|||||||
#if STENCIL_ORDER == 2
|
#if STENCIL_ORDER == 2
|
||||||
const Scalar coefficients[] = {-2, 1};
|
const Scalar coefficients[] = {-2, 1};
|
||||||
#elif STENCIL_ORDER == 4
|
#elif STENCIL_ORDER == 4
|
||||||
const Scalar coefficients[] = {(Scalar)(-5.0 / 2.0), (Scalar)(4.0 / 3.0),
|
const Scalar coefficients[] = {
|
||||||
(Scalar)(-1.0 / 12.0)};
|
(Scalar)(-5.0 / 2.0),
|
||||||
|
(Scalar)(4.0 / 3.0),
|
||||||
|
(Scalar)(-1.0 / 12.0),
|
||||||
|
};
|
||||||
#elif STENCIL_ORDER == 6
|
#elif STENCIL_ORDER == 6
|
||||||
const Scalar coefficients[] = {(Scalar)(-49.0 / 18.0), (Scalar)(3.0 / 2.0),
|
const Scalar coefficients[] = {
|
||||||
(Scalar)(-3.0 / 20.0), (Scalar)(1.0 / 90.0)};
|
(Scalar)(-49.0 / 18.0),
|
||||||
|
(Scalar)(3.0 / 2.0),
|
||||||
|
(Scalar)(-3.0 / 20.0),
|
||||||
|
(Scalar)(1.0 / 90.0),
|
||||||
|
};
|
||||||
#elif STENCIL_ORDER == 8
|
#elif STENCIL_ORDER == 8
|
||||||
const Scalar coefficients[] = {(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0),
|
const Scalar coefficients[] = {
|
||||||
(Scalar)(-1.0 / 5.0), (Scalar)(8.0 / 315.0),
|
(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0), (Scalar)(-1.0 / 5.0),
|
||||||
(Scalar)(-1.0 / 560.0)};
|
(Scalar)(8.0 / 315.0), (Scalar)(-1.0 / 560.0),
|
||||||
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MID (STENCIL_ORDER / 2)
|
#define MID (STENCIL_ORDER / 2)
|
||||||
@@ -156,16 +169,27 @@ cross_derivative(const Scalar* pencil_a, const Scalar* pencil_b, const Scalar in
|
|||||||
const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)};
|
const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)};
|
||||||
#elif STENCIL_ORDER == 4
|
#elif STENCIL_ORDER == 4
|
||||||
const Scalar coefficients[] = {
|
const Scalar coefficients[] = {
|
||||||
0, (Scalar)(1.0 / 32.0),
|
0,
|
||||||
(Scalar)(1.0 / 64.0)}; // TODO correct coefficients, these are just placeholders
|
(Scalar)(1.0 / 32.0),
|
||||||
|
(Scalar)(1.0 / 64.0),
|
||||||
|
}; // TODO correct coefficients, these are just placeholders
|
||||||
#elif STENCIL_ORDER == 6
|
#elif STENCIL_ORDER == 6
|
||||||
const Scalar fac = ((Scalar)(1. / 720.));
|
const Scalar fac = ((Scalar)(1. / 720.));
|
||||||
const Scalar coefficients[] = {0 * fac, (Scalar)(270.0) * fac, (Scalar)(-27.0) * fac,
|
const Scalar coefficients[] = {
|
||||||
(Scalar)(2.0) * fac};
|
0 * fac,
|
||||||
|
(Scalar)(270.0) * fac,
|
||||||
|
(Scalar)(-27.0) * fac,
|
||||||
|
(Scalar)(2.0) * fac,
|
||||||
|
};
|
||||||
#elif STENCIL_ORDER == 8
|
#elif STENCIL_ORDER == 8
|
||||||
const Scalar fac = ((Scalar)(1. / 20160.));
|
const Scalar fac = ((Scalar)(1. / 20160.));
|
||||||
const Scalar coefficients[] = {0 * fac, (Scalar)(8064.) * fac, (Scalar)(-1008.) * fac,
|
const Scalar coefficients[] = {
|
||||||
(Scalar)(128.) * fac, (Scalar)(-9.) * fac};
|
0 * fac,
|
||||||
|
(Scalar)(8064.) * fac,
|
||||||
|
(Scalar)(-1008.) * fac,
|
||||||
|
(Scalar)(128.) * fac,
|
||||||
|
(Scalar)(-9.) * fac,
|
||||||
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MID (STENCIL_ORDER / 2)
|
#define MID (STENCIL_ORDER / 2)
|
||||||
@@ -207,14 +231,14 @@ derxy(const int i, const int j, const int k, const Scalar* arr)
|
|||||||
Scalar pencil_a[STENCIL_ORDER + 1];
|
Scalar pencil_a[STENCIL_ORDER + 1];
|
||||||
//#pragma unroll
|
//#pragma unroll
|
||||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||||
pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + offset - STENCIL_ORDER / 2,
|
pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
|
||||||
k)];
|
j + offset - STENCIL_ORDER / 2, k)];
|
||||||
|
|
||||||
Scalar pencil_b[STENCIL_ORDER + 1];
|
Scalar pencil_b[STENCIL_ORDER + 1];
|
||||||
//#pragma unroll
|
//#pragma unroll
|
||||||
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
|
||||||
pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + STENCIL_ORDER / 2 - offset,
|
pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
|
||||||
k)];
|
j + STENCIL_ORDER / 2 - offset, k)];
|
||||||
|
|
||||||
return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy));
|
return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy));
|
||||||
}
|
}
|
||||||
@@ -539,7 +563,8 @@ gradient_of_divergence(const VectorData vec)
|
|||||||
return (Vector){
|
return (Vector){
|
||||||
hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2],
|
hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2],
|
||||||
hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2],
|
hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2],
|
||||||
hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2]};
|
hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2],
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Takes uu gradients and returns S
|
// Takes uu gradients and returns S
|
||||||
@@ -805,10 +830,11 @@ forcing(int3 globalVertexIdx, Scalar dt)
|
|||||||
getInt(AC_ny) * getReal(AC_dsy),
|
getInt(AC_ny) * getReal(AC_dsy),
|
||||||
getInt(AC_nz) * getReal(AC_dsz)}; // source (origin)
|
getInt(AC_nz) * getReal(AC_dsz)}; // source (origin)
|
||||||
(void)a; // WARNING: not used
|
(void)a; // WARNING: not used
|
||||||
Vector xx = (Vector){(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
|
Vector xx = (Vector){
|
||||||
(globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
|
(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
|
||||||
(globalVertexIdx.z - getInt(AC_nz_min)) *
|
(globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
|
||||||
getReal(AC_dsz)}; // sink (current index)
|
(globalVertexIdx.z - getInt(AC_nz_min)) * getReal(AC_dsz),
|
||||||
|
}; // sink (current index)
|
||||||
const Scalar cs2 = getReal(AC_cs2_sound);
|
const Scalar cs2 = getReal(AC_cs2_sound);
|
||||||
const Scalar cs = sqrt(cs2);
|
const Scalar cs = sqrt(cs2);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user