I have inspected the branch and it complies and functions fine in my system. NOTE: I could not test MPI at the moment, but it should not prevent the merge.

Merged in mpi-to-master-merge-candidate-2020-06-01 (pull request #12)

Astaroth 2.3
This commit is contained in:
jpekkila
2020-08-19 06:54:34 +00:00
committed by Miikka Väisälä
36 changed files with 2748 additions and 601 deletions

View File

@@ -1,5 +1,8 @@
## CMake settings
cmake_minimum_required(VERSION 3.9) # Required for first-class CUDA support
# V3.9 required for first-class CUDA support
# V3.17 required for the FindCUDAToolkit package
# V3.18 required for CMAKE_CUDA_ARCHITECTURES
cmake_minimum_required(VERSION 3.18)
find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH)
find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH)
@@ -14,12 +17,19 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 11)
find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_...
cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}")
set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}")
## CUDA
# GPU, compute capability
# K40, 3.5
# K80, 3.7
# P100, 6.0
# V100, 7.0
if (NOT CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 60 70) # Default
else ()
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES}) # User-specified
endif()
string (REPLACE " " "," CUDA_COMMON_FLAGS "${COMMON_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options=${CUDA_COMMON_FLAGS}")
## Build type
if(NOT CMAKE_BUILD_TYPE)
@@ -53,7 +63,8 @@ message(STATUS "AC module dir: ${DSL_MODULE_DIR}")
file(GLOB DSL_SOURCES ${DSL_MODULE_DIR}/*
${CMAKE_SOURCE_DIR}/acc/stdlib/*)
set(DSL_HEADERS "${PROJECT_BINARY_DIR}/user_kernels.h"
"${PROJECT_BINARY_DIR}/user_defines.h")
"${PROJECT_BINARY_DIR}/user_defines.h"
"${PROJECT_BINARY_DIR}/astaroth.f90")
add_custom_command (
COMMENT "Building ACC objects ${DSL_MODULE_DIR}"
@@ -79,6 +90,11 @@ include_directories(src/common) # Common headers
include_directories(${CMAKE_BINARY_DIR}) # DSL headers
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # CUDA headers
if (MPI_ENABLED)
find_package(MPI REQUIRED)
include_directories(${MPI_CXX_INCLUDE_DIRS})
endif()
## Subdirectories
add_subdirectory(src/utils)
add_subdirectory(src/core/kernels)
@@ -89,6 +105,10 @@ if (BUILD_SAMPLES)
add_subdirectory(samples/cpptest)
add_subdirectory(samples/mpitest)
add_subdirectory(samples/benchmark)
add_subdirectory(samples/bwtest)
add_subdirectory(samples/genbenchmarkscripts)
add_subdirectory(samples/mpi_reduce_bench)
add_subdirectory(samples/fortrantest)
endif()
if (BUILD_STANDALONE)

View File

@@ -40,6 +40,7 @@ In the base directory, run
| Option | Description | Default |
|--------|-------------|---------|
| CMAKE_BUILD_TYPE | Selects the build type. Possible values: Debug, Release, RelWithDebInfo, MinSizeRel. See (CMake documentation)[https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html] for more details. | Release |
| CUDA_ARCHITECTURES | Selects CUDA architecture support. Multiple architectures delimited by `;`. See (CMake documentation)[https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html] for more details. | "60;70" |
| DOUBLE_PRECISION | Generates double precision code. | OFF |
| BUILD_SAMPLES | Builds projects in samples subdirectory. | OFF |
| BUILD_STANDALONE | Builds a standalone library for testing, benchmarking and simulation. | ON |
@@ -67,6 +68,7 @@ See `analysis/python/` directory of existing data visualization and analysis scr
* `astaroth/include/astaroth.h`: Astaroth main header. Contains the interface for accessing single- and multi-GPU layers.
* `astaroth/include/astaroth_utils.h`: Utility library header. Provides functions for performing common tasks on host, such as allocating and verifying meshes.
* `<build directory>/astaroth.f90`: Fortran interface to Astaroth. Generated when building the library.
## FAQ

View File

@@ -39,9 +39,11 @@ ASTNode* root = NULL;
// Output files
static FILE* DSLHEADER = NULL;
static FILE* CUDAHEADER = NULL;
static FILE* FHEADER = NULL;
static const char* dslheader_filename = "user_defines.h";
static const char* cudaheader_filename = "user_kernels.h";
static const char* fheader_filename = "astaroth.f90";
// Forward declaration of yyparse
int yyparse(void);
@@ -98,7 +100,8 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
['<'] = "<",
['>'] = ">",
['!'] = "!",
['.'] = "."};
['.'] = ".",
};
static const char*
translate(const int token)
@@ -261,9 +264,8 @@ traverse(const ASTNode* node)
if (typequal->token == KERNEL) {
fprintf(CUDAHEADER, "GEN_KERNEL_PARAM_BOILERPLATE");
if (node->lhs != NULL) {
fprintf(
stderr,
"Syntax error: function parameters for Kernel functions not allowed!\n");
fprintf(stderr, "Syntax error: function parameters for Kernel functions not "
"allowed!\n");
}
}
else if (typequal->token == PREPROCESSED) {
@@ -597,65 +599,173 @@ generate_preprocessed_structures(void)
}
static void
generate_header(void)
generate_headers(void)
{
// Fortran interface
const char* fortran_interface = R"(
! -*-f90-*- (for emacs) vim:set filetype=fortran: (for vim)
! Utils (see astaroth_fortran.cc for definitions)
external acupdatebuiltinparams
external acgetdevicecount
! Device interface (see astaroth_fortran.cc for definitions)
external acdevicecreate, acdevicedestroy
external acdeviceprintinfo
external acdeviceloadmeshinfo
external acdeviceloadmesh, acdevicestoremesh
external acdeviceintegratesubstep
external acdeviceperiodicboundconds
external acdeviceswapbuffers
external acdevicereducescal, acdevicereducevec
external acdevicesynchronizestream
)";
fprintf(FHEADER, "%s\n", fortran_interface);
fprintf(DSLHEADER, "#pragma once\n");
// Int params
fprintf(DSLHEADER, "#define AC_FOR_USER_INT_PARAM_TYPES(FUNC)");
int enumcounter = 0;
for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
if (symbol_table[i].type_specifier == INT && symbol_table[i].type_qualifier == UNIFORM) {
fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
enumcounter);
++enumcounter;
}
}
fprintf(DSLHEADER, "\n\n");
fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_INT_PARAMS = %d\n\n", enumcounter);
// Int3 params
fprintf(DSLHEADER, "#define AC_FOR_USER_INT3_PARAM_TYPES(FUNC)");
enumcounter = 0;
for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
if (symbol_table[i].type_specifier == INT3 && symbol_table[i].type_qualifier == UNIFORM) {
fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
enumcounter);
++enumcounter;
}
}
fprintf(DSLHEADER, "\n\n");
fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_INT3_PARAMS = %d\n\n", enumcounter);
// Scalar params
fprintf(DSLHEADER, "#define AC_FOR_USER_REAL_PARAM_TYPES(FUNC)");
enumcounter = 0;
for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
if (symbol_table[i].type_specifier == SCALAR && symbol_table[i].type_qualifier == UNIFORM) {
fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
enumcounter);
++enumcounter;
}
}
fprintf(DSLHEADER, "\n\n");
fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_REAL_PARAMS = %d\n\n", enumcounter);
// Vector params
fprintf(DSLHEADER, "#define AC_FOR_USER_REAL3_PARAM_TYPES(FUNC)");
enumcounter = 0;
for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
if (symbol_table[i].type_specifier == VECTOR && symbol_table[i].type_qualifier == UNIFORM) {
fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
enumcounter);
++enumcounter;
}
}
fprintf(DSLHEADER, "\n\n");
fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_REAL3_PARAMS = %d\n\n", enumcounter);
// Scalar fields
fprintf(DSLHEADER, "#define AC_FOR_VTXBUF_HANDLES(FUNC)");
enumcounter = 0;
for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
if (symbol_table[i].type_specifier == SCALARFIELD &&
symbol_table[i].type_qualifier == UNIFORM) {
fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
enumcounter);
++enumcounter;
}
}
fprintf(DSLHEADER, "\n\n");
fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_VTXBUF_HANDLES = %d\n\n", enumcounter);
// Scalar arrays
fprintf(DSLHEADER, "#define AC_FOR_SCALARARRAY_HANDLES(FUNC)");
enumcounter = 0;
for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
if (symbol_table[i].type_specifier == SCALARARRAY &&
symbol_table[i].type_qualifier == UNIFORM) {
fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
enumcounter);
++enumcounter;
}
}
fprintf(DSLHEADER, "\n\n");
fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_SCALARRAY_HANDLES = %d\n\n", enumcounter);
// Streams
const size_t nstreams = 20;
for (size_t i = 0; i < nstreams; ++i) {
fprintf(DSLHEADER, "#define STREAM_%lu (%lu)\n", i, i);
fprintf(FHEADER, "integer(c_int), parameter :: STREAM_%lu = %lu\n", i, i);
}
fprintf(DSLHEADER, "#define NUM_STREAMS (%lu)\n", nstreams);
fprintf(DSLHEADER, "#define STREAM_DEFAULT (STREAM_0)\n");
fprintf(DSLHEADER, "#define STREAM_ALL (NUM_STREAMS)\n");
fprintf(FHEADER, "integer(c_int), parameter :: NUM_STREAMS = %lu\n", nstreams);
fprintf(FHEADER, "integer(c_int), parameter :: STREAM_DEFAULT = STREAM_0\n");
fprintf(FHEADER, "integer(c_int), parameter :: STREAM_ALL = NUM_STREAMS\n");
fprintf(DSLHEADER, "typedef int Stream;\n\n");
// Reduction types
size_t counter = 0;
fprintf(DSLHEADER, "#define RTYPE_MAX (%lu)\n", counter);
fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MAX = %lu\n", counter);
++counter;
fprintf(DSLHEADER, "#define RTYPE_MIN (%lu)\n", counter);
fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MIN = %lu\n", counter);
++counter;
fprintf(DSLHEADER, "#define RTYPE_RMS (%lu)\n", counter);
fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_RMS = %lu\n", counter);
++counter;
fprintf(DSLHEADER, "#define RTYPE_RMS_EXP (%lu)\n", counter);
fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_RMS_EXP = %lu\n", counter);
++counter;
fprintf(DSLHEADER, "#define RTYPE_SUM (%lu)\n", counter);
fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_SUM = %lu\n", counter);
++counter;
fprintf(DSLHEADER, "typedef int ReductionType;\n");
fprintf(DSLHEADER, "#define NUM_REDUCTION_TYPES (%lu)\n", counter);
fprintf(FHEADER, "integer(c_int), parameter :: NUM_REDUCTION_TYPES = %lu\n", counter);
// Fortran structs
const char* fortran_structs = R"(
type, bind(C) :: AcMeshInfo
integer(c_int), dimension(AC_NUM_INT_PARAMS) :: int_params
integer(c_int), dimension(AC_NUM_INT3_PARAMS, 3) :: int3_params
real, dimension(AC_NUM_REAL_PARAMS) :: real_params
real, dimension(AC_NUM_REAL3_PARAMS, 3) :: real3_params
end type AcMeshInfo
)";
fprintf(FHEADER, "%s\n", fortran_structs);
}
static void
@@ -681,20 +791,21 @@ main(void)
DSLHEADER = fopen(dslheader_filename, "w+");
CUDAHEADER = fopen(cudaheader_filename, "w+");
FHEADER = fopen(fheader_filename, "w+");
assert(DSLHEADER);
assert(CUDAHEADER);
assert(FHEADER);
// Add built-in param symbols
for (size_t i = 0; i < ARRAY_SIZE(builtin_int_params); ++i) {
for (size_t i = 0; i < ARRAY_SIZE(builtin_int_params); ++i)
add_symbol(SYMBOLTYPE_OTHER, UNIFORM, INT, builtin_int_params[i]);
}
for (size_t i = 0; i < ARRAY_SIZE(builtin_int3_params); ++i) {
for (size_t i = 0; i < ARRAY_SIZE(builtin_int3_params); ++i)
add_symbol(SYMBOLTYPE_OTHER, UNIFORM, INT3, builtin_int3_params[i]);
}
// Generate
traverse(root);
generate_header();
generate_headers();
generate_preprocessed_structures();
generate_library_hooks();
@@ -703,9 +814,12 @@ main(void)
// Cleanup
fclose(DSLHEADER);
fclose(CUDAHEADER);
fclose(FHEADER);
astnode_destroy(root);
fprintf(stdout, "-- Generated %s\n", dslheader_filename);
fprintf(stdout, "-- Generated %s\n", cudaheader_filename);
fprintf(stdout, "-- Generated %s\n", fheader_filename);
return EXIT_SUCCESS;
}

View File

@@ -12,6 +12,8 @@ image: nvidia/cuda
# ==> Updating the kernel drivers by ourselves probably requires creating our own docker image.
# ===> Which might not even work since I don't know what kind of hardware we're running on (lspci was not available)
options:
max-time: 5 # Max time allowed for building (minutes)
pipelines:
# default: # Default is run at every push but we have only 500 build minutes / month so that probably wouldn't work out
custom: # Manual/scheduled building only
@@ -20,7 +22,11 @@ pipelines:
script: # Modify the commands below to build your repository.
- mkdir -p build && cd build
- apt-get update
- apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev
- apt-get install -y apt-transport-https ca-certificates gnupg software-properties-common wget
- wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
- apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
- apt-get update
- apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev gfortran
- cmake -DDSL_MODULE_DIR="acc/mhd_solver" -DBUILD_STANDALONE=ON -DBUILD_UTILS=ON -DBUILD_RT_VISUALIZATION=OFF -DBUILD_SAMPLES=ON -DDOUBLE_PRECISION=OFF -DMULTIGPU_ENABLED=ON -DMPI_ENABLED=OFF .. # Single precision
- make -j
- rm -rf *

View File

@@ -24,11 +24,11 @@ AC_bin_steps = 1000
AC_bin_save_t = 1e666
// Set to 0 if you want to run the simulation from the beginning, or just a new
// simulation. If continuing from a saved step, specify the step number here.
AC_start_step = 0
// simulation. If continuing from a saved step, specify the step number here.
AC_start_step = 0
// Maximum time in code units. If negative, there is no time limit
AC_max_time = -1.0
AC_max_time = -1.0
// Hydro
AC_cdt = 0.4
@@ -49,7 +49,7 @@ AC_forcing_magnitude = 1e-5
AC_kmin = 0.8
AC_kmax = 1.2
// Switches forcing off and accretion on
AC_switch_accretion = 0
AC_switch_accretion = 0
// Entropy
AC_cp_sound = 1.0

View File

@@ -79,7 +79,7 @@ typedef enum {
```
The API is divided into layers which differ in the level of control provided over the execution.
There are two primary layers:
There are three primary layers:
* Device layer
* Functions start with acDevice.
@@ -92,7 +92,13 @@ There are two primary layers:
* All functions are asynchronous and executed concurrently on all devices in the node.
* Subsequent functions called in the same stream (see Section #Streams and synchronization) are guaranteed to be synchronous.
Finally, a third layer is provided for convenience and backwards compatibility.
* Grid layer
* Functions start with acGrid.
* Provides control over all devices on multiple node.
* Requires MPI. `MPI_Init()` must be called before calling any acGrid functions.
* Streams are used to control concurrency the same way as on acDevice and acNode layers.
Finally, a fourth layer is provided for convenience and backwards compatibility.
* Astaroth layer (deprecated)
* Functions start with `ac` only, f.ex. acInit().
@@ -126,6 +132,12 @@ AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* co
AcResult acNodeAutoOptimize(const Node node);
```
Grid layer.
```C
AcResult acGridInit(const AcMeshInfo info);
AcResult acGridQuit(void);
```
General helper functions.
```C
size_t acVertexBufferSize(const AcMeshInfo info);
@@ -159,6 +171,7 @@ AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream,
const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices);
AcResult acGridLoadMesh(const AcMesh host_mesh, const Stream stream);
```
Storing meshes and vertex buffer to host memory.
@@ -180,6 +193,7 @@ AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices,
AcMesh* host_mesh);
AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
```
Transferring data between devices
@@ -242,6 +256,13 @@ AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionT
AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
AcResult acGridIntegrate(const Stream stream, const AcReal dt);
AcResult acGridPeriodicBoundconds(const Stream stream);
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
```
Finally, there's a library function that is automatically generated for all user-specified `Kernel`
@@ -261,10 +282,13 @@ yet completed. Therefore special care must be taken in order to ensure proper sy
Synchronization is done using `Stream` primitives, defined as
```C
typedef enum { STREAM_DEFAULT, STREAM_0, ..., STREAM_16, NUM_STREAMS } Stream;
typedef enum {STREAM_0, ..., STREAM_15, NUM_STREAMS};
#define STREAM_DEFAULT (STREAM_0)
#define STREAM_ALL (NUM_STREAMS)
```
> **Note:** There is guaranteed to be at least 16 distinct streams.
Functions queued in the same stream will be executed sequentially. If two or more consequent
functions are queued in different streams, then these functions may execute in parallel. For
additional control over streams, there is a barrier synchronization function which blocks execution
@@ -286,6 +310,7 @@ Astaroth API provides the following functions for barrier synchronization.
AcResult acSynchronize(void);
AcResult acNodeSynchronizeStream(const Node node, const Stream stream);
AcResult acDeviceSynchronizeStream(const Device device, const Stream stream);
AcResult acGridSynchronizeStream(const Stream stream);
```
## Data Synchronization
@@ -408,13 +433,16 @@ int mz = info.int_params[AC_mz];
after initialization.
### Decomposition
Grids and subgrids contain the dimensions of the the mesh decomposed to multiple devices.
### Decomposition (`acNode` layer)
> **Note:** This section describes implementation details specific to the acNode layer. The acGrid layer is not related to the `GridDims` structure described here.
`GridDims` contains the dimensions of the the mesh decomposed to multiple devices.
```C
typedef struct {
int3 m; // Size of the simulation domain (includes the ghost zones)
int3 n; // Size of the computational domain (without ghost zones)
} Grid;
} GridDims;
```
As briefly discussed in the section Data synchronization, a `Mesh` is distributed to multiple
@@ -427,8 +455,6 @@ Let *i* be the device id. The portion of the halos shared by neighboring devices
`acNodeSynchronizeVertexBuffer` and `acNodeSynchronizeMesh` communicate these shared areas among
the devices in the node.
> **Note:** The decomposition scheme is subject to change.
# Astaroth Domain-Specific Language
We designed the Astaroth Domain-specific Language (DSL) for expressing stencil computations in a
@@ -478,8 +504,8 @@ In addition to basic datatypes in C/C++/CUDA, such as int and int3, we provide t
| Complex | A tuple of two 32- or 64-bit floating-point numbers. The real part is stored in member .x, while the imaginary component is in .y. Basic operations, such as multiplication, are defined as built-in functions. | std::complex<float> or std::complex<double> |
| Matrix | A tuple of three Vectors. Is stored in column-major order, f.ex. Matrix[i][j] is the component on row i, column j. (TODO recheck specs.) | float3[3] or double3[3] |
| ScalarArray | A one-dimensional array of Scalars stored in device memory. Given mesh dimensions (mx, my, mz), consists of max(mx, max(my, mz)) elements. | float[] or double[] |
| ScalarField | An abstraction of a three-dimensional scalar field stored in device memory. Is implemented as a handle to a one-dimensional Scalar array consisting of input and output segments. The data is stored linearly in order i + j * mx + k * mx * my, given some vertex index (i, j, k) and mesh constisting of (mx, my, mz) vertices. | float[2][] or double[2][] |
| VectorField | An abstraction of a three-dimensional vector field stored in device memory. Is implemented as a tuple of three ScalarField handles. | Three distinct float[2][] or double[2][] arrays for each component. Stored as a structure of arrays. |
| ScalarField | A three-dimensional scalar field stored in row-wise scan order where coordinates `(i, j, k)` correspond to a one-dimensional index `i + j * mx + k * mx * my`. Consists of two buffers, one used for input and another one for output. | Two float[] or double[] arrays |
| VectorField | A three-dimensional vector field. Consists of three `ScalarFields`. | Three `ScalarFields` stored contiguously in memory as a structure of arrays |
## Precision

View File

@@ -38,7 +38,7 @@ PROJECT_NAME = "Astaroth"
# could be handy for archiving the generated documentation or if some version
# control system is used.
PROJECT_NUMBER = 2.2
PROJECT_NUMBER = 2.3
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a

View File

@@ -50,6 +50,8 @@ typedef struct {
typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;
/*
// Deprecated, defined during code generation
typedef enum {
RTYPE_MAX,
RTYPE_MIN,
@@ -58,29 +60,7 @@ typedef enum {
RTYPE_SUM,
NUM_REDUCTION_TYPES
} ReductionType;
typedef enum {
STREAM_DEFAULT,
STREAM_0,
STREAM_1,
STREAM_2,
STREAM_3,
STREAM_4,
STREAM_5,
STREAM_6,
STREAM_7,
STREAM_8,
STREAM_9,
STREAM_10,
STREAM_11,
STREAM_12,
STREAM_13,
STREAM_14,
STREAM_15,
STREAM_16,
NUM_STREAMS
} Stream;
#define STREAM_ALL (NUM_STREAMS)
*/
#define AC_GEN_ID(X) X,
typedef enum {
@@ -320,6 +300,15 @@ AcResult acGridIntegrate(const Stream stream, const AcReal dt);
/** */
AcResult acGridPeriodicBoundconds(const Stream stream);
/** TODO */
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** TODO */
AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
#endif // AC_MPI_ENABLED
/*
@@ -573,6 +562,20 @@ AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const
/** */
AcResult acDeviceRunMPITest(void);
/*
* =============================================================================
* Helper functions
* =============================================================================
*/
/** Updates the built-in parameters based on nx, ny and nz */
AcResult acUpdateBuiltinParams(AcMeshInfo* config);
/** Creates a mesh stored in host memory */
AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);
/** Destroys a mesh stored in host memory */
AcResult acMeshDestroy(AcMesh* mesh);
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -29,17 +29,57 @@
extern "C" {
#endif
#include <stdbool.h>
#define ERROR_LABEL_LENGTH 30
typedef struct {
char label[ERROR_LABEL_LENGTH];
VertexBufferHandle handle;
AcReal model;
AcReal candidate;
long double abs_error;
long double ulp_error;
long double rel_error;
AcReal maximum_magnitude;
AcReal minimum_magnitude;
} Error;
typedef struct {
char label[ERROR_LABEL_LENGTH];
VertexBufferHandle vtxbuf;
ReductionType rtype;
AcReal candidate;
} AcScalReductionTestCase;
typedef struct {
char label[ERROR_LABEL_LENGTH];
VertexBufferHandle a;
VertexBufferHandle b;
VertexBufferHandle c;
ReductionType rtype;
AcReal candidate;
} AcVecReductionTestCase;
/** TODO comment */
Error acGetError(AcReal model, AcReal candidate);
/** TODO comment */
bool printErrorToScreen(const Error error);
/** Loads data from the config file */
AcResult acLoadConfig(const char* config_path, AcMeshInfo* config);
/** Updates the built-in parameters based on nx, ny and nz */
AcResult acUpdateBuiltinParams(AcMeshInfo* config);
/** */
AcScalReductionTestCase acCreateScalReductionTestCase(const char* label,
const VertexBufferHandle vtxbuf,
const ReductionType rtype);
/** */
AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);
/** */
AcResult acMeshDestroy(AcMesh* mesh);
AcVecReductionTestCase acCreateVecReductionTestCase(const char* label, const VertexBufferHandle a,
const VertexBufferHandle b,
const VertexBufferHandle c,
const ReductionType rtype);
/** */
AcResult acMeshSet(const AcReal value, AcMesh* mesh);
@@ -56,9 +96,24 @@ AcResult acMeshClear(AcMesh* mesh);
/** */
AcResult acModelIntegrateStep(AcMesh mesh, const AcReal dt);
/** TODO */
AcReal acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a);
/** TODO */
AcReal acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a,
const VertexBufferHandle b, const VertexBufferHandle c);
/** */
AcResult acVerifyMesh(const AcMesh model, const AcMesh candidate);
/** */
AcResult acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases,
const size_t numCases);
/** */
AcResult acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases,
const size_t numCases);
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -39,8 +39,47 @@ typedef enum {
NUM_TESTS,
} TestType;
#include <stdint.h>
typedef struct {
uint64_t x, y, z;
} uint3_64;
static uint3_64
operator+(const uint3_64& a, const uint3_64& b)
{
return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
}
static uint3_64
morton3D(const uint64_t pid)
{
uint64_t i, j, k;
i = j = k = 0;
for (int bit = 0; bit <= 21; ++bit) {
const uint64_t mask = 0x1l << 3 * bit;
k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
}
return (uint3_64){i, j, k};
}
static uint3_64
decompose(const uint64_t target)
{
// This is just so beautifully elegant. Complex and efficient decomposition
// in just one line of code.
uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};
ERRCHK_ALWAYS(p.x * p.y * p.z == target);
return p;
}
int
main(void)
main(int argc, char** argv)
{
MPI_Init(NULL, NULL);
int nprocs, pid;
@@ -51,9 +90,30 @@ main(void)
AcMeshInfo info;
acLoadConfig(AC_DEFAULT_CONFIG, &info);
if (argc > 1) {
if (argc == 4) {
const int nx = atoi(argv[1]);
const int ny = atoi(argv[2]);
const int nz = atoi(argv[3]);
info.int_params[AC_nx] = nx;
info.int_params[AC_ny] = ny;
info.int_params[AC_nz] = nz;
acUpdateBuiltinParams(&info);
printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz);
}
else {
fprintf(stderr, "Could not parse arguments. Usage: ./benchmark <nx> <ny> <nz>.\n");
exit(EXIT_FAILURE);
}
}
const TestType test = TEST_STRONG_SCALING;
if (test == TEST_WEAK_SCALING)
info.int_params[AC_nz] *= nprocs;
if (test == TEST_WEAK_SCALING) {
uint3_64 decomp = decompose(nprocs);
info.int_params[AC_nx] *= decomp.x;
info.int_params[AC_ny] *= decomp.y;
info.int_params[AC_nz] *= decomp.z;
}
/*
AcMesh model, candidate;
@@ -89,19 +149,45 @@ main(void)
}
}*/
/*
// Basic
const size_t num_iters = 100;
// Warmup
for (size_t i = 0; i < 10; ++i)
for (size_t i = 0; i < num_iters / 10; ++i)
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
// Benchmark
Timer t;
const AcReal dt = FLT_EPSILON;
const size_t num_iters = 100;
const double nth_percentile = 0.90;
const AcReal dt = FLT_EPSILON;
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
acGridSynchronizeStream(STREAM_ALL);
for (size_t i = 0; i < num_iters; ++i)
acGridIntegrate(STREAM_DEFAULT, dt);
acGridSynchronizeStream(STREAM_ALL);
if (!pid)
timer_diff_print(t);
acGridSynchronizeStream(STREAM_ALL);
*/
// Percentiles
const size_t num_iters = 1000;
const double nth_percentile = 0.90;
std::vector<double> results; // ms
results.reserve(num_iters);
// Warmup
for (size_t i = 0; i < num_iters / 10; ++i)
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
// Benchmark
Timer t;
const AcReal dt = FLT_EPSILON;
for (size_t i = 0; i < num_iters; ++i) {
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
@@ -109,9 +195,9 @@ main(void)
acGridIntegrate(STREAM_DEFAULT, dt);
acGridSynchronizeStream(STREAM_ALL);
results.push_back(timer_diff_nsec(t) / 1e6);
acGridSynchronizeStream(STREAM_ALL);
}
// Write benchmark to file
if (!pid) {
std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; });
@@ -121,22 +207,58 @@ main(void)
results[nth_percentile * num_iters], 100 * nth_percentile);
char path[4096] = "";
if (test == TEST_STRONG_SCALING)
strncpy(path, "strong_scaling.csv", sizeof(path));
else if (test == TEST_WEAK_SCALING)
strncpy(path, "weak_scaling.csv", sizeof(path));
else
ERROR("Invalid test type");
sprintf(path, "%s_%d.csv", test == TEST_STRONG_SCALING ? "strong" : "weak", nprocs);
FILE* fp = fopen(path, "a");
ERRCHK_ALWAYS(fp);
// Format
// nprocs, measured (ms)
fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
// nprocs, min, 50th perc, 90th perc, max
fprintf(fp, "%d, %g, %g, %g, %g\n", nprocs, results[0], results[0.5 * num_iters], results[nth_percentile * num_iters], results[num_iters-1]);
fclose(fp);
}
/*
const size_t num_iters = 1000;
const double nth_percentile = 0.90;
std::vector<double> results; // ms
results.reserve(num_iters);
for (size_t i = 0; i < num_iters; ++i) {
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
acGridSynchronizeStream(STREAM_ALL);
acGridIntegrate(STREAM_DEFAULT, dt);
acGridSynchronizeStream(STREAM_ALL);
results.push_back(timer_diff_nsec(t) / 1e6);
}
// Write benchmark to file
if (!pid) {
std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; });
fprintf(stdout,
"Integration step time %g ms (%gth "
"percentile)--------------------------------------\n",
results[nth_percentile * num_iters], 100 * nth_percentile);
char path[4096] = "";
if (test == TEST_STRONG_SCALING)
strncpy(path, "strong_scaling.csv", sizeof(path));
else if (test == TEST_WEAK_SCALING)
strncpy(path, "weak_scaling.csv", sizeof(path));
else
ERROR("Invalid test type");
FILE* fp = fopen(path, "a");
ERRCHK_ALWAYS(fp);
// Format
// nprocs, measured (ms)
fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);
fclose(fp);
}*/
acGridQuit();
MPI_Finalize();
return EXIT_SUCCESS;

View File

@@ -0,0 +1,9 @@
cmake_minimum_required(VERSION 3.17) # Required for moder CUDA::cudart linking
find_package(MPI)
find_package(OpenMP)
find_package(CUDAToolkit)
add_executable(bwtest main.c)
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver)
target_compile_options(bwtest PRIVATE -O3)

525
samples/bwtest/main.c Normal file
View File

@@ -0,0 +1,525 @@
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cuda.h> // CUDA driver API
#include <cuda_runtime_api.h>
#include "timer_hires.h" // From src/common
//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)
#define errchk(x) \
{ \
if (!(x)) { \
fprintf(stderr, "errchk(%s) failed", #x); \
assert(x); \
} \
}
/*
Findings:
- MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly
- Need to use cudaMalloc for intranode comm for P2P to trigger with MPI
- For internode one should use pinned memory (RDMA is staged through pinned, gives full
network speed iff pinned)
- Both the sending and receiving arrays must be pinned to see performance improvement
in internode comm
*/
static uint8_t*
allocHost(const size_t bytes)
{
uint8_t* arr = malloc(bytes);
errchk(arr);
return arr;
}
static void
freeHost(uint8_t* arr)
{
free(arr);
}
static uint8_t*
allocDevice(const size_t bytes)
{
uint8_t* arr;
// Standard (20 GiB/s internode, 85 GiB/s intranode)
const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
// Unified mem (5 GiB/s internode, 6 GiB/s intranode)
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
// const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
errchk(retval == cudaSuccess);
return arr;
}
static uint8_t*
allocDevicePinned(const size_t bytes)
{
#define USE_CUDA_DRIVER_PINNING (1)
#if USE_CUDA_DRIVER_PINNING
uint8_t* arr = allocDevice(bytes);
unsigned int flag = 1;
CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
(CUdeviceptr)arr);
errchk(retval == CUDA_SUCCESS);
return arr;
#else
uint8_t* arr;
// Standard (20 GiB/s internode, 85 GiB/s intranode)
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
// Unified mem (5 GiB/s internode, 6 GiB/s intranode)
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
errchk(retval == cudaSuccess);
return arr;
#endif
}
/*
static uint8_t*
allocDevicePinned(const size_t bytes)
{
uint8_t* arr;
// Standard (20 GiB/s internode, 85 GiB/s intranode)
// const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
// Unified mem (5 GiB/s internode, 6 GiB/s intranode)
// const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
// Pinned (40 GiB/s internode, 10 GiB/s intranode)
const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
errchk(retval == cudaSuccess);
return arr;
}*/
static void
freeDevice(uint8_t* arr)
{
cudaFree(arr);
}
static void
sendrecv_blocking(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;
if (!pid) {
MPI_Status status;
MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
}
else {
MPI_Status status;
MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
}
}
static void
sendrecv_nonblocking(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;
MPI_Request recv_request, send_request;
MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &recv_request);
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD, &send_request);
MPI_Status status;
MPI_Wait(&recv_request, &status);
MPI_Wait(&send_request, &status);
}
static void
sendrecv_twoway(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int nfront = (pid + 1) % nprocs;
int nback = (((pid - 1) % nprocs) + nprocs) % nprocs;
MPI_Status status;
MPI_Sendrecv(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, dst, BLOCK_SIZE, MPI_BYTE, nback, nback,
MPI_COMM_WORLD, &status);
}
static void
sendrecv_nonblocking_multiple(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Request recv_requests[nprocs], send_requests[nprocs];
for (int i = 1; i < nprocs; ++i) {
int nfront = (pid + i) % nprocs;
int nback = (((pid - i) % nprocs) + nprocs) % nprocs;
MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
}
for (int i = 1; i < nprocs; ++i) {
MPI_Status status;
MPI_Wait(&recv_requests[i], &status);
MPI_Wait(&send_requests[i], &status);
}
}
/*
static void
sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Request send_requests[nprocs];
for (int i = 1; i < nprocs; ++i) {
int nfront = (pid + i) % nprocs;
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
}
static bool error_shown = false;
if (!pid && !error_shown) {
fprintf(stderr, "\tWARNING: make sure you init MPI_Init_thread for OpenMP support (no "
"supported on puhti atm "
"2020-04-05\n");
error_shown = true;
}
#pragma omp parallel for
for (int i = 1; i < nprocs; ++i) {
int nback = (((pid - i) % nprocs) + nprocs) % nprocs;
MPI_Status status;
MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &status);
}
for (int i = 1; i < nprocs; ++i) {
MPI_Status status;
MPI_Wait(&send_requests[i], &status);
}
}
*/
static void
sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
{
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
static uint8_t* src_pinned = NULL;
static uint8_t* dst_pinned = NULL;
if (!src_pinned)
src_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed
if (!dst_pinned)
dst_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed
int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);
MPI_Request recv_requests[nprocs], send_requests[nprocs];
for (int i = 1; i < nprocs; ++i) {
int nfront = (pid + i) % nprocs;
int nback = (((pid - i) % nprocs) + nprocs) % nprocs;
if (nback / devices_per_node != pid / devices_per_node) // Not on the same node
MPI_Irecv(dst_pinned, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD,
&recv_requests[i]);
else
MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);
if (nfront / devices_per_node != pid / devices_per_node) // Not on the same node
MPI_Isend(src_pinned, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD,
&send_requests[i]);
else
MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
}
for (int i = 1; i < nprocs; ++i) {
MPI_Status status;
MPI_Wait(&recv_requests[i], &status);
MPI_Wait(&send_requests[i], &status);
}
}
static void
send_d2h(uint8_t* src, uint8_t* dst)
{
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost);
}
static void
send_h2d(uint8_t* src, uint8_t* dst)
{
cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
}
static void
sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
{
cudaStream_t d2h, h2d;
cudaStreamCreate(&d2h);
cudaStreamCreate(&h2d);
cudaMemcpyAsync(hdst, dsrc, BLOCK_SIZE, cudaMemcpyDeviceToHost, d2h);
cudaMemcpyAsync(ddst, hsrc, BLOCK_SIZE, cudaMemcpyHostToDevice, h2d);
cudaStreamSynchronize(d2h);
cudaStreamSynchronize(h2d);
cudaStreamDestroy(d2h);
cudaStreamDestroy(h2d);
}
#define PRINT \
if (!pid) \
printf
static void
measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*), uint8_t* src,
uint8_t* dst)
{
const size_t num_samples = 100;
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
PRINT("%s\n", msg);
MPI_Barrier(MPI_COMM_WORLD);
PRINT("\tWarming up... ");
for (size_t i = 0; i < num_samples / 10; ++i)
sendrecv(src, dst);
MPI_Barrier(MPI_COMM_WORLD);
PRINT("Done\n");
PRINT("\tBandwidth... ");
fflush(stdout);
Timer t;
MPI_Barrier(MPI_COMM_WORLD);
timer_reset(&t);
MPI_Barrier(MPI_COMM_WORLD);
for (size_t i = 0; i < num_samples; ++i)
sendrecv(src, dst);
MPI_Barrier(MPI_COMM_WORLD);
const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
MPI_Barrier(MPI_COMM_WORLD);
}
static void
measurebw2(const char* msg, const size_t bytes,
void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
uint8_t* hsrc, uint8_t* ddst)
{
const size_t num_samples = 100;
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
PRINT("%s\n", msg);
MPI_Barrier(MPI_COMM_WORLD);
PRINT("\tWarming up... ");
for (size_t i = 0; i < num_samples / 10; ++i)
sendrecv(dsrc, hdst, hsrc, ddst);
MPI_Barrier(MPI_COMM_WORLD);
PRINT("Done\n");
PRINT("\tBandwidth... ");
fflush(stdout);
Timer t;
MPI_Barrier(MPI_COMM_WORLD);
timer_reset(&t);
MPI_Barrier(MPI_COMM_WORLD);
for (size_t i = 0; i < num_samples; ++i)
sendrecv(dsrc, hdst, hsrc, ddst);
MPI_Barrier(MPI_COMM_WORLD);
const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
MPI_Barrier(MPI_COMM_WORLD);
}
int
main(void)
{
MPI_Init(NULL, NULL);
// int provided;
// MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
// errchk(provided >= MPI_THREAD_MULTIPLE);
// Disable stdout buffering
setbuf(stdout, NULL);
int pid, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
errchk(nprocs >= 2); // Require at least one neighbor
MPI_Barrier(MPI_COMM_WORLD);
if (!pid) {
printf("Do we have threads? The following should not be ordered (unless very lucky)\n");
#pragma omp parallel for
for (int i = 0; i < 10; ++i)
printf("%d, ", i);
printf("\n");
}
MPI_Barrier(MPI_COMM_WORLD);
int devices_per_node = -1;
cudaGetDeviceCount(&devices_per_node);
const int device_id = pid % devices_per_node;
cudaSetDevice(device_id);
printf("Process %d of %d running.\n", pid, nprocs);
MPI_Barrier(MPI_COMM_WORLD);
PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));
#if 1
{
uint8_t* src = allocHost(BLOCK_SIZE);
uint8_t* dst = allocHost(BLOCK_SIZE);
measurebw("Unidirectional bandwidth, blocking (Host)", //
2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
measurebw("Bidirectional bandwidth, async (Host)", //
2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
measurebw("Bidirectional bandwidth, twoway (Host)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Host)", //
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
// measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
// 2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
freeHost(src);
freeHost(dst);
}
PRINT("\n------------------------\n");
{
uint8_t* src = allocDevice(BLOCK_SIZE);
uint8_t* dst = allocDevice(BLOCK_SIZE);
measurebw("Unidirectional bandwidth, blocking (Device)", //
2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
measurebw("Bidirectional bandwidth, async (Device)", //
2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
measurebw("Bidirectional bandwidth, twoway (Device)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device)", //
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
// measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
// 2 * (nprocs-1) *BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
dst);
freeDevice(src);
freeDevice(dst);
}
PRINT("\n------------------------\n");
{
uint8_t* src = allocDevicePinned(BLOCK_SIZE);
uint8_t* dst = allocDevicePinned(BLOCK_SIZE);
measurebw("Unidirectional bandwidth, blocking (Device, pinned)", //
2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
measurebw("Bidirectional bandwidth, async (Device, pinned)", //
2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
freeDevice(src);
freeDevice(dst);
}
PRINT("\n------------------------\n");
{
uint8_t* hsrc = allocHost(BLOCK_SIZE);
uint8_t* hdst = allocHost(BLOCK_SIZE);
uint8_t* dsrc = allocDevice(BLOCK_SIZE);
uint8_t* ddst = allocDevice(BLOCK_SIZE);
measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);
measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
ddst);
freeDevice(dsrc);
freeDevice(ddst);
freeHost(hsrc);
freeHost(hdst);
}
PRINT("\n------------------------\n");
{
uint8_t* hsrc = allocHost(BLOCK_SIZE);
uint8_t* hdst = allocHost(BLOCK_SIZE);
uint8_t* dsrc = allocDevicePinned(BLOCK_SIZE);
uint8_t* ddst = allocDevicePinned(BLOCK_SIZE);
measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);
measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
hsrc, ddst);
freeDevice(dsrc);
freeDevice(ddst);
freeHost(hsrc);
freeHost(hdst);
}
PRINT("\n------------------------\n");
#else
{ // Final run for easy identification with the profiler
uint8_t* src = allocDevice(BLOCK_SIZE);
uint8_t* dst = allocDevice(BLOCK_SIZE);
measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);
freeDevice(src);
freeDevice(dst);
}
#endif
MPI_Finalize();
return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,4 @@
enable_language(Fortran)
add_executable(fortrantest main.f90)
target_link_libraries(fortrantest astaroth_core)

View File

@@ -0,0 +1,23 @@
program pc
use, intrinsic :: iso_c_binding
implicit none
include "astaroth.f90"
type(AcMeshInfo) :: info
type(c_ptr) :: device
print *, "Num int params"
print *, AC_NUM_INT_PARAMS
! Setup config
info%int_params(AC_nx + 1) = 128
info%int_params(AC_ny + 1) = 128
info%int_params(AC_nz + 1) = 128
call acupdatebuiltinparams(info)
call acdevicecreate(0, info, device)
call acdeviceprintinfo(device)
call acdevicedestroy(device)
end program

View File

@@ -0,0 +1,8 @@
add_executable(genbenchmarkscripts main.c)
add_custom_command(
TARGET genbenchmarkscripts POST_BUILD
COMMAND genbenchmarkscripts
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Generating benchmark scripts"
)

View File

@@ -0,0 +1,62 @@
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
int
main(void)
{
const int max_nprocs = 128;
for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
char filename[4096];
sprintf(filename, "benchmark_%d.sh", nprocs);
FILE* fp = fopen(filename, "w");
assert(fp);
// Boilerplate
fprintf(fp, "#!/bin/bash\n");
fprintf(fp, "#BATCH --job-name=astaroth\n");
fprintf(fp, "#SBATCH --account=project_2000403\n");
fprintf(fp, "#SBATCH --time=00:14:59\n");
fprintf(fp, "#SBATCH --mem=32000\n");
fprintf(fp, "#SBATCH --partition=gpu\n");
fprintf(fp, "#SBATCH --cpus-per-task=10\n");
// nprocs, nodes, gpus
const int max_gpus_per_node = 4;
const int gpus_per_node = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
const int nodes = (int)ceil((double)nprocs / max_gpus_per_node);
fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
fprintf(fp, "#SBATCH -n %d\n", nprocs);
fprintf(fp, "#SBATCH -N %d\n", nodes);
//fprintf(fp, "#SBATCH --exclusive\n");
if (nprocs > 4)
fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");
// Modules
// OpenMPI
fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi nccl\n");
// HPCX
//fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");
// Profile and run
//fprintf(fp, "mkdir -p profile_%d\n", nprocs);
const int nx = 256; // max size 1792;
const int ny = nx;
const int nz = nx;
/*
fprintf(fp,
//"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
//"%d\n",
"srun ./benchmark %d %d %d\n", nx, ny, nz);
*/
fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);
fclose(fp);
}
return EXIT_SUCCESS;
}

View File

@@ -0,0 +1,3 @@
## benchmark
add_executable(mpi_reduce_bench main.cc)
target_link_libraries(mpi_reduce_bench astaroth_core astaroth_utils)

View File

@@ -0,0 +1,169 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
Running: benchmark -np <num processes> <executable>
*/
#include "astaroth.h"
#include "astaroth_utils.h"
#include "errchk.h"
#include "timer_hires.h"
#if AC_MPI_ENABLED
#include <mpi.h>
#include <algorithm>
#include <string.h>
#include <string>
#include <vector>
#include <chrono>
#include <ctime>
int
main(int argc, char** argv)
{
MPI_Init(NULL, NULL);
int nprocs, pid;
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
// CPU alloc
AcMeshInfo info;
acLoadConfig(AC_DEFAULT_CONFIG, &info);
char* benchmark_label;
if (argc > 1){
benchmark_label = argv[1];
} else {
auto timestamp = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
benchmark_label = std::ctime(&timestamp);
benchmark_label[strcspn(benchmark_label, "\n")] = '\0';
}
//clang-format off
std::vector<AcScalReductionTestCase> scalarReductionTests {
acCreateScalReductionTestCase("Scalar MAX" , VTXBUF_UUX, RTYPE_MAX),
acCreateScalReductionTestCase("Scalar MIN" , VTXBUF_UUX, RTYPE_MIN),
acCreateScalReductionTestCase("Scalar RMS" , VTXBUF_UUX, RTYPE_RMS),
acCreateScalReductionTestCase("Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP),
acCreateScalReductionTestCase("Scalar SUM" , VTXBUF_UUX, RTYPE_SUM)
};
std::vector<AcVecReductionTestCase> vectorReductionTests {
acCreateVecReductionTestCase("Vector MAX" , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
acCreateVecReductionTestCase("Vector MIN" , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
acCreateVecReductionTestCase("Vector RMS" , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS_EXP),
acCreateVecReductionTestCase("Vector SUM" , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM)
};
//clang-format on
// GPU alloc & compute
acGridInit(info);
const size_t num_iters = 100;
const double nth_percentile = 0.90;
std::vector<double> results; // ms
Timer t;
// Scalar benchmarks
for (auto& testCase : scalarReductionTests) {
results.clear();
results.reserve(num_iters);
for (size_t i = 0; i < num_iters; ++i) {
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
acGridSynchronizeStream(STREAM_ALL);
acGridReduceScal(STREAM_DEFAULT, testCase.rtype, testCase.vtxbuf, &testCase.candidate);
acGridSynchronizeStream(STREAM_ALL);
results.push_back(timer_diff_nsec(t) / 1e6);
acGridSynchronizeStream(STREAM_ALL);
}
if (!pid) {
std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; });
fprintf(stdout,
"Reduction time %g ms (%gth "
"percentile)--------------------------------------\n",
results[nth_percentile * num_iters], 100 * nth_percentile);
char path[4096] = "mpi_reduction_benchmark.csv";
FILE* fp = fopen(path, "a");
ERRCHK_ALWAYS(fp);
// Format
// benchmark label, test label, nprocs, measured (ms)
fprintf(fp, "\"%s\",\"%s\", %d, %g\n", benchmark_label, testCase.label, nprocs, results[nth_percentile * num_iters]);
fclose(fp);
}
}
// Vector benchmarks
for (auto& testCase : vectorReductionTests) {
results.clear();
results.reserve(num_iters);
for (size_t i = 0; i < num_iters; ++i) {
acGridSynchronizeStream(STREAM_ALL);
timer_reset(&t);
acGridSynchronizeStream(STREAM_ALL);
acGridReduceVec(STREAM_DEFAULT, testCase.rtype, testCase.a, testCase.b, testCase.c, &testCase.candidate);
acGridSynchronizeStream(STREAM_ALL);
results.push_back(timer_diff_nsec(t) / 1e6);
acGridSynchronizeStream(STREAM_ALL);
}
if (!pid) {
std::sort(results.begin(), results.end(),
[](const double& a, const double& b) { return a < b; });
fprintf(stdout,
"Reduction time %g ms (%gth "
"percentile)--------------------------------------\n",
results[nth_percentile * num_iters], 100 * nth_percentile);
char path[4096] = "mpi_reduction_benchmark.csv";
FILE* fp = fopen(path, "a");
ERRCHK_ALWAYS(fp);
// Format
// benchmark label, test label, nprocs, measured (ms)
fprintf(fp, "\"%s\",\"%s\", %d, %g\n", benchmark_label, testCase.label, nprocs, results[nth_percentile * num_iters]);
fclose(fp);
}
}
acGridQuit();
MPI_Finalize();
return EXIT_SUCCESS;
}
#else
int
main(void)
{
printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with "
"cmake -DMPI_ENABLED=ON .. to enable.\n");
return EXIT_FAILURE;
}
#endif // AC_MPI_ENABLES

View File

@@ -0,0 +1,84 @@
#!/bin/bash
#defaults
default_num_procs=8
default_num_nodes=2
default_partition=gpu
num_procs=$default_num_procs
num_nodes=$default_num_nodes
partition=$default_partition
script_name=$0
print_usage(){
echo "Usage: $script_name [Options]"
echo " Runs ./mpi_reduce_bench, which will write benchmark results to a csv file"
echo " Remember to run this script from your build directory"
echo " The benchmarks are submitted with sbatch, unless the -i option is passed"
echo "Options:"
echo " -n <num_procs>"
echo " number of tasks for slurm, default=$default_num_procs"
echo " -p <partition>"
echo " which partition to use for slurm, default=$default_partition"
echo " -t <tag>"
echo " A benchmark tag that will be added to the mpi_reduction_benchmark.csv file"
echo " By default the current git HEAD short hash will be used as a tag"
echo " -i"
echo " Run the benchmark interactively with srun instead of sbatch"
echo " -h"
echo " Print this message"
}
while getopts :n:t:p:ih opt
do
case "$opt" in
n)
num_procs=$OPTARG
num_nodes=$(( 1 + ($num_procs - 1)/4))
;;
t)
benchmark_label=$OPTARG
;;
i)
interactively=1
;;
p)
partition=$OPTARG
;;
h)
print_usage
exit 0
;;
?)
print_usage
exit 1
esac
done
if [ -z "$benchmark_label" ]
then
benchmark_label=$(git rev-parse --short HEAD)
fi
set -x
exit 0
if [ -z "$interactively"]
then
sbatch <<EOF
#!/bin/sh
#BATCH --job-name=astaroth
#SBATCH --account=project_2000403
#SBATCH --time=00:14:59
#SBATCH --mem=48000
#SBATCH --partition=${partition}
#SBATCH --gres=gpu:v100:4
#SBATCH -n ${num_procs}
#SBATCH -N ${num_nodes}
srun ./mpi_reduce_bench ${benchmark_label}
EOF
else
srun --account=project_2000403 --gres=gpu:v100:4 --mem=48000 -t 00:14:59 -p ${partition} -n ${num_procs} -N ${num_nodes} ./mpi_reduce_bench ${benchmark_label}
fi

View File

@@ -25,6 +25,7 @@
#if AC_MPI_ENABLED
#include <mpi.h>
#include <vector>
int
main(void)
@@ -48,24 +49,65 @@ main(void)
// GPU alloc & compute
acGridInit(info);
acGridLoadMesh(model, STREAM_DEFAULT);
// INTEGRATION TESTS START ---------------------------------------------------------------------
acGridLoadMesh(model, STREAM_DEFAULT);
acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
acGridPeriodicBoundconds(STREAM_DEFAULT);
acGridStoreMesh(STREAM_DEFAULT, &candidate);
acGridQuit();
// Verify
if (pid == 0) {
acModelIntegrateStep(model, FLT_EPSILON);
acMeshApplyPeriodicBounds(&model);
acVerifyMesh(model, candidate);
}
// INTEGRATION TESTS END -----------------------------------------------------------------------
// REDUCTION TESTS START -----------------------------------------------------------------------
acGridLoadMesh(model, STREAM_DEFAULT);
std::vector<AcScalReductionTestCase> scalarReductionTests{
acCreateScalReductionTestCase("Scalar MAX", VTXBUF_UUX, RTYPE_MAX),
acCreateScalReductionTestCase("Scalar MIN", VTXBUF_UUX, RTYPE_MIN),
/*
acCreateScalReductionTestCase("Scalar RMS", VTXBUF_UUX, RTYPE_RMS),
acCreateScalReductionTestCase("Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP),
acCreateScalReductionTestCase("Scalar SUM", VTXBUF_UUX, RTYPE_SUM),
*/
};
std::vector<AcVecReductionTestCase> vectorReductionTests{
acCreateVecReductionTestCase("Vector MAX", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
acCreateVecReductionTestCase("Vector MIN", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
/*
acCreateVecReductionTestCase("Vector RMS", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ,
RTYPE_RMS_EXP),
acCreateVecReductionTestCase("Vector SUM", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM),
*/
};
// False positives due to too strict error bounds, skip the tests until we can determine a
// proper error bound
fprintf(stderr, "WARNING: RTYPE_RMS, RTYPE_RMS_EXP, and RTYPE_SUM tests skipped\n");
for (auto& testCase : scalarReductionTests) {
acGridReduceScal(STREAM_DEFAULT, testCase.rtype, testCase.vtxbuf, &testCase.candidate);
}
for (auto& testCase : vectorReductionTests) {
acGridReduceVec(STREAM_DEFAULT, testCase.rtype, testCase.a, testCase.b, testCase.c,
&testCase.candidate);
}
if (pid == 0) {
acVerifyScalReductions(model, scalarReductionTests.data(), scalarReductionTests.size());
acVerifyVecReductions(model, vectorReductionTests.data(), vectorReductionTests.size());
}
// REDUCTION TESTS END -------------------------------------------------------------------------
if (pid == 0) {
acMeshDestroy(&model);
acMeshDestroy(&candidate);
}
acGridQuit();
MPI_Finalize();
return EXIT_SUCCESS;
}
@@ -74,7 +116,8 @@ main(void)
int
main(void)
{
printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with cmake -DMPI_ENABLED=ON .. to enable.\n");
printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with "
"cmake -DMPI_ENABLED=ON .. to enable.\n");
return EXIT_FAILURE;
}
#endif // AC_MPI_ENABLES

View File

@@ -105,7 +105,8 @@ operator+(const int3& a, const int3& b)
return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
}
static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
static HOST_DEVICE_INLINE int3
operator*(const int3& a, const int3& b)
{
return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
}
@@ -144,12 +145,20 @@ operator-=(AcReal3& lhs, const AcReal3& rhs)
lhs.z -= rhs.z;
}
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
static HOST_DEVICE_INLINE int3
operator*(const int& a, const int3& b)
{
return (int3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal& a, const AcReal3& b)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}
static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal3& b, const AcReal& a)
{
return (AcReal3){a * b.x, a * b.y, a * b.z};
}

View File

@@ -1,10 +1,14 @@
find_package(CUDAToolkit)
## Astaroth Core
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc astaroth_fortran.cc)
target_link_libraries(astaroth_core astaroth_kernels CUDA::cudart CUDA::cuda_driver)
## Options
if (MPI_ENABLED)
find_package(MPI)
#find_package(MPI REQUIRED)
#find_package(OpenMP)
#target_link_libraries(astaroth_core MPI::MPI_CXX OpenMP::OpenMP_CXX)
target_link_libraries(astaroth_core MPI::MPI_CXX)
endif()

View File

@@ -158,3 +158,65 @@ acGetNode(void)
ERRCHK_ALWAYS(num_nodes > 0);
return nodes[0];
}
AcResult
acUpdateBuiltinParams(AcMeshInfo* config)
{
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
///////////// PAD TEST
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
///////////// PAD TEST
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;
// These do not have to be defined by empty projects any more.
// These should be set only if stdderiv.h is included
#ifdef AC_dsx
config->real_params[AC_inv_dsx] = (AcReal)(1.) / config->real_params[AC_dsx];
#endif
#ifdef AC_dsy
config->real_params[AC_inv_dsy] = (AcReal)(1.) / config->real_params[AC_dsy];
#endif
#ifdef AC_dsz
config->real_params[AC_inv_dsz] = (AcReal)(1.) / config->real_params[AC_dsz];
#endif
/* Additional helper params */
// Int helpers
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
return AC_SUCCESS;
}
AcResult
acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
{
mesh->info = info;
const size_t bytes = acVertexBufferSizeBytes(mesh->info);
for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
mesh->vertex_buffer[w] = (AcReal*)malloc(bytes);
ERRCHK_ALWAYS(mesh->vertex_buffer[w]);
}
return AC_SUCCESS;
}
AcResult
acMeshDestroy(AcMesh* mesh)
{
for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
free(mesh->vertex_buffer[w]);
return AC_SUCCESS;
}

View File

@@ -0,0 +1,119 @@
#include "astaroth_fortran.h"
#include "astaroth.h"
#include "errchk.h"
/**
* Utils
*/
void
acupdatebuiltinparams_(AcMeshInfo* info)
{
acUpdateBuiltinParams(info);
}
void
acgetdevicecount_(int* count)
{
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(count));
}
/**
* Device
*/
void
acdevicecreate_(const int* id, const AcMeshInfo* info, Device* handle)
{
acDeviceCreate(*id, *info, handle);
}
void
acdevicedestroy_(Device* device)
{
acDeviceDestroy(*device);
}
void
acdeviceprintinfo_(const Device* device)
{
acDevicePrintInfo(*device);
}
void
acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info)
{
acDeviceLoadMeshInfo(*device, *info);
}
void
acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
const int* num_farrays, AcReal* farray)
{
ERRCHK_ALWAYS(*num_farrays >= NUM_VTXBUF_HANDLES);
WARNCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
const size_t mxyz = info->int_params[AC_mx] * info->int_params[AC_mx] * info->int_params[AC_mx];
AcMesh mesh;
mesh.info = *info;
for (int i = 0; i < *num_farrays; ++i)
mesh.vertex_buffer[i] = &farray[i * mxyz];
acDeviceLoadMesh(*device, *stream, mesh);
}
void
acdevicestoremesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
const int* num_farrays, AcReal* farray)
{
ERRCHK_ALWAYS(*num_farrays >= NUM_VTXBUF_HANDLES);
WARNCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
AcMesh mesh;
mesh.info = *info;
const size_t mxyz = info->int_params[AC_mx] * info->int_params[AC_mx] * info->int_params[AC_mx];
for (int i = 0; i < *num_farrays; ++i)
mesh.vertex_buffer[i] = &farray[i * mxyz];
acDeviceStoreMesh(*device, *stream, &mesh);
}
void
acdeviceintegratesubstep_(const Device* device, const Stream* stream, const int* step_number,
const int3* start, const int3* end, const AcReal* dt)
{
acDeviceIntegrateSubstep(*device, *stream, *step_number, *start, *end, *dt);
}
void
acdeviceperiodicboundconds_(const Device* device, const Stream* stream, const int3* start,
const int3* end)
{
acDevicePeriodicBoundconds(*device, *stream, *start, *end);
}
void
acdeviceswapbuffers_(const Device* device)
{
acDeviceSwapBuffers(*device);
}
void
acdevicereducescal_(const Device* device, const Stream* stream, const ReductionType* rtype,
const VertexBufferHandle* vtxbuf_handle, AcReal* result)
{
acDeviceReduceScal(*device, *stream, *rtype, *vtxbuf_handle, result);
}
void
acdevicereducevec_(const Device* device, const Stream* stream, const ReductionType* rtype,
const VertexBufferHandle* vtxbuf0, const VertexBufferHandle* vtxbuf1,
const VertexBufferHandle* vtxbuf2, AcReal* result)
{
acDeviceReduceVec(*device, *stream, *rtype, *vtxbuf0, *vtxbuf1, *vtxbuf2, result);
}
void
acdevicesynchronizestream_(const Device* device, const Stream* stream)
{
acDeviceSynchronizeStream(*device, *stream);
}

View File

@@ -0,0 +1,50 @@
#pragma once
#include "astaroth.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* Utils
*/
void acupdatebuiltinparams_(AcMeshInfo* info);
void acgetdevicecount_(int* count);
/**
* Device
*/
void acdevicecreate_(const int* id, const AcMeshInfo* info, Device* handle);
void acdevicedestroy_(Device* device);
void acdeviceprintinfo_(const Device* device);
void acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info);
void acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
const int* num_farrays, AcReal* farray);
void acdevicestoremesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
const int* num_farrays, AcReal* farray);
void acdeviceintegratesubstep_(const Device* device, const Stream* stream, const int* step_number,
const int3* start, const int3* end, const AcReal* dt);
void acdeviceperiodicboundconds_(const Device* device, const Stream* stream, const int3* start,
const int3* end);
void acdeviceswapbuffers_(const Device* device);
void acdevicereducescal_(const Device* device, const Stream* stream, const ReductionType* rtype,
const VertexBufferHandle* vtxbuf_handle, AcReal* result);
void acdevicereducevec_(const Device* device, const Stream* stream, const ReductionType* rtype,
const VertexBufferHandle* vtxbuf0, const VertexBufferHandle* vtxbuf1,
const VertexBufferHandle* vtxbuf2, AcReal* result);
void acdevicesynchronizestream_(const Device* device, const Stream* stream);
#ifdef __cplusplus
} // extern "C"
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -44,7 +44,8 @@ rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
return (AcReal3){
rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt),
};
}
#define rk3(state_previous, state_current, rate_of_change, dt) \
@@ -192,9 +193,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
}
}
#if VERBOSE_PRINTING
printf(
"Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
printf("Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f "
"ms\n",
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
/*
FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
@@ -204,6 +205,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
*/
rk3_tpb = best_dims;
// Failed to find valid thread block dimensions
ERRCHK_ALWAYS(rk3_tpb.x * rk3_tpb.y * rk3_tpb.z > 0);
return AC_SUCCESS;
}
@@ -211,6 +215,8 @@ AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
const int3 end, VertexBufferArray vba)
{
ERRCHK_ALWAYS(step_number >= 0);
ERRCHK_ALWAYS(step_number < 3);
const dim3 tpb = rk3_tpb;
const int3 n = end - start;

View File

@@ -75,17 +75,20 @@ exp(const acComplex& val)
{
return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
static __device__ inline acComplex
operator*(const AcReal& a, const acComplex& b)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
static __device__ inline acComplex
operator*(const acComplex& b, const AcReal& a)
{
return (acComplex){a * b.x, a * b.y};
}
static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
static __device__ inline acComplex
operator*(const acComplex& a, const acComplex& b)
{
return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
@@ -116,7 +119,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
@@ -141,7 +144,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal
const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
@@ -165,7 +168,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
@@ -179,10 +182,10 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
}
if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) {
fprintf(
stderr,
"WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. Skipping.\n",
value.x, value.y, value.z, int3param_names[param]);
fprintf(stderr,
"WARNING: Passed an invalid value (%d, %d, %def) to device constant %s. "
"Skipping.\n",
value.x, value.y, value.z, int3param_names[param]);
return AC_FAILURE;
}
@@ -229,7 +232,7 @@ acDeviceLoadDefaultUniforms(const Device device)
{
cudaSetDevice(device->id);
// clang-format off
// clang-format off
// Scalar
#define LOAD_DEFAULT_UNIFORM(X) acDeviceLoadScalarUniform(device, STREAM_DEFAULT, X, X##_DEFAULT_VALUE);
AC_FOR_USER_REAL_PARAM_TYPES(LOAD_DEFAULT_UNIFORM)

View File

@@ -1,9 +1,19 @@
#pragma once
#include "astaroth.h"
#if AC_MPI_ENABLED
#include <mpi.h>
#include <stdbool.h>
#define MPI_GPUDIRECT_DISABLED (0)
#endif // AC_MPI_ENABLED
typedef struct {
int3 dims;
AcReal* data;
AcReal* data_pinned;
bool pinned = false; // Set if data was received to pinned memory
} PackedData;
typedef struct {

View File

@@ -92,8 +92,9 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr
assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)],
src1[IDX(src_idx)],
src2[IDX(src_idx)]);
}
template <ReduceFunc reduce>

View File

@@ -1,3 +1,3 @@
## Astaroth Utils
add_library(astaroth_utils STATIC config_loader.c memory.c verification.c modelsolver.c)
add_library(astaroth_utils STATIC config_loader.c memory.c verification.c modelsolver.c modelreduce.c)
add_dependencies(astaroth_utils dsl_headers)

View File

@@ -74,42 +74,6 @@ parse_config(const char* path, AcMeshInfo* config)
fclose(fp);
}
AcResult
acUpdateBuiltinParams(AcMeshInfo* config)
{
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
///////////// PAD TEST
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
///////////// PAD TEST
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;
/*
// DEPRECATED: Spacing TODO
// These do not have to be defined by empty projects any more.
// These should be set only if stdderiv.h is included
config->real_params[AC_inv_dsx] = (AcReal)(1.) / config->real_params[AC_dsx];
config->real_params[AC_inv_dsy] = (AcReal)(1.) / config->real_params[AC_dsy];
config->real_params[AC_inv_dsz] = (AcReal)(1.) / config->real_params[AC_dsz];
*/
/* Additional helper params */
// Int helpers
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
return AC_SUCCESS;
}
/**
\brief Loads data from astaroth.conf into a config struct.
\return AC_SUCCESS on success, AC_FAILURE if there are potentially uninitialized values.

View File

@@ -20,29 +20,6 @@
#include "errchk.h"
AcResult
acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
{
mesh->info = info;
const size_t bytes = acVertexBufferSizeBytes(mesh->info);
for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
mesh->vertex_buffer[w] = malloc(bytes);
ERRCHK_ALWAYS(mesh->vertex_buffer[w]);
}
return AC_SUCCESS;
}
AcResult
acMeshDestroy(AcMesh* mesh)
{
for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
free(mesh->vertex_buffer[w]);
return AC_SUCCESS;
}
AcResult
acMeshSet(const AcReal value, AcMesh* mesh)
{

209
src/utils/modelreduce.c Normal file
View File

@@ -0,0 +1,209 @@
/*
Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "astaroth.h"
#include <math.h>
#include "errchk.h"
#if AC_DOUBLE_PRECISION == 0 // HACK TODO fix, make cleaner (purkkaratkaisu)
#define fabs fabsf
#define exp expf
#define sqrt sqrtf
#endif
// Function pointer definitions
typedef AcReal (*ReduceFunc)(const AcReal, const AcReal);
typedef AcReal (*ReduceInitialScalFunc)(const AcReal);
typedef AcReal (*ReduceInitialVecFunc)(const AcReal, const AcReal,
const AcReal);
// clang-format off
/* Comparison funcs */
static inline AcReal
max(const AcReal a, const AcReal b) { return a > b ? a : b; }
static inline AcReal
min(const AcReal a, const AcReal b) { return a < b ? a : b; }
static inline AcReal
sum(const AcReal a, const AcReal b) { return a + b; }
/* Function used to determine the values used during reduction */
static inline AcReal
length_scal(const AcReal a) { return (AcReal)(a); }
static inline AcReal
length_vec(const AcReal a, const AcReal b, const AcReal c) { return sqrt(a*a + b*b + c*c); }
static inline AcReal
squared_scal(const AcReal a) { return (AcReal)(a*a); }
static inline AcReal
squared_vec(const AcReal a, const AcReal b, const AcReal c) { return squared_scal(a) + squared_scal(b) + squared_scal(c); }
static inline AcReal
exp_squared_scal(const AcReal a) { return exp(a)*exp(a); }
static inline AcReal
exp_squared_vec(const AcReal a, const AcReal b, const AcReal c) { return exp_squared_scal(a) + exp_squared_scal(b) + exp_squared_scal(c); }
// clang-format on
AcReal
acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a)
{
ReduceInitialScalFunc reduce_initial;
ReduceFunc reduce;
bool solve_mean = false;
switch (rtype) {
case RTYPE_MAX:
reduce_initial = length_scal;
reduce = max;
break;
case RTYPE_MIN:
reduce_initial = length_scal;
reduce = min;
break;
case RTYPE_RMS:
reduce_initial = squared_scal;
reduce = sum;
solve_mean = true;
break;
case RTYPE_RMS_EXP:
reduce_initial = exp_squared_scal;
reduce = sum;
solve_mean = true;
break;
case RTYPE_SUM:
reduce_initial = length_scal;
reduce = sum;
break;
default:
ERROR("Unrecognized RTYPE");
}
const int initial_idx = acVertexBufferIdx(mesh.info.int_params[AC_nx_min],
mesh.info.int_params[AC_ny_min],
mesh.info.int_params[AC_nz_min], mesh.info);
AcReal res;
if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
res = reduce_initial(mesh.vertex_buffer[a][initial_idx]);
else
res = 0;
for (int k = mesh.info.int_params[AC_nz_min]; k < mesh.info.int_params[AC_nz_max]; ++k) {
for (int j = mesh.info.int_params[AC_ny_min]; j < mesh.info.int_params[AC_ny_max]; ++j) {
for (int i = mesh.info.int_params[AC_nx_min]; i < mesh.info.int_params[AC_nx_max];
++i) {
const int idx = acVertexBufferIdx(i, j, k, mesh.info);
const AcReal curr_val = reduce_initial(mesh.vertex_buffer[a][idx]);
res = reduce(res, curr_val);
}
}
}
if (solve_mean) {
const AcReal inv_n = (AcReal)1.0 / mesh.info.int_params[AC_nxyz];
return sqrt(inv_n * res);
}
else {
return res;
}
}
AcReal
acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a,
const VertexBufferHandle b, const VertexBufferHandle c)
{
// AcReal (*reduce_initial)(AcReal, AcReal, AcReal);
ReduceInitialVecFunc reduce_initial;
ReduceFunc reduce;
bool solve_mean = false;
switch (rtype) {
case RTYPE_MAX:
reduce_initial = length_vec;
reduce = max;
break;
case RTYPE_MIN:
reduce_initial = length_vec;
reduce = min;
break;
case RTYPE_RMS:
reduce_initial = squared_vec;
reduce = sum;
solve_mean = true;
break;
case RTYPE_RMS_EXP:
reduce_initial = exp_squared_vec;
reduce = sum;
solve_mean = true;
break;
case RTYPE_SUM:
reduce_initial = length_vec;
reduce = sum;
break;
default:
ERROR("Unrecognized RTYPE");
}
const int initial_idx = acVertexBufferIdx(mesh.info.int_params[AC_nx_min],
mesh.info.int_params[AC_ny_min],
mesh.info.int_params[AC_nz_min], mesh.info);
AcReal res;
if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
res = reduce_initial(mesh.vertex_buffer[a][initial_idx], mesh.vertex_buffer[b][initial_idx],
mesh.vertex_buffer[c][initial_idx]);
else
res = 0;
for (int k = mesh.info.int_params[AC_nz_min]; k < mesh.info.int_params[AC_nz_max]; k++) {
for (int j = mesh.info.int_params[AC_ny_min]; j < mesh.info.int_params[AC_ny_max]; j++) {
for (int i = mesh.info.int_params[AC_nx_min]; i < mesh.info.int_params[AC_nx_max];
i++) {
const int idx = acVertexBufferIdx(i, j, k, mesh.info);
const AcReal curr_val = reduce_initial(mesh.vertex_buffer[a][idx],
mesh.vertex_buffer[b][idx],
mesh.vertex_buffer[c][idx]);
res = reduce(res, curr_val);
}
}
}
if (solve_mean) {
const AcReal inv_n = (AcReal)1.0 / mesh.info.int_params[AC_nxyz];
return sqrt(inv_n * res);
}
else {
return res;
}
}

View File

@@ -38,9 +38,10 @@
#define LMAGNETIC (1)
#define LENTROPY (1)
#define LTEMPERATURE (0)
#define LFORCING (0)
#define LFORCING (1)
#define LUPWD (1)
#define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter
#define R_PI ((Scalar)M_PI)
typedef AcReal Scalar;
// typedef AcReal3 Vector;
@@ -54,6 +55,8 @@ typedef float Vector __attribute__((vector_size(4 * sizeof(float))));
#define fabs fabsf
#define exp expf
#define sqrt sqrtf
#define cos cosf
#define sin sinf
#endif
typedef struct {
@@ -103,11 +106,16 @@ first_derivative(const Scalar* pencil, const Scalar inv_ds)
#elif STENCIL_ORDER == 4
const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)};
#elif STENCIL_ORDER == 6
const Scalar coefficients[] = {0, (Scalar)(3.0 / 4.0), (Scalar)(-3.0 / 20.0),
(Scalar)(1.0 / 60.0)};
const Scalar coefficients[] = {
0,
(Scalar)(3.0 / 4.0),
(Scalar)(-3.0 / 20.0),
(Scalar)(1.0 / 60.0),
};
#elif STENCIL_ORDER == 8
const Scalar coefficients[] = {0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0),
(Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0)};
const Scalar coefficients[] = {
0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0),
};
#endif
#define MID (STENCIL_ORDER / 2)
@@ -126,15 +134,23 @@ second_derivative(const Scalar* pencil, const Scalar inv_ds)
#if STENCIL_ORDER == 2
const Scalar coefficients[] = {-2, 1};
#elif STENCIL_ORDER == 4
const Scalar coefficients[] = {(Scalar)(-5.0 / 2.0), (Scalar)(4.0 / 3.0),
(Scalar)(-1.0 / 12.0)};
const Scalar coefficients[] = {
(Scalar)(-5.0 / 2.0),
(Scalar)(4.0 / 3.0),
(Scalar)(-1.0 / 12.0),
};
#elif STENCIL_ORDER == 6
const Scalar coefficients[] = {(Scalar)(-49.0 / 18.0), (Scalar)(3.0 / 2.0),
(Scalar)(-3.0 / 20.0), (Scalar)(1.0 / 90.0)};
const Scalar coefficients[] = {
(Scalar)(-49.0 / 18.0),
(Scalar)(3.0 / 2.0),
(Scalar)(-3.0 / 20.0),
(Scalar)(1.0 / 90.0),
};
#elif STENCIL_ORDER == 8
const Scalar coefficients[] = {(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0),
(Scalar)(-1.0 / 5.0), (Scalar)(8.0 / 315.0),
(Scalar)(-1.0 / 560.0)};
const Scalar coefficients[] = {
(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0), (Scalar)(-1.0 / 5.0),
(Scalar)(8.0 / 315.0), (Scalar)(-1.0 / 560.0),
};
#endif
#define MID (STENCIL_ORDER / 2)
@@ -156,16 +172,27 @@ cross_derivative(const Scalar* pencil_a, const Scalar* pencil_b, const Scalar in
const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)};
#elif STENCIL_ORDER == 4
const Scalar coefficients[] = {
0, (Scalar)(1.0 / 32.0),
(Scalar)(1.0 / 64.0)}; // TODO correct coefficients, these are just placeholders
0,
(Scalar)(1.0 / 32.0),
(Scalar)(1.0 / 64.0),
}; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6
const Scalar fac = ((Scalar)(1. / 720.));
const Scalar coefficients[] = {0 * fac, (Scalar)(270.0) * fac, (Scalar)(-27.0) * fac,
(Scalar)(2.0) * fac};
const Scalar coefficients[] = {
0 * fac,
(Scalar)(270.0) * fac,
(Scalar)(-27.0) * fac,
(Scalar)(2.0) * fac,
};
#elif STENCIL_ORDER == 8
const Scalar fac = ((Scalar)(1. / 20160.));
const Scalar coefficients[] = {0 * fac, (Scalar)(8064.) * fac, (Scalar)(-1008.) * fac,
(Scalar)(128.) * fac, (Scalar)(-9.) * fac};
const Scalar coefficients[] = {
0 * fac,
(Scalar)(8064.) * fac,
(Scalar)(-1008.) * fac,
(Scalar)(128.) * fac,
(Scalar)(-9.) * fac,
};
#endif
#define MID (STENCIL_ORDER / 2)
@@ -207,14 +234,14 @@ derxy(const int i, const int j, const int k, const Scalar* arr)
Scalar pencil_a[STENCIL_ORDER + 1];
//#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + offset - STENCIL_ORDER / 2,
k)];
pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
j + offset - STENCIL_ORDER / 2, k)];
Scalar pencil_b[STENCIL_ORDER + 1];
//#pragma unroll
for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + STENCIL_ORDER / 2 - offset,
k)];
pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
j + STENCIL_ORDER / 2 - offset, k)];
return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy));
}
@@ -539,7 +566,8 @@ gradient_of_divergence(const VectorData vec)
return (Vector){
hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2],
hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2],
hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2]};
hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2],
};
}
// Takes uu gradients and returns S
@@ -777,9 +805,9 @@ Vector
helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vector ff_im, Scalar phi)
{
(void)magnitude; // WARNING: unused
xx[0] = xx[0] * (2.0 * M_PI / (getReal(AC_dsx) * getInt(AC_nx)));
xx[1] = xx[1] * (2.0 * M_PI / (getReal(AC_dsy) * getInt(AC_ny)));
xx[2] = xx[2] * (2.0 * M_PI / (getReal(AC_dsz) * getInt(AC_nz)));
xx[0] = xx[0] * ((Scalar)2.0 * R_PI / (getReal(AC_dsx) * getInt(AC_nx)));
xx[1] = xx[1] * ((Scalar)2.0 * R_PI / (getReal(AC_dsy) * getInt(AC_ny)));
xx[2] = xx[2] * ((Scalar)2.0 * R_PI / (getReal(AC_dsz) * getInt(AC_nz)));
Scalar cos_phi = cos(phi);
Scalar sin_phi = sin(phi);
@@ -805,10 +833,11 @@ forcing(int3 globalVertexIdx, Scalar dt)
getInt(AC_ny) * getReal(AC_dsy),
getInt(AC_nz) * getReal(AC_dsz)}; // source (origin)
(void)a; // WARNING: not used
Vector xx = (Vector){(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
(globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
(globalVertexIdx.z - getInt(AC_nz_min)) *
getReal(AC_dsz)}; // sink (current index)
Vector xx = (Vector){
(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
(globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
(globalVertexIdx.z - getInt(AC_nz_min)) * getReal(AC_dsz),
}; // sink (current index)
const Scalar cs2 = getReal(AC_cs2_sound);
const Scalar cs = sqrt(cs2);

View File

@@ -2,6 +2,7 @@
#include <math.h>
#include <stdbool.h>
#include <string.h>
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))
@@ -18,25 +19,14 @@
#define WHT "\x1B[37m"
#define RESET "\x1B[0m"
typedef struct {
VertexBufferHandle handle;
AcReal model;
AcReal candidate;
long double abs_error;
long double ulp_error;
long double rel_error;
AcReal maximum_magnitude;
AcReal minimum_magnitude;
} Error;
static inline bool
is_valid(const AcReal a)
{
return !isnan(a) && !isinf(a);
}
static Error
get_error(AcReal model, AcReal candidate)
Error
acGetError(AcReal model, AcReal candidate)
{
Error error;
error.abs_error = 0;
@@ -109,13 +99,15 @@ get_max_abs_error(const VertexBufferHandle vtxbuf_handle, const AcMesh model_mes
for (size_t i = 0; i < acVertexBufferSize(model_mesh.info); ++i) {
Error curr_error = get_error(model_vtxbuf[i], candidate_vtxbuf[i]);
Error curr_error = acGetError(model_vtxbuf[i], candidate_vtxbuf[i]);
if (curr_error.abs_error > error.abs_error)
error = curr_error;
}
error.handle = vtxbuf_handle;
error.handle = vtxbuf_handle;
strncpy(error.label, vtxbuf_names[vtxbuf_handle], ERROR_LABEL_LENGTH - 1);
error.label[ERROR_LABEL_LENGTH - 1] = '\0';
error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);
@@ -147,12 +139,12 @@ is_acceptable(const Error error)
return false;
}
static bool
print_error_to_screen(const Error error)
bool
printErrorToScreen(const Error error)
{
bool errors_found = false;
printf("\t%-15s... ", vtxbuf_names[error.handle]);
printf("\t%-15s... ", error.label);
if (is_acceptable(error)) {
printf(GRN "OK! " RESET);
}
@@ -177,14 +169,87 @@ acVerifyMesh(const AcMesh model, const AcMesh candidate)
bool errors_found = false;
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
Error field_error = get_max_abs_error(i, model, candidate);
errors_found |= print_error_to_screen(field_error);
errors_found |= printErrorToScreen(field_error);
}
printf("%s\n", errors_found ? "Failure. Found errors in one or more vertex buffers"
: "Success. No errors found.");
if (errors_found)
return AC_FAILURE;
else
return AC_SUCCESS;
return errors_found ? AC_FAILURE : AC_SUCCESS;
}
/** Verification function for scalar reductions*/
AcResult
acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases,
const size_t numCases)
{
printf("\nTesting scalar reductions:\n");
bool errors_found = false;
for (size_t i = 0; i < numCases; i++) {
AcReal model_reduction = acModelReduceScal(model, testCases[i].rtype, testCases[i].vtxbuf);
Error error = acGetError(model_reduction, testCases[i].candidate);
strncpy(error.label, testCases[i].label, ERROR_LABEL_LENGTH - 1);
error.label[ERROR_LABEL_LENGTH - 1] = '\0';
errors_found |= printErrorToScreen(error);
}
printf("%s\n", errors_found ? "Failure. Found errors in one or more scalar reductions"
: "Success. No errors found.");
return errors_found ? AC_FAILURE : AC_SUCCESS;
}
/** Verification function for vector reductions*/
AcResult
acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases,
const size_t numCases)
{
printf("\nTesting vector reductions:\n");
bool errors_found = false;
for (size_t i = 0; i < numCases; i++) {
AcReal model_reduction = acModelReduceVec(model, testCases[i].rtype, testCases[i].a,
testCases[i].b, testCases[i].c);
Error error = acGetError(model_reduction, testCases[i].candidate);
strncpy(error.label, testCases[i].label, ERROR_LABEL_LENGTH - 1);
error.label[ERROR_LABEL_LENGTH - 1] = '\0';
errors_found |= printErrorToScreen(error);
}
printf("%s\n", errors_found ? "Failure. Found errors in one or more vector reductions"
: "Success. No errors found.");
return errors_found ? AC_FAILURE : AC_SUCCESS;
}
/** Constructor for scalar reduction test case */
AcScalReductionTestCase
acCreateScalReductionTestCase(const char* label, const VertexBufferHandle vtxbuf, const ReductionType rtype)
{
AcScalReductionTestCase testCase;
strncpy(testCase.label,label,ERROR_LABEL_LENGTH - 1);
testCase.label[ERROR_LABEL_LENGTH - 1] = '\0';
testCase.vtxbuf = vtxbuf;
testCase.rtype = rtype;
testCase.candidate = 0;
return testCase;
}
/** Constructor for vector reduction test case */
AcVecReductionTestCase
acCreateVecReductionTestCase(const char* label, const VertexBufferHandle a,
const VertexBufferHandle b, const VertexBufferHandle c, const ReductionType rtype)
{
AcVecReductionTestCase testCase;
strncpy(testCase.label,label,ERROR_LABEL_LENGTH - 1);
testCase.label[ERROR_LABEL_LENGTH - 1] = '\0';
testCase.a = a;
testCase.b = b;
testCase.c = c;
testCase.rtype = rtype;
testCase.candidate = 0;
return testCase;
}