I have inspected the branch and it compiles and functions fine on my system. NOTE: I could not test MPI at the moment, but it should not prevent the merge.
Merged in mpi-to-master-merge-candidate-2020-06-01 (pull request #12) Astaroth 2.3
This commit is contained in:
@@ -1,5 +1,8 @@
## CMake settings
cmake_minimum_required(VERSION 3.9) # Required for first-class CUDA support
# V3.9 required for first-class CUDA support
# V3.17 required for the FindCUDAToolkit package
# V3.18 required for CMAKE_CUDA_ARCHITECTURES
cmake_minimum_required(VERSION 3.18)
find_program(CMAKE_C_COMPILER NAMES $ENV{CC} gcc PATHS ENV PATH NO_DEFAULT_PATH)
find_program(CMAKE_CXX_COMPILER NAMES $ENV{CXX} g++ PATHS ENV PATH NO_DEFAULT_PATH)

@@ -14,12 +17,19 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS}")
set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 11)

find_package(CUDA) # Still required for various macros, such as cuda_select_nvcc_...
cuda_select_nvcc_arch_flags(ARCHLIST Common) # Common architectures depend on the available CUDA version. Listed here: https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
string(REPLACE ";" " " CUDA_ARCH_FLAGS "${ARCHLIST}")
set(COMMON_FLAGS_CUDA "-mavx,-Wall,-Wextra,-Werror,-Wdouble-promotion,-Wfloat-conversion,-Wshadow")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options=${COMMON_FLAGS_CUDA}")

## CUDA
# GPU, compute capability
# K40,  3.5
# K80,  3.7
# P100, 6.0
# V100, 7.0
if (NOT CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES 60 70) # Default
else ()
    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES}) # User-specified
endif()
string(REPLACE " " "," CUDA_COMMON_FLAGS "${COMMON_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options=${CUDA_COMMON_FLAGS}")

## Build type
if(NOT CMAKE_BUILD_TYPE)

@@ -53,7 +63,8 @@ message(STATUS "AC module dir: ${DSL_MODULE_DIR}")
file(GLOB DSL_SOURCES ${DSL_MODULE_DIR}/*
                      ${CMAKE_SOURCE_DIR}/acc/stdlib/*)
set(DSL_HEADERS "${PROJECT_BINARY_DIR}/user_kernels.h"
                "${PROJECT_BINARY_DIR}/user_defines.h")
                "${PROJECT_BINARY_DIR}/user_defines.h"
                "${PROJECT_BINARY_DIR}/astaroth.f90")

add_custom_command (
    COMMENT "Building ACC objects ${DSL_MODULE_DIR}"

@@ -79,6 +90,11 @@ include_directories(src/common) # Common headers
include_directories(${CMAKE_BINARY_DIR}) # DSL headers
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # CUDA headers

if (MPI_ENABLED)
    find_package(MPI REQUIRED)
    include_directories(${MPI_CXX_INCLUDE_DIRS})
endif()

## Subdirectories
add_subdirectory(src/utils)
add_subdirectory(src/core/kernels)

@@ -89,6 +105,10 @@ if (BUILD_SAMPLES)
    add_subdirectory(samples/cpptest)
    add_subdirectory(samples/mpitest)
    add_subdirectory(samples/benchmark)
    add_subdirectory(samples/bwtest)
    add_subdirectory(samples/genbenchmarkscripts)
    add_subdirectory(samples/mpi_reduce_bench)
    add_subdirectory(samples/fortrantest)
endif()

if (BUILD_STANDALONE)
@@ -40,6 +40,7 @@ In the base directory, run

| Option | Description | Default |
|--------|-------------|---------|
| CMAKE_BUILD_TYPE | Selects the build type. Possible values: Debug, Release, RelWithDebInfo, MinSizeRel. See [CMake documentation](https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html) for more details. | Release |
| CUDA_ARCHITECTURES | Selects CUDA architecture support. Multiple architectures delimited by `;`. See [CMake documentation](https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html) for more details. | "60;70" |
| DOUBLE_PRECISION | Generates double precision code. | OFF |
| BUILD_SAMPLES | Builds projects in the samples subdirectory. | OFF |
| BUILD_STANDALONE | Builds a standalone library for testing, benchmarking and simulation. | ON |
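
For example, an MPI-enabled build with samples could be configured as follows (an illustrative invocation using the options above; adjust the flags to your setup):

```
mkdir -p build && cd build
cmake -DMPI_ENABLED=ON -DBUILD_SAMPLES=ON -DCUDA_ARCHITECTURES="60;70" ..
make -j
```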

@@ -67,6 +68,7 @@ See `analysis/python/` directory of existing data visualization and analysis scr

* `astaroth/include/astaroth.h`: Astaroth main header. Contains the interface for accessing single- and multi-GPU layers.
* `astaroth/include/astaroth_utils.h`: Utility library header. Provides functions for performing common tasks on host, such as allocating and verifying meshes.
* `<build directory>/astaroth.f90`: Fortran interface to Astaroth. Generated when building the library.

## FAQ
@@ -39,9 +39,11 @@ ASTNode* root = NULL;
// Output files
static FILE* DSLHEADER  = NULL;
static FILE* CUDAHEADER = NULL;
static FILE* FHEADER    = NULL;

static const char* dslheader_filename  = "user_defines.h";
static const char* cudaheader_filename = "user_kernels.h";
static const char* fheader_filename    = "astaroth.f90";

// Forward declaration of yyparse
int yyparse(void);

@@ -98,7 +100,8 @@ static const char* translation_table[TRANSLATION_TABLE_SIZE] = {
    ['<'] = "<",
    ['>'] = ">",
    ['!'] = "!",
    ['.'] = "."};
    ['.'] = ".",
};

static const char*
translate(const int token)

@@ -261,9 +264,8 @@ traverse(const ASTNode* node)
        if (typequal->token == KERNEL) {
            fprintf(CUDAHEADER, "GEN_KERNEL_PARAM_BOILERPLATE");
            if (node->lhs != NULL) {
                fprintf(
                    stderr,
                    "Syntax error: function parameters for Kernel functions not allowed!\n");
                fprintf(stderr, "Syntax error: function parameters for Kernel functions not "
                                "allowed!\n");
            }
        }
        else if (typequal->token == PREPROCESSED) {

@@ -597,65 +599,173 @@ generate_preprocessed_structures(void)
}

static void
generate_header(void)
generate_headers(void)
{
    // Fortran interface
    const char* fortran_interface = R"(
! -*-f90-*- (for emacs) vim:set filetype=fortran: (for vim)

! Utils (see astaroth_fortran.cc for definitions)
external acupdatebuiltinparams
external acgetdevicecount

! Device interface (see astaroth_fortran.cc for definitions)
external acdevicecreate, acdevicedestroy
external acdeviceprintinfo
external acdeviceloadmeshinfo
external acdeviceloadmesh, acdevicestoremesh
external acdeviceintegratesubstep
external acdeviceperiodicboundconds
external acdeviceswapbuffers
external acdevicereducescal, acdevicereducevec
external acdevicesynchronizestream
)";
    fprintf(FHEADER, "%s\n", fortran_interface);

    fprintf(DSLHEADER, "#pragma once\n");

    // Int params
    fprintf(DSLHEADER, "#define AC_FOR_USER_INT_PARAM_TYPES(FUNC)");
    int enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == INT && symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);
            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_INT_PARAMS = %d\n\n", enumcounter);

    // Int3 params
    fprintf(DSLHEADER, "#define AC_FOR_USER_INT3_PARAM_TYPES(FUNC)");
    enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == INT3 && symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);

            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_INT3_PARAMS = %d\n\n", enumcounter);

    // Scalar params
    fprintf(DSLHEADER, "#define AC_FOR_USER_REAL_PARAM_TYPES(FUNC)");
    enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == SCALAR && symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);

            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_REAL_PARAMS = %d\n\n", enumcounter);

    // Vector params
    fprintf(DSLHEADER, "#define AC_FOR_USER_REAL3_PARAM_TYPES(FUNC)");
    enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == VECTOR && symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);

            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_REAL3_PARAMS = %d\n\n", enumcounter);

    // Scalar fields
    fprintf(DSLHEADER, "#define AC_FOR_VTXBUF_HANDLES(FUNC)");
    enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == SCALARFIELD &&
            symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);

            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_VTXBUF_HANDLES = %d\n\n", enumcounter);

    // Scalar arrays
    fprintf(DSLHEADER, "#define AC_FOR_SCALARARRAY_HANDLES(FUNC)");
    enumcounter = 0;
    for (size_t i = 0; i < num_symbols[current_nest]; ++i) {
        if (symbol_table[i].type_specifier == SCALARARRAY &&
            symbol_table[i].type_qualifier == UNIFORM) {
            fprintf(DSLHEADER, "\\\nFUNC(%s)", symbol_table[i].identifier);

            fprintf(FHEADER, "integer(c_int), parameter :: %s = %d\n", symbol_table[i].identifier,
                    enumcounter);
            ++enumcounter;
        }
    }
    fprintf(DSLHEADER, "\n\n");
    fprintf(FHEADER, "integer(c_int), parameter :: AC_NUM_SCALARRAY_HANDLES = %d\n\n", enumcounter);

    // Streams
    const size_t nstreams = 20;
    for (size_t i = 0; i < nstreams; ++i) {
        fprintf(DSLHEADER, "#define STREAM_%lu (%lu)\n", i, i);
        fprintf(FHEADER, "integer(c_int), parameter :: STREAM_%lu = %lu\n", i, i);
    }
    fprintf(DSLHEADER, "#define NUM_STREAMS (%lu)\n", nstreams);
    fprintf(DSLHEADER, "#define STREAM_DEFAULT (STREAM_0)\n");
    fprintf(DSLHEADER, "#define STREAM_ALL (NUM_STREAMS)\n");
    fprintf(FHEADER, "integer(c_int), parameter :: NUM_STREAMS = %lu\n", nstreams);
    fprintf(FHEADER, "integer(c_int), parameter :: STREAM_DEFAULT = STREAM_0\n");
    fprintf(FHEADER, "integer(c_int), parameter :: STREAM_ALL = NUM_STREAMS\n");

    fprintf(DSLHEADER, "typedef int Stream;\n\n");

    // Reduction types
    size_t counter = 0;
    fprintf(DSLHEADER, "#define RTYPE_MAX (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MAX = %lu\n", counter);
    ++counter;

    fprintf(DSLHEADER, "#define RTYPE_MIN (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_MIN = %lu\n", counter);
    ++counter;

    fprintf(DSLHEADER, "#define RTYPE_RMS (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_RMS = %lu\n", counter);
    ++counter;

    fprintf(DSLHEADER, "#define RTYPE_RMS_EXP (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_RMS_EXP = %lu\n", counter);
    ++counter;

    fprintf(DSLHEADER, "#define RTYPE_SUM (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: RTYPE_SUM = %lu\n", counter);
    ++counter;

    fprintf(DSLHEADER, "typedef int ReductionType;\n");
    fprintf(DSLHEADER, "#define NUM_REDUCTION_TYPES (%lu)\n", counter);
    fprintf(FHEADER, "integer(c_int), parameter :: NUM_REDUCTION_TYPES = %lu\n", counter);

    // Fortran structs
    const char* fortran_structs = R"(
type, bind(C) :: AcMeshInfo
integer(c_int), dimension(AC_NUM_INT_PARAMS) :: int_params
integer(c_int), dimension(AC_NUM_INT3_PARAMS, 3) :: int3_params
real, dimension(AC_NUM_REAL_PARAMS) :: real_params
real, dimension(AC_NUM_REAL3_PARAMS, 3) :: real3_params
end type AcMeshInfo
)";
    fprintf(FHEADER, "%s\n", fortran_structs);
}

static void

@@ -681,20 +791,21 @@ main(void)

    DSLHEADER  = fopen(dslheader_filename, "w+");
    CUDAHEADER = fopen(cudaheader_filename, "w+");
    FHEADER    = fopen(fheader_filename, "w+");
    assert(DSLHEADER);
    assert(CUDAHEADER);
    assert(FHEADER);

    // Add built-in param symbols
    for (size_t i = 0; i < ARRAY_SIZE(builtin_int_params); ++i) {
    for (size_t i = 0; i < ARRAY_SIZE(builtin_int_params); ++i)
        add_symbol(SYMBOLTYPE_OTHER, UNIFORM, INT, builtin_int_params[i]);
    }
    for (size_t i = 0; i < ARRAY_SIZE(builtin_int3_params); ++i) {

    for (size_t i = 0; i < ARRAY_SIZE(builtin_int3_params); ++i)
        add_symbol(SYMBOLTYPE_OTHER, UNIFORM, INT3, builtin_int3_params[i]);
    }

    // Generate
    traverse(root);
    generate_header();
    generate_headers();
    generate_preprocessed_structures();
    generate_library_hooks();

@@ -703,9 +814,12 @@ main(void)
    // Cleanup
    fclose(DSLHEADER);
    fclose(CUDAHEADER);
    fclose(FHEADER);
    astnode_destroy(root);

    fprintf(stdout, "-- Generated %s\n", dslheader_filename);
    fprintf(stdout, "-- Generated %s\n", cudaheader_filename);
    fprintf(stdout, "-- Generated %s\n", fheader_filename);

    return EXIT_SUCCESS;
}
@@ -12,6 +12,8 @@ image: nvidia/cuda
# ==> Updating the kernel drivers by ourselves probably requires creating our own docker image.
# ===> Which might not even work since I don't know what kind of hardware we're running on (lspci was not available)

options:
  max-time: 5 # Max time allowed for building (minutes)
pipelines:
  # default: # Default is run at every push but we have only 500 build minutes / month so that probably wouldn't work out
  custom: # Manual/scheduled building only

@@ -20,7 +22,11 @@ pipelines:
        script: # Modify the commands below to build your repository.
          - mkdir -p build && cd build
          - apt-get update
          - apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev
          - apt-get install -y apt-transport-https ca-certificates gnupg software-properties-common wget
          - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
          - apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
          - apt-get update
          - apt-get install -y cmake flex bison openmpi-bin libopenmpi-dev gfortran
          - cmake -DDSL_MODULE_DIR="acc/mhd_solver" -DBUILD_STANDALONE=ON -DBUILD_UTILS=ON -DBUILD_RT_VISUALIZATION=OFF -DBUILD_SAMPLES=ON -DDOUBLE_PRECISION=OFF -DMULTIGPU_ENABLED=ON -DMPI_ENABLED=OFF .. # Single precision
          - make -j
          - rm -rf *
@@ -79,7 +79,7 @@ typedef enum {
```

The API is divided into layers which differ in the level of control provided over the execution.
There are two primary layers:
There are three primary layers:

* Device layer
  * Functions start with acDevice.

@@ -92,7 +92,13 @@ There are two primary layers:
  * All functions are asynchronous and executed concurrently on all devices in the node.
  * Subsequent functions called in the same stream (see Section #Streams and synchronization) are guaranteed to be synchronous.

Finally, a third layer is provided for convenience and backwards compatibility.
* Grid layer
  * Functions start with acGrid.
  * Provides control over all devices on multiple nodes.
  * Requires MPI. `MPI_Init()` must be called before calling any acGrid functions.
  * Streams are used to control concurrency the same way as on the acDevice and acNode layers.

Finally, a fourth layer is provided for convenience and backwards compatibility.

* Astaroth layer (deprecated)
  * Functions start with `ac` only, f.ex. acInit().

@@ -126,6 +132,12 @@ AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* co
AcResult acNodeAutoOptimize(const Node node);
```

Grid layer.
```C
AcResult acGridInit(const AcMeshInfo info);
AcResult acGridQuit(void);
```
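
As a minimal sketch of the acGrid layer in use (based on the functions listed in this document; the dt value is arbitrary and error checking is omitted):

```C
#include <mpi.h>

#include "astaroth.h"
#include "astaroth_utils.h"

int
main(void)
{
    MPI_Init(NULL, NULL); // Must precede any acGrid call

    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info); // Utility, see astaroth_utils.h

    acGridInit(info);
    acGridIntegrate(STREAM_DEFAULT, (AcReal)1e-6); // One integration step
    acGridPeriodicBoundconds(STREAM_DEFAULT);
    acGridSynchronizeStream(STREAM_ALL);
    acGridQuit();

    MPI_Finalize();
    return 0;
}
```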

General helper functions.
```C
size_t acVertexBufferSize(const AcMeshInfo info);

@@ -159,6 +171,7 @@ AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream,
                                          const AcMesh host_mesh,
                                          const VertexBufferHandle vtxbuf_handle, const int3 src,
                                          const int3 dst, const int num_vertices);
AcResult acGridLoadMesh(const AcMesh host_mesh, const Stream stream);
```

Storing meshes and vertex buffers to host memory.

@@ -180,6 +193,7 @@ AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
                                           const VertexBufferHandle vtxbuf_handle, const int3 src,
                                           const int3 dst, const int num_vertices,
                                           AcMesh* host_mesh);
AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
```

Transferring data between devices

@@ -242,6 +256,13 @@ AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionT
AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                         const VertexBufferHandle vtxbuf2, AcReal* result);
AcResult acGridIntegrate(const Stream stream, const AcReal dt);
AcResult acGridPeriodicBoundconds(const Stream stream);
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                         const VertexBufferHandle vtxbuf2, AcReal* result);
```

Finally, there's a library function that is automatically generated for all user-specified `Kernel`

@@ -261,10 +282,13 @@ yet completed. Therefore special care must be taken in order to ensure proper sy

Synchronization is done using `Stream` primitives, defined as
```C
typedef enum { STREAM_DEFAULT, STREAM_0, ..., STREAM_16, NUM_STREAMS } Stream;
typedef enum {STREAM_0, ..., STREAM_15, NUM_STREAMS};
#define STREAM_DEFAULT (STREAM_0)
#define STREAM_ALL (NUM_STREAMS)
```

> **Note:** There are guaranteed to be at least 16 distinct streams.

Functions queued in the same stream will be executed sequentially. If two or more consecutive
functions are queued in different streams, then these functions may execute in parallel. For
additional control over streams, there is a barrier synchronization function which blocks execution

@@ -286,6 +310,7 @@ Astaroth API provides the following functions for barrier synchronization.
AcResult acSynchronize(void);
AcResult acNodeSynchronizeStream(const Node node, const Stream stream);
AcResult acDeviceSynchronizeStream(const Device device, const Stream stream);
AcResult acGridSynchronizeStream(const Stream stream);
```
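
For instance, two reductions queued in different streams may overlap, and a barrier then makes their results safe to read (a sketch; `VTXBUF_A` and `VTXBUF_B` are hypothetical user-declared field handles):

```C
AcReal max_a, max_b;
// Queued in different streams, so the two reductions may run concurrently
acNodeReduceScal(node, STREAM_0, RTYPE_MAX, VTXBUF_A, &max_a);
acNodeReduceScal(node, STREAM_1, RTYPE_MAX, VTXBUF_B, &max_b);
// Barrier: blocks until all work queued on the node has completed
acNodeSynchronizeStream(node, STREAM_ALL);
```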

## Data Synchronization

@@ -408,13 +433,16 @@ int mz = info.int_params[AC_mz];
after initialization.


### Decomposition
Grids and subgrids contain the dimensions of the mesh decomposed to multiple devices.
### Decomposition (`acNode` layer)

> **Note:** This section describes implementation details specific to the acNode layer. The acGrid layer is not related to the `GridDims` structure described here.

`GridDims` contains the dimensions of the mesh decomposed to multiple devices.
```C
typedef struct {
    int3 m; // Size of the simulation domain (includes the ghost zones)
    int3 n; // Size of the computational domain (without ghost zones)
} Grid;
} GridDims;
```

As briefly discussed in the section Data synchronization, a `Mesh` is distributed to multiple

@@ -427,8 +455,6 @@ Let *i* be the device id. The portion of the halos shared by neighboring devices
`acNodeSynchronizeVertexBuffer` and `acNodeSynchronizeMesh` communicate these shared areas among
the devices in the node.

> **Note:** The decomposition scheme is subject to change.

# Astaroth Domain-Specific Language

We designed the Astaroth Domain-specific Language (DSL) for expressing stencil computations in a

@@ -478,8 +504,8 @@ In addition to basic datatypes in C/C++/CUDA, such as int and int3, we provide t
| Complex | A tuple of two 32- or 64-bit floating-point numbers. The real part is stored in member .x, while the imaginary component is in .y. Basic operations, such as multiplication, are defined as built-in functions. | std::complex<float> or std::complex<double> |
| Matrix | A tuple of three Vectors. Is stored in column-major order, f.ex. Matrix[i][j] is the component on row i, column j. (TODO recheck specs.) | float3[3] or double3[3] |
| ScalarArray | A one-dimensional array of Scalars stored in device memory. Given mesh dimensions (mx, my, mz), consists of max(mx, max(my, mz)) elements. | float[] or double[] |
| ScalarField | An abstraction of a three-dimensional scalar field stored in device memory. Is implemented as a handle to a one-dimensional Scalar array consisting of input and output segments. The data is stored linearly in order i + j * mx + k * mx * my, given some vertex index (i, j, k) and a mesh consisting of (mx, my, mz) vertices. | float[2][] or double[2][] |
| VectorField | An abstraction of a three-dimensional vector field stored in device memory. Is implemented as a tuple of three ScalarField handles. | Three distinct float[2][] or double[2][] arrays for each component. Stored as a structure of arrays. |
| ScalarField | A three-dimensional scalar field stored in row-wise scan order where coordinates `(i, j, k)` correspond to a one-dimensional index `i + j * mx + k * mx * my`. Consists of two buffers, one used for input and another one for output. | Two float[] or double[] arrays |
| VectorField | A three-dimensional vector field. Consists of three `ScalarFields`. | Three `ScalarFields` stored contiguously in memory as a structure of arrays |
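
As an illustration of the row-wise scan order described above, the vertex-to-buffer mapping can be written as follows (a hypothetical helper, not part of the API):

```C
// Maps vertex (i, j, k) of a mesh with (mx, my, mz) vertices to its
// position in a ScalarField buffer: i + j * mx + k * mx * my.
static inline size_t
vertex_idx(const int i, const int j, const int k, const int mx, const int my)
{
    return (size_t)i + (size_t)j * (size_t)mx + (size_t)k * (size_t)mx * (size_t)my;
}
```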

## Precision

doxyfile (2 changed lines)
@@ -38,7 +38,7 @@ PROJECT_NAME = "Astaroth"
# could be handy for archiving the generated documentation or if some version
# control system is used.

PROJECT_NUMBER = 2.2
PROJECT_NUMBER = 2.3

# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@@ -50,6 +50,8 @@ typedef struct {

typedef enum { AC_SUCCESS = 0, AC_FAILURE = 1 } AcResult;

/*
// Deprecated, defined during code generation
typedef enum {
    RTYPE_MAX,
    RTYPE_MIN,

@@ -58,29 +60,7 @@ typedef enum {
    RTYPE_SUM,
    NUM_REDUCTION_TYPES
} ReductionType;

typedef enum {
    STREAM_DEFAULT,
    STREAM_0,
    STREAM_1,
    STREAM_2,
    STREAM_3,
    STREAM_4,
    STREAM_5,
    STREAM_6,
    STREAM_7,
    STREAM_8,
    STREAM_9,
    STREAM_10,
    STREAM_11,
    STREAM_12,
    STREAM_13,
    STREAM_14,
    STREAM_15,
    STREAM_16,
    NUM_STREAMS
} Stream;
#define STREAM_ALL (NUM_STREAMS)
*/

#define AC_GEN_ID(X) X,
typedef enum {

@@ -320,6 +300,15 @@ AcResult acGridIntegrate(const Stream stream, const AcReal dt);

/** */
AcResult acGridPeriodicBoundconds(const Stream stream);

/** TODO */
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
                          const VertexBufferHandle vtxbuf_handle, AcReal* result);

/** TODO */
AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                         const VertexBufferHandle vtxbuf2, AcReal* result);
#endif // AC_MPI_ENABLED

/*

@@ -573,6 +562,20 @@ AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const
/** */
AcResult acDeviceRunMPITest(void);

/*
 * =============================================================================
 * Helper functions
 * =============================================================================
 */
/** Updates the built-in parameters based on nx, ny and nz */
AcResult acUpdateBuiltinParams(AcMeshInfo* config);

/** Creates a mesh stored in host memory */
AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);

/** Destroys a mesh stored in host memory */
AcResult acMeshDestroy(AcMesh* mesh);

#ifdef __cplusplus
} // extern "C"
#endif
@@ -29,17 +29,57 @@
extern "C" {
#endif

#include <stdbool.h>

#define ERROR_LABEL_LENGTH 30

typedef struct {
    char label[ERROR_LABEL_LENGTH];
    VertexBufferHandle handle;
    AcReal model;
    AcReal candidate;
    long double abs_error;
    long double ulp_error;
    long double rel_error;
    AcReal maximum_magnitude;
    AcReal minimum_magnitude;
} Error;

typedef struct {
    char label[ERROR_LABEL_LENGTH];
    VertexBufferHandle vtxbuf;
    ReductionType rtype;
    AcReal candidate;
} AcScalReductionTestCase;

typedef struct {
    char label[ERROR_LABEL_LENGTH];
    VertexBufferHandle a;
    VertexBufferHandle b;
    VertexBufferHandle c;
    ReductionType rtype;
    AcReal candidate;
} AcVecReductionTestCase;

/** TODO comment */
Error acGetError(AcReal model, AcReal candidate);

/** TODO comment */
bool printErrorToScreen(const Error error);

/** Loads data from the config file */
AcResult acLoadConfig(const char* config_path, AcMeshInfo* config);

/** Updates the built-in parameters based on nx, ny and nz */
AcResult acUpdateBuiltinParams(AcMeshInfo* config);
/** */
AcScalReductionTestCase acCreateScalReductionTestCase(const char* label,
                                                      const VertexBufferHandle vtxbuf,
                                                      const ReductionType rtype);

/** */
AcResult acMeshCreate(const AcMeshInfo mesh_info, AcMesh* mesh);

/** */
AcResult acMeshDestroy(AcMesh* mesh);
AcVecReductionTestCase acCreateVecReductionTestCase(const char* label, const VertexBufferHandle a,
                                                    const VertexBufferHandle b,
                                                    const VertexBufferHandle c,
                                                    const ReductionType rtype);

/** */
AcResult acMeshSet(const AcReal value, AcMesh* mesh);

@@ -56,9 +96,24 @@ AcResult acMeshClear(AcMesh* mesh);
/** */
AcResult acModelIntegrateStep(AcMesh mesh, const AcReal dt);

/** TODO */
AcReal acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a);

/** TODO */
AcReal acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a,
                        const VertexBufferHandle b, const VertexBufferHandle c);

/** */
AcResult acVerifyMesh(const AcMesh model, const AcMesh candidate);

/** */
AcResult acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases,
                                const size_t numCases);

/** */
AcResult acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases,
                               const size_t numCases);

#ifdef __cplusplus
} // extern "C"
#endif
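The new reduction test cases might be driven as follows (a sketch inferred from the declarations above; `VTXBUF_A` is a hypothetical field handle, and the candidate values would come from the corresponding device-side reductions):

```C
AcScalReductionTestCase cases[] = {
    acCreateScalReductionTestCase("Scal MAX", VTXBUF_A, RTYPE_MAX),
    acCreateScalReductionTestCase("Scal MIN", VTXBUF_A, RTYPE_MIN),
};
const size_t num_cases = sizeof(cases) / sizeof(cases[0]);

// ... run the device/grid reductions and store each result in
// cases[i].candidate before verifying ...

acVerifyScalReductions(model, cases, num_cases); // model: host-side reference mesh
```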
@@ -39,8 +39,47 @@ typedef enum {
    NUM_TESTS,
} TestType;

#include <stdint.h>

typedef struct {
    uint64_t x, y, z;
} uint3_64;

static uint3_64
operator+(const uint3_64& a, const uint3_64& b)
{
    return (uint3_64){a.x + b.x, a.y + b.y, a.z + b.z};
}

static uint3_64
morton3D(const uint64_t pid)
{
    uint64_t i, j, k;
    i = j = k = 0;

    for (int bit = 0; bit <= 21; ++bit) {
        const uint64_t mask = 0x1l << 3 * bit;
        k |= ((pid & (mask << 0)) >> 2 * bit) >> 0;
        j |= ((pid & (mask << 1)) >> 2 * bit) >> 1;
        i |= ((pid & (mask << 2)) >> 2 * bit) >> 2;
    }

    return (uint3_64){i, j, k};
}

static uint3_64
decompose(const uint64_t target)
{
    // This is just so beautifully elegant. Complex and efficient decomposition
    // in just one line of code.
    uint3_64 p = morton3D(target - 1) + (uint3_64){1, 1, 1};

    ERRCHK_ALWAYS(p.x * p.y * p.z == target);
    return p;
}
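// Illustration (editor's note, not in the original source): morton3D
// deinterleaves the bits of pid, so e.g. morton3D(8 - 1) = (1, 1, 1) and
// decompose(8) = (2, 2, 2), a 2 x 2 x 2 process grid. Power-of-two process
// counts decompose evenly; other counts fail the ERRCHK_ALWAYS above.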

int
main(void)
main(int argc, char** argv)
{
    MPI_Init(NULL, NULL);
    int nprocs, pid;

@@ -51,9 +90,30 @@ main(void)
    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info);

    if (argc > 1) {
        if (argc == 4) {
            const int nx = atoi(argv[1]);
            const int ny = atoi(argv[2]);
            const int nz = atoi(argv[3]);
            info.int_params[AC_nx] = nx;
            info.int_params[AC_ny] = ny;
            info.int_params[AC_nz] = nz;
            acUpdateBuiltinParams(&info);
            printf("Updated mesh dimensions to (%d, %d, %d)\n", nx, ny, nz);
        }
        else {
            fprintf(stderr, "Could not parse arguments. Usage: ./benchmark <nx> <ny> <nz>.\n");
            exit(EXIT_FAILURE);
        }
    }

    const TestType test = TEST_STRONG_SCALING;
    if (test == TEST_WEAK_SCALING)
        info.int_params[AC_nz] *= nprocs;
    if (test == TEST_WEAK_SCALING) {
        uint3_64 decomp = decompose(nprocs);
        info.int_params[AC_nx] *= decomp.x;
        info.int_params[AC_ny] *= decomp.y;
        info.int_params[AC_nz] *= decomp.z;
    }

    /*
    AcMesh model, candidate;

@@ -89,19 +149,45 @@ main(void)
        }
    }*/

    /*
    // Basic
    const size_t num_iters = 100;

    // Warmup
    for (size_t i = 0; i < 10; ++i)
    for (size_t i = 0; i < num_iters / 10; ++i)
        acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);

    // Benchmark
    Timer t;
    const AcReal dt = FLT_EPSILON;
    const size_t num_iters = 100;
    const double nth_percentile = 0.90;
    const AcReal dt = FLT_EPSILON;

    acGridSynchronizeStream(STREAM_ALL);
    timer_reset(&t);
    acGridSynchronizeStream(STREAM_ALL);

    for (size_t i = 0; i < num_iters; ++i)
        acGridIntegrate(STREAM_DEFAULT, dt);

    acGridSynchronizeStream(STREAM_ALL);
    if (!pid)
        timer_diff_print(t);
    acGridSynchronizeStream(STREAM_ALL);
    */

    // Percentiles
    const size_t num_iters = 1000;
    const double nth_percentile = 0.90;
    std::vector<double> results; // ms
    results.reserve(num_iters);

    // Warmup
    for (size_t i = 0; i < num_iters / 10; ++i)
        acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);

    // Benchmark
    Timer t;
    const AcReal dt = FLT_EPSILON;

    for (size_t i = 0; i < num_iters; ++i) {
        acGridSynchronizeStream(STREAM_ALL);
        timer_reset(&t);

@@ -109,9 +195,9 @@ main(void)
        acGridIntegrate(STREAM_DEFAULT, dt);
        acGridSynchronizeStream(STREAM_ALL);
        results.push_back(timer_diff_nsec(t) / 1e6);
        acGridSynchronizeStream(STREAM_ALL);
    }

    // Write benchmark to file
    if (!pid) {
        std::sort(results.begin(), results.end(),
                  [](const double& a, const double& b) { return a < b; });

@@ -121,22 +207,58 @@ main(void)
                results[nth_percentile * num_iters], 100 * nth_percentile);

        char path[4096] = "";
        if (test == TEST_STRONG_SCALING)
            strncpy(path, "strong_scaling.csv", sizeof(path));
        else if (test == TEST_WEAK_SCALING)
            strncpy(path, "weak_scaling.csv", sizeof(path));
        else
            ERROR("Invalid test type");
        sprintf(path, "%s_%d.csv", test == TEST_STRONG_SCALING ? "strong" : "weak", nprocs);

        FILE* fp = fopen(path, "a");
        ERRCHK_ALWAYS(fp);
        // Format
        // nprocs, measured (ms)
        fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);

        // nprocs, min, 50th perc, 90th perc, max
        fprintf(fp, "%d, %g, %g, %g, %g\n", nprocs, results[0], results[0.5 * num_iters],
                results[nth_percentile * num_iters], results[num_iters - 1]);
        fclose(fp);
    }

    /*
    const size_t num_iters = 1000;
    const double nth_percentile = 0.90;

    std::vector<double> results; // ms
    results.reserve(num_iters);

    for (size_t i = 0; i < num_iters; ++i) {
        acGridSynchronizeStream(STREAM_ALL);
        timer_reset(&t);
        acGridSynchronizeStream(STREAM_ALL);
        acGridIntegrate(STREAM_DEFAULT, dt);
        acGridSynchronizeStream(STREAM_ALL);
        results.push_back(timer_diff_nsec(t) / 1e6);
    }

    // Write benchmark to file
    if (!pid) {
        std::sort(results.begin(), results.end(),
                  [](const double& a, const double& b) { return a < b; });
        fprintf(stdout,
                "Integration step time %g ms (%gth "
                "percentile)--------------------------------------\n",
                results[nth_percentile * num_iters], 100 * nth_percentile);

        char path[4096] = "";
        if (test == TEST_STRONG_SCALING)
            strncpy(path, "strong_scaling.csv", sizeof(path));
        else if (test == TEST_WEAK_SCALING)
            strncpy(path, "weak_scaling.csv", sizeof(path));
        else
            ERROR("Invalid test type");

        FILE* fp = fopen(path, "a");
        ERRCHK_ALWAYS(fp);
        // Format
        // nprocs, measured (ms)
        fprintf(fp, "%d, %g\n", nprocs, results[nth_percentile * num_iters]);

        fclose(fp);
    }*/

    acGridQuit();
    MPI_Finalize();
    return EXIT_SUCCESS;
samples/bwtest/CMakeLists.txt (new file, 9 lines)
@@ -0,0 +1,9 @@
cmake_minimum_required(VERSION 3.17) # Required for modern CUDA::cudart linking

find_package(MPI)
find_package(OpenMP)
find_package(CUDAToolkit)

add_executable(bwtest main.c)
target_link_libraries(bwtest MPI::MPI_C OpenMP::OpenMP_C CUDA::cudart_static CUDA::cuda_driver)
target_compile_options(bwtest PRIVATE -O3)

samples/bwtest/main.c (new file, 525 lines)
@@ -0,0 +1,525 @@
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <mpi.h>

#include <cuda.h> // CUDA driver API
#include <cuda_runtime_api.h>

#include "timer_hires.h" // From src/common

//#define BLOCK_SIZE (100 * 1024 * 1024) // Bytes
#define BLOCK_SIZE (256 * 256 * 3 * 8 * 8)

#define errchk(x) \
    { \
        if (!(x)) { \
            fprintf(stderr, "errchk(%s) failed", #x); \
            assert(x); \
        } \
    }

/*
Findings:
    - MUST ALWAYS SET DEVICE. Absolutely kills performance if device is not set explicitly
    - Need to use cudaMalloc for intranode comm for P2P to trigger with MPI
    - For internode one should use pinned memory (RDMA is staged through pinned, gives full
      network speed iff pinned)
    - Both the sending and receiving arrays must be pinned to see performance improvement
      in internode comm
*/

static uint8_t*
allocHost(const size_t bytes)
{
    uint8_t* arr = malloc(bytes);
    errchk(arr);
    return arr;
}

static void
freeHost(uint8_t* arr)
{
    free(arr);
}

static uint8_t*
allocDevice(const size_t bytes)
{
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
    // const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    errchk(retval == cudaSuccess);
    return arr;
}

static uint8_t*
allocDevicePinned(const size_t bytes)
{
#define USE_CUDA_DRIVER_PINNING (1)
#if USE_CUDA_DRIVER_PINNING
    uint8_t* arr = allocDevice(bytes);

    unsigned int flag = 1;
    CUresult retval = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                            (CUdeviceptr)arr);

    errchk(retval == CUDA_SUCCESS);
    return arr;

#else
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    errchk(retval == cudaSuccess);
    return arr;
#endif
}

/*
static uint8_t*
allocDevicePinned(const size_t bytes)
{
    uint8_t* arr;
    // Standard (20 GiB/s internode, 85 GiB/s intranode)
    // const cudaError_t retval = cudaMalloc((void**)&arr, bytes);
    // Unified mem (5 GiB/s internode, 6 GiB/s intranode)
    // const cudaError_t retval = cudaMallocManaged((void**)&arr, bytes, cudaMemAttachGlobal);
    // Pinned (40 GiB/s internode, 10 GiB/s intranode)
    const cudaError_t retval = cudaMallocHost((void**)&arr, bytes);
    errchk(retval == cudaSuccess);
    return arr;
}*/

static void
freeDevice(uint8_t* arr)
{
    cudaFree(arr);
}

static void
sendrecv_blocking(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    int nfront = (pid + 1) % nprocs;
    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;

    if (!pid) {
        MPI_Status status;
        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
    }
    else {
        MPI_Status status;
        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &status);
        MPI_Send(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD);
    }
}

static void
sendrecv_nonblocking(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    int nfront = (pid + 1) % nprocs;
    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;

    MPI_Request recv_request, send_request;
    MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, nback, MPI_COMM_WORLD, &recv_request);
    MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, MPI_COMM_WORLD, &send_request);

    MPI_Status status;
    MPI_Wait(&recv_request, &status);
    MPI_Wait(&send_request, &status);
}

static void
sendrecv_twoway(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    int nfront = (pid + 1) % nprocs;
    int nback  = (((pid - 1) % nprocs) + nprocs) % nprocs;

    MPI_Status status;
    MPI_Sendrecv(src, BLOCK_SIZE, MPI_BYTE, nfront, pid, dst, BLOCK_SIZE, MPI_BYTE, nback, nback,
                 MPI_COMM_WORLD, &status);
}

static void
sendrecv_nonblocking_multiple(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    MPI_Request recv_requests[nprocs], send_requests[nprocs];
    for (int i = 1; i < nprocs; ++i) {
        int nfront = (pid + i) % nprocs;
        int nback  = (((pid - i) % nprocs) + nprocs) % nprocs;

        MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);
        MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
    }

    for (int i = 1; i < nprocs; ++i) {
        MPI_Status status;
        MPI_Wait(&recv_requests[i], &status);
        MPI_Wait(&send_requests[i], &status);
    }
}

/*
static void
sendrecv_nonblocking_multiple_parallel(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    MPI_Request send_requests[nprocs];
    for (int i = 1; i < nprocs; ++i) {
        int nfront = (pid + i) % nprocs;
        MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
    }

    static bool error_shown = false;
    if (!pid && !error_shown) {
        fprintf(stderr, "\tWARNING: make sure you init MPI_Init_thread for OpenMP support (not "
                        "supported on puhti atm "
                        "2020-04-05)\n");
        error_shown = true;
    }
#pragma omp parallel for
    for (int i = 1; i < nprocs; ++i) {
        int nback = (((pid - i) % nprocs) + nprocs) % nprocs;

        MPI_Status status;
        MPI_Recv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &status);
    }

    for (int i = 1; i < nprocs; ++i) {
        MPI_Status status;
        MPI_Wait(&send_requests[i], &status);
    }
}
*/

static void
sendrecv_nonblocking_multiple_rt_pinning(uint8_t* src, uint8_t* dst)
{
    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    static uint8_t* src_pinned = NULL;
    static uint8_t* dst_pinned = NULL;
    if (!src_pinned)
        src_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed
    if (!dst_pinned)
        dst_pinned = allocDevicePinned(BLOCK_SIZE); // Note: Never freed

    int devices_per_node = -1;
    cudaGetDeviceCount(&devices_per_node);

    MPI_Request recv_requests[nprocs], send_requests[nprocs];
    for (int i = 1; i < nprocs; ++i) {
        int nfront = (pid + i) % nprocs;
        int nback  = (((pid - i) % nprocs) + nprocs) % nprocs;

        if (nback / devices_per_node != pid / devices_per_node) // Not on the same node
            MPI_Irecv(dst_pinned, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD,
                      &recv_requests[i]);
        else
            MPI_Irecv(dst, BLOCK_SIZE, MPI_BYTE, nback, pid, MPI_COMM_WORLD, &recv_requests[i]);

        if (nfront / devices_per_node != pid / devices_per_node) // Not on the same node
            MPI_Isend(src_pinned, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD,
                      &send_requests[i]);
        else
            MPI_Isend(src, BLOCK_SIZE, MPI_BYTE, nfront, nfront, MPI_COMM_WORLD, &send_requests[i]);
    }

    for (int i = 1; i < nprocs; ++i) {
        MPI_Status status;
        MPI_Wait(&recv_requests[i], &status);
        MPI_Wait(&send_requests[i], &status);
    }
}

static void
send_d2h(uint8_t* src, uint8_t* dst)
{
    cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyDeviceToHost);
}

static void
send_h2d(uint8_t* src, uint8_t* dst)
{
    cudaMemcpy(dst, src, BLOCK_SIZE, cudaMemcpyHostToDevice);
}

static void
sendrecv_d2h2d(uint8_t* dsrc, uint8_t* hdst, uint8_t* hsrc, uint8_t* ddst)
{
    cudaStream_t d2h, h2d;
    cudaStreamCreate(&d2h);
    cudaStreamCreate(&h2d);

    cudaMemcpyAsync(hdst, dsrc, BLOCK_SIZE, cudaMemcpyDeviceToHost, d2h);
    cudaMemcpyAsync(ddst, hsrc, BLOCK_SIZE, cudaMemcpyHostToDevice, h2d);

    cudaStreamSynchronize(d2h);
    cudaStreamSynchronize(h2d);

    cudaStreamDestroy(d2h);
    cudaStreamDestroy(h2d);
}

#define PRINT \
    if (!pid) \
    printf

static void
measurebw(const char* msg, const size_t bytes, void (*sendrecv)(uint8_t*, uint8_t*), uint8_t* src,
          uint8_t* dst)
{
    const size_t num_samples = 100;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    PRINT("%s\n", msg);
    MPI_Barrier(MPI_COMM_WORLD);

    PRINT("\tWarming up... ");
    for (size_t i = 0; i < num_samples / 10; ++i)
        sendrecv(src, dst);

    MPI_Barrier(MPI_COMM_WORLD);
    PRINT("Done\n");

    PRINT("\tBandwidth... ");
    fflush(stdout);

    Timer t;
    MPI_Barrier(MPI_COMM_WORLD);
    timer_reset(&t);
    MPI_Barrier(MPI_COMM_WORLD);

    for (size_t i = 0; i < num_samples; ++i)
        sendrecv(src, dst);

    MPI_Barrier(MPI_COMM_WORLD);
    const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
    PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
    MPI_Barrier(MPI_COMM_WORLD);
}

static void
measurebw2(const char* msg, const size_t bytes,
           void (*sendrecv)(uint8_t*, uint8_t*, uint8_t*, uint8_t*), uint8_t* dsrc, uint8_t* hdst,
           uint8_t* hsrc, uint8_t* ddst)
{
    const size_t num_samples = 100;

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    PRINT("%s\n", msg);
    MPI_Barrier(MPI_COMM_WORLD);

    PRINT("\tWarming up... ");
    for (size_t i = 0; i < num_samples / 10; ++i)
        sendrecv(dsrc, hdst, hsrc, ddst);

    MPI_Barrier(MPI_COMM_WORLD);
    PRINT("Done\n");

    PRINT("\tBandwidth... ");
    fflush(stdout);

    Timer t;
    MPI_Barrier(MPI_COMM_WORLD);
    timer_reset(&t);
    MPI_Barrier(MPI_COMM_WORLD);

    for (size_t i = 0; i < num_samples; ++i)
        sendrecv(dsrc, hdst, hsrc, ddst);

    MPI_Barrier(MPI_COMM_WORLD);
    const long double time_elapsed = timer_diff_nsec(t) / 1e9l; // seconds
    PRINT("%Lg GiB/s\n", num_samples * bytes / time_elapsed / (1024 * 1024 * 1024));
    PRINT("\tTransfer time: %Lg ms\n", time_elapsed * 1000 / num_samples);
    MPI_Barrier(MPI_COMM_WORLD);
}

int
main(void)
{
    MPI_Init(NULL, NULL);
    // int provided;
    // MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
    // errchk(provided >= MPI_THREAD_MULTIPLE);

    // Disable stdout buffering
    setbuf(stdout, NULL);

    int pid, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    errchk(nprocs >= 2); // Require at least one neighbor

    MPI_Barrier(MPI_COMM_WORLD);
    if (!pid) {
        printf("Do we have threads? The following should not be ordered (unless very lucky)\n");
#pragma omp parallel for
        for (int i = 0; i < 10; ++i)
            printf("%d, ", i);
        printf("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);

    int devices_per_node = -1;
    cudaGetDeviceCount(&devices_per_node);
    const int device_id = pid % devices_per_node;
    cudaSetDevice(device_id);

    printf("Process %d of %d running.\n", pid, nprocs);
    MPI_Barrier(MPI_COMM_WORLD);

    PRINT("Block size: %u MiB\n", BLOCK_SIZE / (1024 * 1024));

#if 1
    {
        uint8_t* src = allocHost(BLOCK_SIZE);
        uint8_t* dst = allocHost(BLOCK_SIZE);

        measurebw("Unidirectional bandwidth, blocking (Host)", //
                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
        measurebw("Bidirectional bandwidth, async (Host)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
        measurebw("Bidirectional bandwidth, twoway (Host)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Host)", //
                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
        // measurebw("Bidirectional bandwidth, async multiple parallel (Host)", //
        //           2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);

        freeHost(src);
        freeHost(dst);
    }
    PRINT("\n------------------------\n");

    {
        uint8_t* src = allocDevice(BLOCK_SIZE);
        uint8_t* dst = allocDevice(BLOCK_SIZE);

        measurebw("Unidirectional bandwidth, blocking (Device)", //
                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
        measurebw("Bidirectional bandwidth, async (Device)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
        measurebw("Bidirectional bandwidth, twoway (Device)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device)", //
                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);
        // measurebw("Bidirectional bandwidth, async multiple parallel (Device)", //
        //           2 * (nprocs-1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_parallel, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src,
                  dst);

        freeDevice(src);
        freeDevice(dst);
    }
    PRINT("\n------------------------\n");

    {
        uint8_t* src = allocDevicePinned(BLOCK_SIZE);
        uint8_t* dst = allocDevicePinned(BLOCK_SIZE);

        measurebw("Unidirectional bandwidth, blocking (Device, pinned)", //
                  2 * BLOCK_SIZE, sendrecv_blocking, src, dst);
        measurebw("Bidirectional bandwidth, async (Device, pinned)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking, src, dst);
        measurebw("Bidirectional bandwidth, twoway (Device, pinned)", //
                  2 * BLOCK_SIZE, sendrecv_twoway, src, dst);
        measurebw("Bidirectional bandwidth, async multiple (Device, pinned)", //
                  2 * (nprocs - 1) * BLOCK_SIZE, sendrecv_nonblocking_multiple, src, dst);

        freeDevice(src);
        freeDevice(dst);
    }
    PRINT("\n------------------------\n");

    {
        uint8_t* hsrc = allocHost(BLOCK_SIZE);
        uint8_t* hdst = allocHost(BLOCK_SIZE);
        uint8_t* dsrc = allocDevice(BLOCK_SIZE);
        uint8_t* ddst = allocDevice(BLOCK_SIZE);

        measurebw("Unidirectional D2H", BLOCK_SIZE, send_d2h, dsrc, hdst);
        measurebw("Unidirectional H2D", BLOCK_SIZE, send_h2d, hsrc, ddst);

        measurebw2("Bidirectional D2H & H2D", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst, hsrc,
                   ddst);

        freeDevice(dsrc);
        freeDevice(ddst);
        freeHost(hsrc);
        freeHost(hdst);
    }
    PRINT("\n------------------------\n");

    {
        uint8_t* hsrc = allocHost(BLOCK_SIZE);
        uint8_t* hdst = allocHost(BLOCK_SIZE);
        uint8_t* dsrc = allocDevicePinned(BLOCK_SIZE);
        uint8_t* ddst = allocDevicePinned(BLOCK_SIZE);

        measurebw("Unidirectional D2H (pinned)", BLOCK_SIZE, send_d2h, dsrc, hdst);
        measurebw("Unidirectional H2D (pinned)", BLOCK_SIZE, send_h2d, hsrc, ddst);

        measurebw2("Bidirectional D2H & H2D (pinned)", 2 * BLOCK_SIZE, sendrecv_d2h2d, dsrc, hdst,
                   hsrc, ddst);

        freeDevice(dsrc);
        freeDevice(ddst);
        freeHost(hsrc);
        freeHost(hdst);
    }
    PRINT("\n------------------------\n");
#else
    { // Final run for easy identification with the profiler
        uint8_t* src = allocDevice(BLOCK_SIZE);
        uint8_t* dst = allocDevice(BLOCK_SIZE);

        measurebw("Bidirectional bandwidth, async multiple (Device, rt pinning)", //
                  2 * BLOCK_SIZE, sendrecv_nonblocking_multiple_rt_pinning, src, dst);

        freeDevice(src);
        freeDevice(dst);
    }
#endif

    MPI_Finalize();
    return EXIT_SUCCESS;
}
samples/fortrantest/CMakeLists.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
enable_language(Fortran)

add_executable(fortrantest main.f90)
target_link_libraries(fortrantest astaroth_core)
samples/fortrantest/main.f90 (new file, 23 lines)
@@ -0,0 +1,23 @@
program pc
  use, intrinsic :: iso_c_binding
  implicit none

  include "astaroth.f90"

  type(AcMeshInfo) :: info
  type(c_ptr)      :: device

  print *, "Num int params"
  print *, AC_NUM_INT_PARAMS

  ! Setup config
  info%int_params(AC_nx + 1) = 128
  info%int_params(AC_ny + 1) = 128
  info%int_params(AC_nz + 1) = 128
  call acupdatebuiltinparams(info)

  call acdevicecreate(0, info, device)
  call acdeviceprintinfo(device)
  call acdevicedestroy(device)

end program
samples/genbenchmarkscripts/CMakeLists.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
add_executable(genbenchmarkscripts main.c)

add_custom_command(
    TARGET genbenchmarkscripts POST_BUILD
    COMMAND genbenchmarkscripts
    WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
    COMMENT "Generating benchmark scripts"
)
62
samples/genbenchmarkscripts/main.c
Normal file
62
samples/genbenchmarkscripts/main.c
Normal file
@@ -0,0 +1,62 @@
|
||||
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    const int max_nprocs = 128;
    for (int nprocs = 1; nprocs <= max_nprocs; nprocs *= 2) {
        char filename[4096];
        sprintf(filename, "benchmark_%d.sh", nprocs);

        FILE* fp = fopen(filename, "w");
        assert(fp);

        // Boilerplate
        fprintf(fp, "#!/bin/bash\n");
        fprintf(fp, "#SBATCH --job-name=astaroth\n");
        fprintf(fp, "#SBATCH --account=project_2000403\n");
        fprintf(fp, "#SBATCH --time=00:14:59\n");
        fprintf(fp, "#SBATCH --mem=32000\n");
        fprintf(fp, "#SBATCH --partition=gpu\n");
        fprintf(fp, "#SBATCH --cpus-per-task=10\n");

        // nprocs, nodes, gpus
        const int max_gpus_per_node = 4;
        const int gpus_per_node     = nprocs < max_gpus_per_node ? nprocs : max_gpus_per_node;
        const int nodes             = (int)ceil((double)nprocs / max_gpus_per_node);
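        // e.g. nprocs = 8 -> gpus_per_node = 4, nodes = 2;
        //      nprocs = 2 -> gpus_per_node = 2, nodes = 1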
        fprintf(fp, "#SBATCH --gres=gpu:v100:%d\n", gpus_per_node);
        fprintf(fp, "#SBATCH -n %d\n", nprocs);
        fprintf(fp, "#SBATCH -N %d\n", nodes);
        //fprintf(fp, "#SBATCH --exclusive\n");
        if (nprocs > 4)
            fprintf(fp, "#SBATCH --ntasks-per-socket=2\n");

        // Modules
        // OpenMPI
        fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake openmpi nccl\n");
        // HPCX
        //fprintf(fp, "module load gcc/8.3.0 cuda/10.1.168 cmake hpcx-mpi/2.5.0-cuda nccl\n");
        fprintf(fp, "export UCX_MEMTYPE_CACHE=n\n");

        // Profile and run
        //fprintf(fp, "mkdir -p profile_%d\n", nprocs);

        const int nx = 256; // max size 1792
        const int ny = nx;
        const int nz = nx;
        /*
        fprintf(fp,
                //"srun nvprof --annotate-mpi openmpi -o profile_%d/%%p.nvprof ./benchmark %d %d "
                //"%d\n",
                "srun ./benchmark %d %d %d\n", nx, ny, nz);
        */
        fprintf(fp, "srun ./benchmark %d %d %d\n", nx, ny, nz);

        fclose(fp);
    }

    return EXIT_SUCCESS;
}

samples/mpi_reduce_bench/CMakeLists.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
## benchmark
add_executable(mpi_reduce_bench main.cc)
target_link_libraries(mpi_reduce_bench astaroth_core astaroth_utils)

samples/mpi_reduce_bench/main.cc (new file, 169 lines)
@@ -0,0 +1,169 @@
/*
    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
    Running: mpirun -np <num processes> <executable>
*/
#include "astaroth.h"
#include "astaroth_utils.h"

#include "errchk.h"
#include "timer_hires.h"

#if AC_MPI_ENABLED

#include <mpi.h>

#include <algorithm>
#include <string.h>

#include <string>
#include <vector>
#include <chrono>
#include <ctime>

int
main(int argc, char** argv)
{
    MPI_Init(NULL, NULL);
    int nprocs, pid;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);

    // CPU alloc
    AcMeshInfo info;
    acLoadConfig(AC_DEFAULT_CONFIG, &info);

    char* benchmark_label;

    if (argc > 1) {
        benchmark_label = argv[1];
    } else {
        auto timestamp  = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
        benchmark_label = std::ctime(&timestamp);
        benchmark_label[strcspn(benchmark_label, "\n")] = '\0';
    }

    // clang-format off
    std::vector<AcScalReductionTestCase> scalarReductionTests {
        acCreateScalReductionTestCase("Scalar MAX"    , VTXBUF_UUX, RTYPE_MAX),
        acCreateScalReductionTestCase("Scalar MIN"    , VTXBUF_UUX, RTYPE_MIN),
        acCreateScalReductionTestCase("Scalar RMS"    , VTXBUF_UUX, RTYPE_RMS),
        acCreateScalReductionTestCase("Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP),
        acCreateScalReductionTestCase("Scalar SUM"    , VTXBUF_UUX, RTYPE_SUM)
    };

    std::vector<AcVecReductionTestCase> vectorReductionTests {
        acCreateVecReductionTestCase("Vector MAX"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
        acCreateVecReductionTestCase("Vector MIN"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
        acCreateVecReductionTestCase("Vector RMS"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
        acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS_EXP),
        acCreateVecReductionTestCase("Vector SUM"    , VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM)
    };
    // clang-format on

    // GPU alloc & compute
    acGridInit(info);

    const size_t num_iters      = 100;
    const double nth_percentile = 0.90;
    std::vector<double> results; // ms
    Timer t;
    // Scalar benchmarks
    for (auto& testCase : scalarReductionTests) {
        results.clear();
        results.reserve(num_iters);

        for (size_t i = 0; i < num_iters; ++i) {
            acGridSynchronizeStream(STREAM_ALL);
            timer_reset(&t);
            acGridSynchronizeStream(STREAM_ALL);
            acGridReduceScal(STREAM_DEFAULT, testCase.rtype, testCase.vtxbuf, &testCase.candidate);
            acGridSynchronizeStream(STREAM_ALL);
            results.push_back(timer_diff_nsec(t) / 1e6);
            acGridSynchronizeStream(STREAM_ALL);
        }

        if (!pid) {
            std::sort(results.begin(), results.end(),
                      [](const double& a, const double& b) { return a < b; });
            fprintf(stdout,
                    "Reduction time %g ms (%gth "
                    "percentile)--------------------------------------\n",
                    results[nth_percentile * num_iters], 100 * nth_percentile);

            char path[4096] = "mpi_reduction_benchmark.csv";

            FILE* fp = fopen(path, "a");
            ERRCHK_ALWAYS(fp);

            // Format
            // benchmark label, test label, nprocs, measured (ms)
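            // e.g. a resulting row (illustrative values): "ab12cd3","Scalar MAX", 8, 0.42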
            fprintf(fp, "\"%s\",\"%s\", %d, %g\n", benchmark_label, testCase.label, nprocs,
                    results[nth_percentile * num_iters]);
            fclose(fp);
        }
    }

    // Vector benchmarks
    for (auto& testCase : vectorReductionTests) {
        results.clear();
        results.reserve(num_iters);

        for (size_t i = 0; i < num_iters; ++i) {
            acGridSynchronizeStream(STREAM_ALL);
            timer_reset(&t);
            acGridSynchronizeStream(STREAM_ALL);
            acGridReduceVec(STREAM_DEFAULT, testCase.rtype, testCase.a, testCase.b, testCase.c,
                            &testCase.candidate);
            acGridSynchronizeStream(STREAM_ALL);
            results.push_back(timer_diff_nsec(t) / 1e6);
            acGridSynchronizeStream(STREAM_ALL);
        }

        if (!pid) {
            std::sort(results.begin(), results.end(),
                      [](const double& a, const double& b) { return a < b; });
            fprintf(stdout,
                    "Reduction time %g ms (%gth "
                    "percentile)--------------------------------------\n",
                    results[nth_percentile * num_iters], 100 * nth_percentile);

            char path[4096] = "mpi_reduction_benchmark.csv";

            FILE* fp = fopen(path, "a");
            ERRCHK_ALWAYS(fp);

            // Format
            // benchmark label, test label, nprocs, measured (ms)
            fprintf(fp, "\"%s\",\"%s\", %d, %g\n", benchmark_label, testCase.label, nprocs,
                    results[nth_percentile * num_iters]);
            fclose(fp);
        }
    }
    acGridQuit();
    MPI_Finalize();
    return EXIT_SUCCESS;
}

#else
int
main(void)
{
    printf("The library was built without MPI support, cannot run mpi_reduce_bench. Rebuild "
           "Astaroth with cmake -DMPI_ENABLED=ON .. to enable.\n");
    return EXIT_FAILURE;
}
#endif // AC_MPI_ENABLED

samples/mpi_reduce_bench/mpibench.sh (new executable file, 84 lines)
@@ -0,0 +1,84 @@
#!/bin/bash

# Defaults
default_num_procs=8
default_num_nodes=2
default_partition=gpu

num_procs=$default_num_procs
num_nodes=$default_num_nodes

partition=$default_partition

script_name=$0

print_usage(){
    echo "Usage: $script_name [Options]"
    echo " Runs ./mpi_reduce_bench, which will write benchmark results to a csv file"
    echo " Remember to run this script from your build directory"
    echo " The benchmarks are submitted with sbatch, unless the -i option is passed"
    echo "Options:"
    echo " -n <num_procs>"
    echo "    number of tasks for slurm, default=$default_num_procs"
    echo " -p <partition>"
    echo "    which partition to use for slurm, default=$default_partition"
    echo " -t <tag>"
    echo "    A benchmark tag that will be added to the mpi_reduction_benchmark.csv file"
    echo "    By default the current git HEAD short hash will be used as a tag"
    echo " -i"
    echo "    Run the benchmark interactively with srun instead of sbatch"
    echo " -h"
    echo "    Print this message"
}
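
# Example invocation (assumed workflow, run from the build directory):
#   ./mpibench.sh -n 16 -p gpu -t my-feature-branch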

while getopts :n:t:p:ih opt
do
    case "$opt" in
        n)
            num_procs=$OPTARG
            num_nodes=$(( 1 + ($num_procs - 1)/4 ))
            ;;
        t)
            benchmark_label=$OPTARG
            ;;
        i)
            interactively=1
            ;;
        p)
            partition=$OPTARG
            ;;
        h)
            print_usage
            exit 0
            ;;
        ?)
            print_usage
            exit 1
    esac
done

if [ -z "$benchmark_label" ]
then
    benchmark_label=$(git rev-parse --short HEAD)
fi
set -x

if [ -z "$interactively" ]
then
    sbatch <<EOF
#!/bin/sh
#SBATCH --job-name=astaroth
#SBATCH --account=project_2000403
#SBATCH --time=00:14:59
#SBATCH --mem=48000
#SBATCH --partition=${partition}
#SBATCH --gres=gpu:v100:4
#SBATCH -n ${num_procs}
#SBATCH -N ${num_nodes}
srun ./mpi_reduce_bench ${benchmark_label}
EOF
else
    srun --account=project_2000403 --gres=gpu:v100:4 --mem=48000 -t 00:14:59 -p ${partition} -n ${num_procs} -N ${num_nodes} ./mpi_reduce_bench ${benchmark_label}
fi

@@ -25,6 +25,7 @@
#if AC_MPI_ENABLED

#include <mpi.h>
#include <vector>

int
main(void)
@@ -48,24 +49,65 @@ main(void)
    // GPU alloc & compute
    acGridInit(info);
    acGridLoadMesh(model, STREAM_DEFAULT);

    // INTEGRATION TESTS START ---------------------------------------------------------------------
    acGridLoadMesh(model, STREAM_DEFAULT);
    acGridIntegrate(STREAM_DEFAULT, FLT_EPSILON);
    acGridPeriodicBoundconds(STREAM_DEFAULT);

    acGridStoreMesh(STREAM_DEFAULT, &candidate);
    acGridQuit();

    // Verify
    if (pid == 0) {
        acModelIntegrateStep(model, FLT_EPSILON);
        acMeshApplyPeriodicBounds(&model);

        acVerifyMesh(model, candidate);
    }
    // INTEGRATION TESTS END -----------------------------------------------------------------------

    // REDUCTION TESTS START -----------------------------------------------------------------------
    acGridLoadMesh(model, STREAM_DEFAULT);

    std::vector<AcScalReductionTestCase> scalarReductionTests{
        acCreateScalReductionTestCase("Scalar MAX", VTXBUF_UUX, RTYPE_MAX),
        acCreateScalReductionTestCase("Scalar MIN", VTXBUF_UUX, RTYPE_MIN),
        /*
        acCreateScalReductionTestCase("Scalar RMS", VTXBUF_UUX, RTYPE_RMS),
        acCreateScalReductionTestCase("Scalar RMS_EXP", VTXBUF_UUX, RTYPE_RMS_EXP),
        acCreateScalReductionTestCase("Scalar SUM", VTXBUF_UUX, RTYPE_SUM),
        */
    };
    std::vector<AcVecReductionTestCase> vectorReductionTests{
        acCreateVecReductionTestCase("Vector MAX", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MAX),
        acCreateVecReductionTestCase("Vector MIN", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_MIN),
        /*
        acCreateVecReductionTestCase("Vector RMS", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_RMS),
        acCreateVecReductionTestCase("Vector RMS_EXP", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ,
                                     RTYPE_RMS_EXP),
        acCreateVecReductionTestCase("Vector SUM", VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, RTYPE_SUM),
        */
    };
    // False positives due to too strict error bounds, skip the tests until we can determine a
    // proper error bound
    fprintf(stderr, "WARNING: RTYPE_RMS, RTYPE_RMS_EXP, and RTYPE_SUM tests skipped\n");

    for (auto& testCase : scalarReductionTests) {
        acGridReduceScal(STREAM_DEFAULT, testCase.rtype, testCase.vtxbuf, &testCase.candidate);
    }
    for (auto& testCase : vectorReductionTests) {
        acGridReduceVec(STREAM_DEFAULT, testCase.rtype, testCase.a, testCase.b, testCase.c,
                        &testCase.candidate);
    }
    if (pid == 0) {
        acVerifyScalReductions(model, scalarReductionTests.data(), scalarReductionTests.size());
        acVerifyVecReductions(model, vectorReductionTests.data(), vectorReductionTests.size());
    }
    // REDUCTION TESTS END -------------------------------------------------------------------------

    if (pid == 0) {
        acMeshDestroy(&model);
        acMeshDestroy(&candidate);
    }

    acGridQuit();
    MPI_Finalize();
    return EXIT_SUCCESS;
}
@@ -74,7 +116,8 @@ main(void)
int
main(void)
{
    printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with cmake -DMPI_ENABLED=ON .. to enable.\n");
    printf("The library was built without MPI support, cannot run mpitest. Rebuild Astaroth with "
           "cmake -DMPI_ENABLED=ON .. to enable.\n");
    return EXIT_FAILURE;
}
#endif // AC_MPI_ENABLED

@@ -105,7 +105,8 @@ operator+(const int3& a, const int3& b)
    return (int3){a.x + b.x, a.y + b.y, a.z + b.z};
}

static HOST_DEVICE_INLINE int3 operator*(const int3& a, const int3& b)
static HOST_DEVICE_INLINE int3
operator*(const int3& a, const int3& b)
{
    return (int3){a.x * b.x, a.y * b.y, a.z * b.z};
}
@@ -144,12 +145,20 @@ operator-=(AcReal3& lhs, const AcReal3& rhs)
    lhs.z -= rhs.z;
}

static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal& a, const AcReal3& b)
static HOST_DEVICE_INLINE int3
operator*(const int& a, const int3& b)
{
    return (int3){a * b.x, a * b.y, a * b.z};
}

static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal& a, const AcReal3& b)
{
    return (AcReal3){a * b.x, a * b.y, a * b.z};
}

static HOST_DEVICE_INLINE AcReal3 operator*(const AcReal3& b, const AcReal& a)
static HOST_DEVICE_INLINE AcReal3
operator*(const AcReal3& b, const AcReal& a)
{
    return (AcReal3){a * b.x, a * b.y, a * b.z};
}

@@ -1,10 +1,14 @@
find_package(CUDAToolkit)

## Astaroth Core
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc)
target_link_libraries(astaroth_core astaroth_utils astaroth_kernels cudart)
add_library(astaroth_core STATIC device.cc node.cc astaroth.cc astaroth_fortran.cc)
target_link_libraries(astaroth_core astaroth_kernels CUDA::cudart CUDA::cuda_driver)

## Options
if (MPI_ENABLED)
    find_package(MPI)
    #find_package(MPI REQUIRED)
    #find_package(OpenMP)
    #target_link_libraries(astaroth_core MPI::MPI_CXX OpenMP::OpenMP_CXX)
    target_link_libraries(astaroth_core MPI::MPI_CXX)
endif()

@@ -158,3 +158,65 @@ acGetNode(void)
    ERRCHK_ALWAYS(num_nodes > 0);
    return nodes[0];
}

AcResult
acUpdateBuiltinParams(AcMeshInfo* config)
{
    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
    ///////////// PAD TEST
    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
    ///////////// PAD TEST
    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;

    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
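    // e.g. nx = 128 with STENCIL_ORDER = 6 gives mx = 134, nx_min = 3, nx_max = 131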
    config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
    config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
    config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
    config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
    config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;

    // These do not have to be defined by empty projects any more.
    // These should be set only if stdderiv.h is included
#ifdef AC_dsx
    config->real_params[AC_inv_dsx] = (AcReal)(1.) / config->real_params[AC_dsx];
#endif
#ifdef AC_dsy
    config->real_params[AC_inv_dsy] = (AcReal)(1.) / config->real_params[AC_dsy];
#endif
#ifdef AC_dsz
    config->real_params[AC_inv_dsz] = (AcReal)(1.) / config->real_params[AC_dsz];
#endif

    /* Additional helper params */
    // Int helpers
    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];

    return AC_SUCCESS;
}

AcResult
acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
{
    mesh->info = info;

    const size_t bytes = acVertexBufferSizeBytes(mesh->info);
    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
        mesh->vertex_buffer[w] = (AcReal*)malloc(bytes);
        ERRCHK_ALWAYS(mesh->vertex_buffer[w]);
    }

    return AC_SUCCESS;
}

AcResult
acMeshDestroy(AcMesh* mesh)
{
    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
        free(mesh->vertex_buffer[w]);

    return AC_SUCCESS;
}

src/core/astaroth_fortran.cc (new file, 119 lines)
@@ -0,0 +1,119 @@
#include "astaroth_fortran.h"
|
||||
|
||||
#include "astaroth.h"
|
||||
#include "errchk.h"
|
||||
|
||||
/**
|
||||
* Utils
|
||||
*/
|
||||
void
|
||||
acupdatebuiltinparams_(AcMeshInfo* info)
|
||||
{
|
||||
acUpdateBuiltinParams(info);
|
||||
}
|
||||
|
||||
void
|
||||
acgetdevicecount_(int* count)
|
||||
{
|
||||
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(count));
|
||||
}
|
||||
|
||||
/**
|
||||
* Device
|
||||
*/
|
||||
void
|
||||
acdevicecreate_(const int* id, const AcMeshInfo* info, Device* handle)
|
||||
{
|
||||
acDeviceCreate(*id, *info, handle);
|
||||
}
|
||||
|
||||
void
|
||||
acdevicedestroy_(Device* device)
|
||||
{
|
||||
acDeviceDestroy(*device);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceprintinfo_(const Device* device)
|
||||
{
|
||||
acDevicePrintInfo(*device);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info)
|
||||
{
|
||||
acDeviceLoadMeshInfo(*device, *info);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
|
||||
const int* num_farrays, AcReal* farray)
|
||||
{
|
||||
ERRCHK_ALWAYS(*num_farrays >= NUM_VTXBUF_HANDLES);
|
||||
WARNCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
|
||||
const size_t mxyz = info->int_params[AC_mx] * info->int_params[AC_mx] * info->int_params[AC_mx];
|
||||
|
||||
AcMesh mesh;
|
||||
mesh.info = *info;
|
||||
for (int i = 0; i < *num_farrays; ++i)
|
||||
mesh.vertex_buffer[i] = &farray[i * mxyz];
|
||||
|
||||
acDeviceLoadMesh(*device, *stream, mesh);
|
||||
}
|
||||
|
||||
void
|
||||
acdevicestoremesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
|
||||
const int* num_farrays, AcReal* farray)
|
||||
{
|
||||
ERRCHK_ALWAYS(*num_farrays >= NUM_VTXBUF_HANDLES);
|
||||
WARNCHK_ALWAYS(*num_farrays == NUM_VTXBUF_HANDLES);
|
||||
AcMesh mesh;
|
||||
mesh.info = *info;
|
||||
|
||||
const size_t mxyz = info->int_params[AC_mx] * info->int_params[AC_mx] * info->int_params[AC_mx];
|
||||
for (int i = 0; i < *num_farrays; ++i)
|
||||
mesh.vertex_buffer[i] = &farray[i * mxyz];
|
||||
|
||||
acDeviceStoreMesh(*device, *stream, &mesh);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceintegratesubstep_(const Device* device, const Stream* stream, const int* step_number,
|
||||
const int3* start, const int3* end, const AcReal* dt)
|
||||
{
|
||||
acDeviceIntegrateSubstep(*device, *stream, *step_number, *start, *end, *dt);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceperiodicboundconds_(const Device* device, const Stream* stream, const int3* start,
|
||||
const int3* end)
|
||||
{
|
||||
acDevicePeriodicBoundconds(*device, *stream, *start, *end);
|
||||
}
|
||||
|
||||
void
|
||||
acdeviceswapbuffers_(const Device* device)
|
||||
{
|
||||
acDeviceSwapBuffers(*device);
|
||||
}
|
||||
|
||||
void
|
||||
acdevicereducescal_(const Device* device, const Stream* stream, const ReductionType* rtype,
|
||||
const VertexBufferHandle* vtxbuf_handle, AcReal* result)
|
||||
{
|
||||
acDeviceReduceScal(*device, *stream, *rtype, *vtxbuf_handle, result);
|
||||
}
|
||||
|
||||
void
|
||||
acdevicereducevec_(const Device* device, const Stream* stream, const ReductionType* rtype,
|
||||
const VertexBufferHandle* vtxbuf0, const VertexBufferHandle* vtxbuf1,
|
||||
const VertexBufferHandle* vtxbuf2, AcReal* result)
|
||||
{
|
||||
acDeviceReduceVec(*device, *stream, *rtype, *vtxbuf0, *vtxbuf1, *vtxbuf2, result);
|
||||
}
|
||||
|
||||
void
|
||||
acdevicesynchronizestream_(const Device* device, const Stream* stream)
|
||||
{
|
||||
acDeviceSynchronizeStream(*device, *stream);
|
||||
}
|
src/core/astaroth_fortran.h (new file, 50 lines)
@@ -0,0 +1,50 @@
#pragma once
#include "astaroth.h"

#ifdef __cplusplus
extern "C" {
#endif
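
// The symbols below follow the traditional Fortran-to-C convention (assumed
// here to match the Fortran compiler's default name mangling, e.g. gfortran's):
// lowercase names with a trailing underscore, all arguments passed by reference.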

/**
 * Utils
 */
void acupdatebuiltinparams_(AcMeshInfo* info);

void acgetdevicecount_(int* count);

/**
 * Device
 */
void acdevicecreate_(const int* id, const AcMeshInfo* info, Device* handle);

void acdevicedestroy_(Device* device);

void acdeviceprintinfo_(const Device* device);

void acdeviceloadmeshinfo_(const Device* device, const AcMeshInfo* info);

void acdeviceloadmesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
                       const int* num_farrays, AcReal* farray);

void acdevicestoremesh_(const Device* device, const Stream* stream, const AcMeshInfo* info,
                        const int* num_farrays, AcReal* farray);

void acdeviceintegratesubstep_(const Device* device, const Stream* stream, const int* step_number,
                               const int3* start, const int3* end, const AcReal* dt);
void acdeviceperiodicboundconds_(const Device* device, const Stream* stream, const int3* start,
                                 const int3* end);

void acdeviceswapbuffers_(const Device* device);

void acdevicereducescal_(const Device* device, const Stream* stream, const ReductionType* rtype,
                         const VertexBufferHandle* vtxbuf_handle, AcReal* result);

void acdevicereducevec_(const Device* device, const Stream* stream, const ReductionType* rtype,
                        const VertexBufferHandle* vtxbuf0, const VertexBufferHandle* vtxbuf1,
                        const VertexBufferHandle* vtxbuf2, AcReal* result);

void acdevicesynchronizestream_(const Device* device, const Stream* stream);

#ifdef __cplusplus
} // extern "C"
#endif

src/core/device.cc (1119 lines changed; diff suppressed because it is too large)
@@ -44,7 +44,8 @@ rk3_integrate(const AcReal3 state_previous, const AcReal3 state_current,
    return (AcReal3){
        rk3_integrate<step_number>(state_previous.x, state_current.x, rate_of_change.x, dt),
        rk3_integrate<step_number>(state_previous.y, state_current.y, rate_of_change.y, dt),
        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt)};
        rk3_integrate<step_number>(state_previous.z, state_current.z, rate_of_change.z, dt),
    };
}

#define rk3(state_previous, state_current, rate_of_change, dt) \
@@ -192,9 +193,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
        }
    }
#if VERBOSE_PRINTING
    printf(
        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
    printf("Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f "
           "ms\n",
           best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
    /*
    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
@@ -204,6 +205,9 @@ acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferAr
    */

    rk3_tpb = best_dims;

    // Failed to find valid thread block dimensions
    ERRCHK_ALWAYS(rk3_tpb.x * rk3_tpb.y * rk3_tpb.z > 0);
    return AC_SUCCESS;
}

@@ -211,6 +215,8 @@ AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start,
                         const int3 end, VertexBufferArray vba)
{
    ERRCHK_ALWAYS(step_number >= 0);
    ERRCHK_ALWAYS(step_number < 3);
    const dim3 tpb = rk3_tpb;

    const int3 n = end - start;

@@ -75,17 +75,20 @@ exp(const acComplex& val)
{
    return acComplex(exp(val.x) * cos(val.y), exp(val.x) * sin(val.y));
}
static __device__ inline acComplex operator*(const AcReal& a, const acComplex& b)
static __device__ inline acComplex
operator*(const AcReal& a, const acComplex& b)
{
    return (acComplex){a * b.x, a * b.y};
}

static __device__ inline acComplex operator*(const acComplex& b, const AcReal& a)
static __device__ inline acComplex
operator*(const acComplex& b, const AcReal& a)
{
    return (acComplex){a * b.x, a * b.y};
}

static __device__ inline acComplex operator*(const acComplex& a, const acComplex& b)
static __device__ inline acComplex
operator*(const acComplex& a, const acComplex& b)
{
    return (acComplex){a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
@@ -116,7 +119,7 @@ acDeviceLoadScalarUniform(const Device device, const Stream stream, const AcReal

    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}

@@ -141,7 +144,7 @@ acDeviceLoadVectorUniform(const Device device, const Stream stream, const AcReal

    const size_t offset = (size_t)&d_mesh_info.real3_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}

@@ -165,7 +168,7 @@ acDeviceLoadIntUniform(const Device device, const Stream stream, const AcIntPara

    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}

@@ -179,10 +182,10 @@ acDeviceLoadInt3Uniform(const Device device, const Stream stream, const AcInt3Pa
    }

    if (!is_valid(value.x) || !is_valid(value.y) || !is_valid(value.z)) {
        fprintf(
            stderr,
            "WARNING: Passed an invalid value (%d, %d, %d) to device constant %s. Skipping.\n",
            value.x, value.y, value.z, int3param_names[param]);
        fprintf(stderr,
                "WARNING: Passed an invalid value (%d, %d, %d) to device constant %s. "
                "Skipping.\n",
                value.x, value.y, value.z, int3param_names[param]);
        return AC_FAILURE;
    }

@@ -229,7 +232,7 @@ acDeviceLoadDefaultUniforms(const Device device)
{
    cudaSetDevice(device->id);

    // clang-format off
    // clang-format off
    // Scalar
#define LOAD_DEFAULT_UNIFORM(X) acDeviceLoadScalarUniform(device, STREAM_DEFAULT, X, X##_DEFAULT_VALUE);
    AC_FOR_USER_REAL_PARAM_TYPES(LOAD_DEFAULT_UNIFORM)

@@ -1,9 +1,19 @@
#pragma once
#include "astaroth.h"

#if AC_MPI_ENABLED
#include <mpi.h>
#include <stdbool.h>

#define MPI_GPUDIRECT_DISABLED (0)
#endif // AC_MPI_ENABLED

typedef struct {
    int3 dims;
    AcReal* data;

    AcReal* data_pinned;
    bool pinned = false; // Set if data was received to pinned memory
} PackedData;

typedef struct {

@@ -92,8 +92,9 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr
    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);

    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)]);
    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(src0[IDX(src_idx)],
                                                                   src1[IDX(src_idx)],
                                                                   src2[IDX(src_idx)]);
}

template <ReduceFunc reduce>

@@ -1,3 +1,3 @@
## Astaroth Utils
add_library(astaroth_utils STATIC config_loader.c memory.c verification.c modelsolver.c)
add_library(astaroth_utils STATIC config_loader.c memory.c verification.c modelsolver.c modelreduce.c)
add_dependencies(astaroth_utils dsl_headers)

@@ -74,42 +74,6 @@ parse_config(const char* path, AcMeshInfo* config)
    fclose(fp);
}

AcResult
acUpdateBuiltinParams(AcMeshInfo* config)
{
    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
    ///////////// PAD TEST
    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
    ///////////// PAD TEST
    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;

    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
    config->int_params[AC_nx_min] = STENCIL_ORDER / 2;
    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
    config->int_params[AC_ny_min] = STENCIL_ORDER / 2;
    config->int_params[AC_ny_max] = config->int_params[AC_ny] + STENCIL_ORDER / 2;
    config->int_params[AC_nz_min] = STENCIL_ORDER / 2;
    config->int_params[AC_nz_max] = config->int_params[AC_nz] + STENCIL_ORDER / 2;

    /*
    // DEPRECATED: Spacing TODO
    // These do not have to be defined by empty projects any more.
    // These should be set only if stdderiv.h is included
    config->real_params[AC_inv_dsx] = (AcReal)(1.) / config->real_params[AC_dsx];
    config->real_params[AC_inv_dsy] = (AcReal)(1.) / config->real_params[AC_dsy];
    config->real_params[AC_inv_dsz] = (AcReal)(1.) / config->real_params[AC_dsz];
    */

    /* Additional helper params */
    // Int helpers
    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];

    return AC_SUCCESS;
}

/**
\brief Loads data from astaroth.conf into a config struct.
\return AC_SUCCESS on success, AC_FAILURE if there are potentially uninitialized values.

@@ -20,29 +20,6 @@

#include "errchk.h"

AcResult
acMeshCreate(const AcMeshInfo info, AcMesh* mesh)
{
    mesh->info = info;

    const size_t bytes = acVertexBufferSizeBytes(mesh->info);
    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w) {
        mesh->vertex_buffer[w] = malloc(bytes);
        ERRCHK_ALWAYS(mesh->vertex_buffer[w]);
    }

    return AC_SUCCESS;
}

AcResult
acMeshDestroy(AcMesh* mesh)
{
    for (int w = 0; w < NUM_VTXBUF_HANDLES; ++w)
        free(mesh->vertex_buffer[w]);

    return AC_SUCCESS;
}

AcResult
acMeshSet(const AcReal value, AcMesh* mesh)
{

src/utils/modelreduce.c (new file, 209 lines)
@@ -0,0 +1,209 @@
/*
    Copyright (C) 2014-2020, Johannes Pekkila, Miikka Vaisala.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Model (CPU) implementations of the reductions, used for verification.
 */
#include "astaroth.h"

#include <math.h>

#include "errchk.h"

#if AC_DOUBLE_PRECISION == 0 // HACK TODO fix, make cleaner (purkkaratkaisu)
#define fabs fabsf
#define exp expf
#define sqrt sqrtf
#endif

// Function pointer definitions
typedef AcReal (*ReduceFunc)(const AcReal, const AcReal);
typedef AcReal (*ReduceInitialScalFunc)(const AcReal);
typedef AcReal (*ReduceInitialVecFunc)(const AcReal, const AcReal, const AcReal);

// clang-format off
/* Comparison funcs */
static inline AcReal
max(const AcReal a, const AcReal b) { return a > b ? a : b; }

static inline AcReal
min(const AcReal a, const AcReal b) { return a < b ? a : b; }

static inline AcReal
sum(const AcReal a, const AcReal b) { return a + b; }

/* Function used to determine the values used during reduction */
static inline AcReal
length_scal(const AcReal a) { return (AcReal)(a); }

static inline AcReal
length_vec(const AcReal a, const AcReal b, const AcReal c) { return sqrt(a*a + b*b + c*c); }

static inline AcReal
squared_scal(const AcReal a) { return (AcReal)(a*a); }

static inline AcReal
squared_vec(const AcReal a, const AcReal b, const AcReal c) { return squared_scal(a) + squared_scal(b) + squared_scal(c); }

static inline AcReal
exp_squared_scal(const AcReal a) { return exp(a)*exp(a); }

static inline AcReal
exp_squared_vec(const AcReal a, const AcReal b, const AcReal c) { return exp_squared_scal(a) + exp_squared_scal(b) + exp_squared_scal(c); }
// clang-format on
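
// The model reductions compose the helpers above: res = reduce(res,
// reduce_initial(x)) over the computational domain, then sqrt(res / nxyz) if
// solve_mean is set. E.g. RTYPE_RMS pairs squared_scal with sum, yielding
// sqrt(sum(x^2) / n), the root mean square.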

AcReal
acModelReduceScal(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a)
{
    ReduceInitialScalFunc reduce_initial;
    ReduceFunc reduce;

    bool solve_mean = false;

    switch (rtype) {
    case RTYPE_MAX:
        reduce_initial = length_scal;
        reduce         = max;
        break;
    case RTYPE_MIN:
        reduce_initial = length_scal;
        reduce         = min;
        break;
    case RTYPE_RMS:
        reduce_initial = squared_scal;
        reduce         = sum;
        solve_mean     = true;
        break;
    case RTYPE_RMS_EXP:
        reduce_initial = exp_squared_scal;
        reduce         = sum;
        solve_mean     = true;
        break;
    case RTYPE_SUM:
        reduce_initial = length_scal;
        reduce         = sum;
        break;
    default:
        ERROR("Unrecognized RTYPE");
    }

    const int initial_idx = acVertexBufferIdx(mesh.info.int_params[AC_nx_min],
                                              mesh.info.int_params[AC_ny_min],
                                              mesh.info.int_params[AC_nz_min], mesh.info);

    AcReal res;
    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
        res = reduce_initial(mesh.vertex_buffer[a][initial_idx]);
    else
        res = 0;

    for (int k = mesh.info.int_params[AC_nz_min]; k < mesh.info.int_params[AC_nz_max]; ++k) {
        for (int j = mesh.info.int_params[AC_ny_min]; j < mesh.info.int_params[AC_ny_max]; ++j) {
            for (int i = mesh.info.int_params[AC_nx_min]; i < mesh.info.int_params[AC_nx_max];
                 ++i) {
                const int idx         = acVertexBufferIdx(i, j, k, mesh.info);
                const AcReal curr_val = reduce_initial(mesh.vertex_buffer[a][idx]);
                res                   = reduce(res, curr_val);
            }
        }
    }

    if (solve_mean) {
        const AcReal inv_n = (AcReal)1.0 / mesh.info.int_params[AC_nxyz];
        return sqrt(inv_n * res);
    }
    else {
        return res;
    }
}

AcReal
acModelReduceVec(const AcMesh mesh, const ReductionType rtype, const VertexBufferHandle a,
                 const VertexBufferHandle b, const VertexBufferHandle c)
{
    // AcReal (*reduce_initial)(AcReal, AcReal, AcReal);
    ReduceInitialVecFunc reduce_initial;
    ReduceFunc reduce;

    bool solve_mean = false;

    switch (rtype) {
    case RTYPE_MAX:
        reduce_initial = length_vec;
        reduce         = max;
        break;
    case RTYPE_MIN:
        reduce_initial = length_vec;
        reduce         = min;
        break;
    case RTYPE_RMS:
        reduce_initial = squared_vec;
        reduce         = sum;
        solve_mean     = true;
        break;
    case RTYPE_RMS_EXP:
        reduce_initial = exp_squared_vec;
        reduce         = sum;
        solve_mean     = true;
        break;
    case RTYPE_SUM:
        reduce_initial = length_vec;
        reduce         = sum;
        break;
    default:
        ERROR("Unrecognized RTYPE");
    }

    const int initial_idx = acVertexBufferIdx(mesh.info.int_params[AC_nx_min],
                                              mesh.info.int_params[AC_ny_min],
                                              mesh.info.int_params[AC_nz_min], mesh.info);

    AcReal res;
    if (rtype == RTYPE_MAX || rtype == RTYPE_MIN)
        res = reduce_initial(mesh.vertex_buffer[a][initial_idx], mesh.vertex_buffer[b][initial_idx],
                             mesh.vertex_buffer[c][initial_idx]);
    else
        res = 0;

    for (int k = mesh.info.int_params[AC_nz_min]; k < mesh.info.int_params[AC_nz_max]; k++) {
        for (int j = mesh.info.int_params[AC_ny_min]; j < mesh.info.int_params[AC_ny_max]; j++) {
            for (int i = mesh.info.int_params[AC_nx_min]; i < mesh.info.int_params[AC_nx_max];
                 i++) {
                const int idx         = acVertexBufferIdx(i, j, k, mesh.info);
                const AcReal curr_val = reduce_initial(mesh.vertex_buffer[a][idx],
                                                       mesh.vertex_buffer[b][idx],
                                                       mesh.vertex_buffer[c][idx]);
                res                   = reduce(res, curr_val);
            }
        }
    }

    if (solve_mean) {
        const AcReal inv_n = (AcReal)1.0 / mesh.info.int_params[AC_nxyz];
        return sqrt(inv_n * res);
    }
    else {
        return res;
    }
}

@@ -38,9 +38,10 @@
#define LMAGNETIC (1)
#define LENTROPY (1)
#define LTEMPERATURE (0)
#define LFORCING (0)
#define LFORCING (1)
#define LUPWD (1)
#define AC_THERMAL_CONDUCTIVITY ((Scalar)(0.001)) // TODO: make an actual config parameter
#define R_PI ((Scalar)M_PI)

typedef AcReal Scalar;
// typedef AcReal3 Vector;
@@ -54,6 +55,8 @@ typedef float Vector __attribute__((vector_size(4 * sizeof(float))));
#define fabs fabsf
#define exp expf
#define sqrt sqrtf
#define cos cosf
#define sin sinf
#endif

typedef struct {
@@ -103,11 +106,16 @@ first_derivative(const Scalar* pencil, const Scalar inv_ds)
#elif STENCIL_ORDER == 4
    const Scalar coefficients[] = {0, (Scalar)(2.0 / 3.0), (Scalar)(-1.0 / 12.0)};
#elif STENCIL_ORDER == 6
    const Scalar coefficients[] = {0, (Scalar)(3.0 / 4.0), (Scalar)(-3.0 / 20.0),
                                   (Scalar)(1.0 / 60.0)};
    const Scalar coefficients[] = {
        0,
        (Scalar)(3.0 / 4.0),
        (Scalar)(-3.0 / 20.0),
        (Scalar)(1.0 / 60.0),
    };
#elif STENCIL_ORDER == 8
    const Scalar coefficients[] = {0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0),
                                   (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0)};
    const Scalar coefficients[] = {
        0, (Scalar)(4.0 / 5.0), (Scalar)(-1.0 / 5.0), (Scalar)(4.0 / 105.0), (Scalar)(-1.0 / 280.0),
    };
#endif

#define MID (STENCIL_ORDER / 2)
@@ -126,15 +134,23 @@ second_derivative(const Scalar* pencil, const Scalar inv_ds)
#if STENCIL_ORDER == 2
    const Scalar coefficients[] = {-2, 1};
#elif STENCIL_ORDER == 4
    const Scalar coefficients[] = {(Scalar)(-5.0 / 2.0), (Scalar)(4.0 / 3.0),
                                   (Scalar)(-1.0 / 12.0)};
    const Scalar coefficients[] = {
        (Scalar)(-5.0 / 2.0),
        (Scalar)(4.0 / 3.0),
        (Scalar)(-1.0 / 12.0),
    };
#elif STENCIL_ORDER == 6
    const Scalar coefficients[] = {(Scalar)(-49.0 / 18.0), (Scalar)(3.0 / 2.0),
                                   (Scalar)(-3.0 / 20.0), (Scalar)(1.0 / 90.0)};
    const Scalar coefficients[] = {
        (Scalar)(-49.0 / 18.0),
        (Scalar)(3.0 / 2.0),
        (Scalar)(-3.0 / 20.0),
        (Scalar)(1.0 / 90.0),
    };
#elif STENCIL_ORDER == 8
    const Scalar coefficients[] = {(Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0),
                                   (Scalar)(-1.0 / 5.0), (Scalar)(8.0 / 315.0),
                                   (Scalar)(-1.0 / 560.0)};
    const Scalar coefficients[] = {
        (Scalar)(-205.0 / 72.0), (Scalar)(8.0 / 5.0), (Scalar)(-1.0 / 5.0),
        (Scalar)(8.0 / 315.0),   (Scalar)(-1.0 / 560.0),
    };
#endif

#define MID (STENCIL_ORDER / 2)
@@ -156,16 +172,27 @@ cross_derivative(const Scalar* pencil_a, const Scalar* pencil_b, const Scalar in
    const Scalar coefficients[] = {0, (Scalar)(1.0 / 4.0)};
#elif STENCIL_ORDER == 4
    const Scalar coefficients[] = {
        0, (Scalar)(1.0 / 32.0),
        (Scalar)(1.0 / 64.0)}; // TODO correct coefficients, these are just placeholders
        0,
        (Scalar)(1.0 / 32.0),
        (Scalar)(1.0 / 64.0),
    }; // TODO correct coefficients, these are just placeholders
#elif STENCIL_ORDER == 6
    const Scalar fac            = ((Scalar)(1. / 720.));
    const Scalar coefficients[] = {0 * fac, (Scalar)(270.0) * fac, (Scalar)(-27.0) * fac,
                                   (Scalar)(2.0) * fac};
    const Scalar coefficients[] = {
        0 * fac,
        (Scalar)(270.0) * fac,
        (Scalar)(-27.0) * fac,
        (Scalar)(2.0) * fac,
    };
#elif STENCIL_ORDER == 8
    const Scalar fac            = ((Scalar)(1. / 20160.));
    const Scalar coefficients[] = {0 * fac, (Scalar)(8064.) * fac, (Scalar)(-1008.) * fac,
                                   (Scalar)(128.) * fac, (Scalar)(-9.) * fac};
    const Scalar coefficients[] = {
        0 * fac,
        (Scalar)(8064.) * fac,
        (Scalar)(-1008.) * fac,
        (Scalar)(128.) * fac,
        (Scalar)(-9.) * fac,
    };
#endif

#define MID (STENCIL_ORDER / 2)
@@ -207,14 +234,14 @@ derxy(const int i, const int j, const int k, const Scalar* arr)
    Scalar pencil_a[STENCIL_ORDER + 1];
    //#pragma unroll
    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
        pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + offset - STENCIL_ORDER / 2,
                                   k)];
        pencil_a[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
                                   j + offset - STENCIL_ORDER / 2, k)];

    Scalar pencil_b[STENCIL_ORDER + 1];
    //#pragma unroll
    for (int offset = 0; offset < STENCIL_ORDER + 1; ++offset)
        pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, j + STENCIL_ORDER / 2 - offset,
                                   k)];
        pencil_b[offset] = arr[IDX(i + offset - STENCIL_ORDER / 2, //
                                   j + STENCIL_ORDER / 2 - offset, k)];

    return cross_derivative(pencil_a, pencil_b, getReal(AC_inv_dsx), getReal(AC_inv_dsy));
}
@@ -539,7 +566,8 @@ gradient_of_divergence(const VectorData vec)
    return (Vector){
        hessian(vec.xdata).row[0][0] + hessian(vec.ydata).row[0][1] + hessian(vec.zdata).row[0][2],
        hessian(vec.xdata).row[1][0] + hessian(vec.ydata).row[1][1] + hessian(vec.zdata).row[1][2],
        hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2]};
        hessian(vec.xdata).row[2][0] + hessian(vec.ydata).row[2][1] + hessian(vec.zdata).row[2][2],
    };
}

// Takes uu gradients and returns S
@@ -777,9 +805,9 @@ Vector
helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vector ff_im, Scalar phi)
{
    (void)magnitude; // WARNING: unused
    xx[0] = xx[0] * (2.0 * M_PI / (getReal(AC_dsx) * getInt(AC_nx)));
    xx[1] = xx[1] * (2.0 * M_PI / (getReal(AC_dsy) * getInt(AC_ny)));
    xx[2] = xx[2] * (2.0 * M_PI / (getReal(AC_dsz) * getInt(AC_nz)));
    xx[0] = xx[0] * ((Scalar)2.0 * R_PI / (getReal(AC_dsx) * getInt(AC_nx)));
    xx[1] = xx[1] * ((Scalar)2.0 * R_PI / (getReal(AC_dsy) * getInt(AC_ny)));
    xx[2] = xx[2] * ((Scalar)2.0 * R_PI / (getReal(AC_dsz) * getInt(AC_nz)));

    Scalar cos_phi = cos(phi);
    Scalar sin_phi = sin(phi);
@@ -805,10 +833,11 @@ forcing(int3 globalVertexIdx, Scalar dt)
                             getInt(AC_ny) * getReal(AC_dsy),
                             getInt(AC_nz) * getReal(AC_dsz)}; // source (origin)
    (void)a; // WARNING: not used
    Vector xx = (Vector){(globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
                         (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
                         (globalVertexIdx.z - getInt(AC_nz_min)) *
                             getReal(AC_dsz)}; // sink (current index)
    Vector xx = (Vector){
        (globalVertexIdx.x - getInt(AC_nx_min)) * getReal(AC_dsx),
        (globalVertexIdx.y - getInt(AC_ny_min)) * getReal(AC_dsy),
        (globalVertexIdx.z - getInt(AC_nz_min)) * getReal(AC_dsz),
    }; // sink (current index)
    const Scalar cs2 = getReal(AC_cs2_sound);
    const Scalar cs  = sqrt(cs2);

@@ -2,6 +2,7 @@

#include <math.h>
#include <stdbool.h>
#include <string.h>

#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))
@@ -18,25 +19,14 @@
#define WHT "\x1B[37m"
#define RESET "\x1B[0m"

typedef struct {
    VertexBufferHandle handle;
    AcReal model;
    AcReal candidate;
    long double abs_error;
    long double ulp_error;
    long double rel_error;
    AcReal maximum_magnitude;
    AcReal minimum_magnitude;
} Error;

static inline bool
is_valid(const AcReal a)
{
    return !isnan(a) && !isinf(a);
}

static Error
get_error(AcReal model, AcReal candidate)
Error
acGetError(AcReal model, AcReal candidate)
{
    Error error;
    error.abs_error = 0;
@@ -109,13 +99,15 @@ get_max_abs_error(const VertexBufferHandle vtxbuf_handle, const AcMesh model_mes

    for (size_t i = 0; i < acVertexBufferSize(model_mesh.info); ++i) {

        Error curr_error = get_error(model_vtxbuf[i], candidate_vtxbuf[i]);
        Error curr_error = acGetError(model_vtxbuf[i], candidate_vtxbuf[i]);

        if (curr_error.abs_error > error.abs_error)
            error = curr_error;
    }

    error.handle = vtxbuf_handle;
    error.handle = vtxbuf_handle;
    strncpy(error.label, vtxbuf_names[vtxbuf_handle], ERROR_LABEL_LENGTH - 1);
    error.label[ERROR_LABEL_LENGTH - 1] = '\0';
    error.maximum_magnitude = get_maximum_magnitude(model_vtxbuf, model_mesh.info);
    error.minimum_magnitude = get_minimum_magnitude(model_vtxbuf, model_mesh.info);

@@ -147,12 +139,12 @@ is_acceptable(const Error error)
    return false;
}

static bool
print_error_to_screen(const Error error)
bool
printErrorToScreen(const Error error)
{
    bool errors_found = false;

    printf("\t%-15s... ", vtxbuf_names[error.handle]);
    printf("\t%-15s... ", error.label);
    if (is_acceptable(error)) {
        printf(GRN "OK! " RESET);
    }
@@ -177,14 +169,87 @@ acVerifyMesh(const AcMesh model, const AcMesh candidate)
    bool errors_found = false;
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        Error field_error = get_max_abs_error(i, model, candidate);
        errors_found |= print_error_to_screen(field_error);
        errors_found |= printErrorToScreen(field_error);
    }

    printf("%s\n", errors_found ? "Failure. Found errors in one or more vertex buffers"
                                : "Success. No errors found.");

    if (errors_found)
        return AC_FAILURE;
    else
        return AC_SUCCESS;
    return errors_found ? AC_FAILURE : AC_SUCCESS;
}

/** Verification function for scalar reductions */
AcResult
acVerifyScalReductions(const AcMesh model, const AcScalReductionTestCase* testCases,
                       const size_t numCases)
{
    printf("\nTesting scalar reductions:\n");

    bool errors_found = false;
    for (size_t i = 0; i < numCases; i++) {
        AcReal model_reduction = acModelReduceScal(model, testCases[i].rtype, testCases[i].vtxbuf);
        Error error            = acGetError(model_reduction, testCases[i].candidate);
        strncpy(error.label, testCases[i].label, ERROR_LABEL_LENGTH - 1);
        error.label[ERROR_LABEL_LENGTH - 1] = '\0';
        errors_found |= printErrorToScreen(error);
    }
    printf("%s\n", errors_found ? "Failure. Found errors in one or more scalar reductions"
                                : "Success. No errors found.");

    return errors_found ? AC_FAILURE : AC_SUCCESS;
}

/** Verification function for vector reductions */
AcResult
acVerifyVecReductions(const AcMesh model, const AcVecReductionTestCase* testCases,
                      const size_t numCases)
{
    printf("\nTesting vector reductions:\n");

    bool errors_found = false;
    for (size_t i = 0; i < numCases; i++) {
        AcReal model_reduction = acModelReduceVec(model, testCases[i].rtype, testCases[i].a,
                                                  testCases[i].b, testCases[i].c);
        Error error            = acGetError(model_reduction, testCases[i].candidate);
        strncpy(error.label, testCases[i].label, ERROR_LABEL_LENGTH - 1);
        error.label[ERROR_LABEL_LENGTH - 1] = '\0';
        errors_found |= printErrorToScreen(error);
    }
    printf("%s\n", errors_found ? "Failure. Found errors in one or more vector reductions"
                                : "Success. No errors found.");

    return errors_found ? AC_FAILURE : AC_SUCCESS;
}

/** Constructor for a scalar reduction test case */
AcScalReductionTestCase
acCreateScalReductionTestCase(const char* label, const VertexBufferHandle vtxbuf,
                              const ReductionType rtype)
{
    AcScalReductionTestCase testCase;

    strncpy(testCase.label, label, ERROR_LABEL_LENGTH - 1);
    testCase.label[ERROR_LABEL_LENGTH - 1] = '\0';
    testCase.vtxbuf    = vtxbuf;
    testCase.rtype     = rtype;
    testCase.candidate = 0;

    return testCase;
}

/** Constructor for a vector reduction test case */
AcVecReductionTestCase
acCreateVecReductionTestCase(const char* label, const VertexBufferHandle a,
                             const VertexBufferHandle b, const VertexBufferHandle c,
                             const ReductionType rtype)
{
    AcVecReductionTestCase testCase;

    strncpy(testCase.label, label, ERROR_LABEL_LENGTH - 1);
    testCase.label[ERROR_LABEL_LENGTH - 1] = '\0';
    testCase.a         = a;
    testCase.b         = b;
    testCase.c         = c;
    testCase.rtype     = rtype;
    testCase.candidate = 0;

    return testCase;
}