Merge branch 'master' into sink_20190723

Conflicts:
	acc/mhd_solver/stencil_defines.h
	src/standalone/simulation.cc
JackHsu
2019-08-08 12:17:28 +08:00
21 changed files with 2034 additions and 1207 deletions


@@ -21,9 +21,12 @@ der6x_upwd(in Scalar vertex)
return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
- Scalar(20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z]
+ Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z])
- Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z])
+ vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])};
+ Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z]
+ vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z])
- Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z]
+ vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z])
+ vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z]
+ vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])};
}
Preprocessed Scalar
@@ -33,9 +36,12 @@ der6y_upwd(in Scalar vertex)
return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
-Scalar( 20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z]
+Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z])
-Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z])
+ vertex[vertexIdx.x, vertexIdx.y+3, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])};
+Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z]
+ vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z])
-Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z]
+ vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z])
+ vertex[vertexIdx.x, vertexIdx.y+3, vertexIdx.z]
+ vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])};
}
Preprocessed Scalar
@@ -45,9 +51,12 @@ der6z_upwd(in Scalar vertex)
return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
-Scalar( 20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z]
+Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1])
-Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2])
+ vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])};
+Scalar( 15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1]
+ vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1])
-Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2]
+ vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2])
+ vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3]
+ vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])};
}
#endif
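For reference, the change in all three hunks above appears to be purely a re-wrap of the same expression; the stencil coefficients are unchanged. A sketch of the formula the code implements (my reading of the diff, not part of the commit):

\[ \mathrm{der6\_upwd}(f)_i = \frac{1}{60\,\Delta s}\Bigl(-20\,f_i + 15\,(f_{i+1}+f_{i-1}) - 6\,(f_{i+2}+f_{i-2}) + f_{i+3} + f_{i-3}\Bigr) \]

with \(\Delta s\) the grid spacing (inv_ds = \(1/\Delta s\)) and the offsets taken along x, y, or z in the respective function.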


@@ -309,23 +309,26 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
// JP: This looks wrong:
// 1) Should it be dsx * nx instead of dsx * ny?
// 2) Should you also use globalGrid.n instead of the local n?
// MV: You are right. Made a quick fix. I did not see the error because the multi-GPU domain is split
// in the z direction, not the y direction.
// 3) Also final point: can we do this with vectors/quaternions instead?
// Trigonometric functions are much more expensive and inaccurate.
// MV: Good idea. Not an immediate priority.
// Fun related article:
// https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/
xx.x = xx.x*(2.0*M_PI/(dsx*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min))));
xx.y = xx.y*(2.0*M_PI/(dsy*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min))));
xx.z = xx.z*(2.0*M_PI/(dsz*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min))));
xx.x = xx.x*(2.0*M_PI/(dsx*globalGridN.x));
xx.y = xx.y*(2.0*M_PI/(dsy*globalGridN.y));
xx.z = xx.z*(2.0*M_PI/(dsz*globalGridN.z));
Scalar cos_phi = cos(phi);
Scalar sin_phi = sin(phi);
Scalar cos_k_dox_x = cos(dot(k_force, xx));
Scalar sin_k_dox_x = sin(dot(k_force, xx));
Scalar cos_k_dot_x = cos(dot(k_force, xx));
Scalar sin_k_dot_x = sin(dot(k_force, xx));
// The phase affects only the x-component
//Scalar real_comp = cos_k_dox_x;
//Scalar imag_comp = sin_k_dox_x;
Scalar real_comp_phase = cos_k_dox_x*cos_phi - sin_k_dox_x*sin_phi;
Scalar imag_comp_phase = cos_k_dox_x*sin_phi + sin_k_dox_x*cos_phi;
//Scalar real_comp = cos_k_dot_x;
//Scalar imag_comp = sin_k_dot_x;
Scalar real_comp_phase = cos_k_dot_x*cos_phi - sin_k_dot_x*sin_phi;
Scalar imag_comp_phase = cos_k_dot_x*sin_phi + sin_k_dot_x*cos_phi;
Vector force = (Vector){ ff_re.x*real_comp_phase - ff_im.x*imag_comp_phase,
@@ -338,9 +341,9 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
Vector
forcing(int3 globalVertexIdx, Scalar dt)
{
Vector a = Scalar(.5) * (Vector){globalGrid.n.x * dsx,
globalGrid.n.y * dsy,
globalGrid.n.z * dsz}; // source (origin)
Vector a = Scalar(.5) * (Vector){globalGridN.x * dsx,
globalGridN.y * dsy,
globalGridN.z * dsz}; // source (origin)
Vector xx = (Vector){(globalVertexIdx.x - nx_min) * dsx,
(globalVertexIdx.y - ny_min) * dsy,
(globalVertexIdx.z - nz_min) * dsz}; // sink (current index)
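The renamed cos_k_dot_x/sin_k_dot_x variables and the phase terms above are just the angle-addition identity; a sketch of the relation the code appears to evaluate (not part of the diff):

\[ \mathrm{real\_comp\_phase} = \cos(\mathbf{k}\cdot\mathbf{x})\cos\phi - \sin(\mathbf{k}\cdot\mathbf{x})\sin\phi = \cos(\mathbf{k}\cdot\mathbf{x}+\phi), \qquad \mathrm{imag\_comp\_phase} = \sin(\mathbf{k}\cdot\mathbf{x}+\phi). \]

This is also why MV's vector/quaternion suggestion is plausible: the pair (real, imag) is just the unit complex number \(e^{i\,\mathbf{k}\cdot\mathbf{x}}\) rotated by the phase \(\phi\).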

doc/manual/.gitignore (vendored, new file, 2 additions)

@@ -0,0 +1,2 @@
*.html
*.pdf


@@ -24,95 +24,62 @@ extern "C" {
#include "astaroth_defines.h"
/** Checks whether there are any CUDA devices available. Returns AC_SUCCESS if there is 1 or more,
* AC_FAILURE otherwise. */
AcResult acCheckDeviceAvailability(void);
#include "astaroth_device.h"
#include "astaroth_node.h"
/** Synchronizes the stream shared by all GPUs in the node. Synchronizes all streams if STREAM_ALL
* is passed as a parameter. */
AcResult acSynchronizeStream(const StreamType stream);
/*
#include "astaroth_grid.h"
#define acInit(x) acGridInit(x)
#define acQuit() acGridQuit()
#define acLoad(x) acGridLoadMesh(STREAM_DEFAULT, x)
#define acReduceScal(x, y) acGridReduceScal(STREAM_DEFAULT, x, y)
#define acReduceVec(x, y, z, w) acGridReduceVec(STREAM_DEFAULT, x, y, z, w)
#define acBoundcondStep() acGridPeriodicBoundcondStep(STREAM_DEFAULT)
#define acIntegrate(x) acGridIntegrateStep(STREAM_DEFAULT, x)
#define acStore(x) acGridStoreMesh(STREAM_DEFAULT, x)
#define acSynchronizeStream(x) acGridSynchronizeStream(x)
#define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y)
*/
/** Synchronizes the mesh distributed across multiple GPUs. Must be called if the data in the halos
* of neighboring GPUs has been modified by an asynchronous function, f.ex. acBoundcondStep() */
AcResult acSynchronizeMesh(void);
/** Starting point of all GPU computation. Handles the allocation and
initialization of *all memory needed on all GPUs in the node*. In other words,
sets up everything GPU-side so that calling any other GPU interface function
afterwards does not result in illegal memory accesses. */
/** Allocates all memory and initializes the devices visible to the caller. Should be
* called before any other function in this interface. */
AcResult acInit(const AcMeshInfo mesh_info);
/** Frees all GPU allocations and resets all devices in the node. Should be
* called at exit. */
AcResult acQuit(void);
/** Does all three substeps of the RK3 integration and computes the boundary
conditions when necessary. The result is synchronized and the boundary conditions are applied
after the final substep, after which the result can be fetched to CPU memory with acStore. */
/** Synchronizes a specific stream. All streams are synchronized if STREAM_ALL is passed as a
* parameter*/
AcResult acSynchronizeStream(const Stream stream);
/** Loads a constant to the memories of the devices visible to the caller */
AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
/** Loads an AcMesh to the devices visible to the caller */
AcResult acLoad(const AcMesh host_mesh);
/** Stores the AcMesh distributed among the devices visible to the caller back to the host*/
AcResult acStore(AcMesh* host_mesh);
/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final
* substep and the user is responsible for calling acBoundcondStep before reading the data. */
AcResult acIntegrate(const AcReal dt);
/** Performs a scalar reduction on all GPUs in the node and returns the result. Operates on the
* whole computational domain, which must be up to date and synchronized before calling
* acReduceScal.
*/
AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle a);
/** Applies periodic boundary conditions for the Mesh distributed among the devices visible to the
* caller*/
AcResult acBoundcondStep(void);
/** Performs a vector reduction on all GPUs in the node and returns the result. Operates on the
* whole computational domain, which must be up to date and synchronized before calling
* acReduceVec.
*/
/** Does a scalar reduction with the data stored in some vertex buffer */
AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle);
/** Does a vector reduction with vertex buffers where the vector components are (a, b, c) */
AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a,
const VertexBufferHandle b, const VertexBufferHandle c);
/** Distributes the host mesh among the GPUs in the node. Synchronous. */
AcResult acLoad(const AcMesh host_mesh);
/** Gathers the mesh stored across GPUs in the node and stores it back to host memory. Synchronous.
*/
AcResult acStore(AcMesh* host_mesh);
/*
* =============================================================================
* Astaroth interface: Advanced functions. Asynchronous.
* =============================================================================
*/
/** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */
AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
AcResult acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value,
const StreamType stream);
/** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */
AcResult acLoadWithOffset(const AcMesh host_mesh, const int3 start, const int num_vertices);
AcResult acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 start, const int num_vertices,
const StreamType stream);
/** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to
* CPU memory. Asynchronous.
*/
AcResult acStoreWithOffset(const int3 start, const int num_vertices, AcMesh* host_mesh);
AcResult acStoreWithOffsetAsync(const int3 start, const int num_vertices, AcMesh* host_mesh,
const StreamType stream);
/** Performs a single RK3 step without computing boundary conditions. Asynchronous.*/
AcResult acIntegrateStep(const int isubstep, const AcReal dt);
AcResult acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream);
/** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions.
* Asynchronous.*/
AcResult acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start,
const int3 end);
AcResult acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start,
const int3 end, const StreamType stream);
/** Performs the boundary condition step on the GPUs in the node. Asynchronous. */
AcResult acBoundcondStep(void);
AcResult acBoundcondStepAsync(const StreamType stream);
/*
* =============================================================================
* Revised interface
* =============================================================================
/** Stores a subset of the mesh stored across the devices visible to the caller back to host memory.
*/
AcResult acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh);
#ifdef __cplusplus
} // extern "C"
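A minimal usage sketch of the revised high-level interface declared above, based on the doc comments in this hunk; the call sequence and the helper function are my assumptions, not code from this commit:

#include "astaroth.h"

/* Runs one integration step on a prepared host mesh. AcMeshInfo/AcMesh setup is
 * assumed to be done by the caller. */
static void run_step(const AcMeshInfo info, AcMesh* mesh, const AcReal dt)
{
    acInit(info);                    /* allocates and initializes all visible devices */
    acLoad(*mesh);                   /* distributes the host mesh among the devices */
    acIntegrate(dt);                 /* RK3; boundconds are NOT applied after the final substep */
    acBoundcondStep();               /* so apply periodic boundary conditions before reading */
    acSynchronizeStream(STREAM_ALL); /* wait for all device work to finish */
    acStore(mesh);                   /* gather the result back to host memory */
    acQuit();
}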


@@ -86,7 +86,9 @@ typedef struct {
FUNC(AC_nxy),\
FUNC(AC_nxyz),\
#define AC_FOR_BUILTIN_INT3_PARAM_TYPES(FUNC)
#define AC_FOR_BUILTIN_INT3_PARAM_TYPES(FUNC)\
FUNC(AC_global_grid_n),\
FUNC(AC_multigpu_offset),
#define AC_FOR_BUILTIN_REAL_PARAM_TYPES(FUNC)
@@ -106,9 +108,26 @@ typedef enum {
typedef enum {
STREAM_DEFAULT,
NUM_STREAM_TYPES, //
STREAM_ALL
} StreamType;
STREAM_0,
STREAM_1,
STREAM_2,
STREAM_3,
STREAM_4,
STREAM_5,
STREAM_6,
STREAM_7,
STREAM_8,
STREAM_9,
STREAM_10,
STREAM_11,
STREAM_12,
STREAM_13,
STREAM_14,
STREAM_15,
STREAM_16,
NUM_STREAM_TYPES
} Stream;
#define STREAM_ALL (NUM_STREAM_TYPES)
#define AC_GEN_ID(X) X
typedef enum {

include/astaroth_device.h (new file, 127 additions)

@@ -0,0 +1,127 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include "astaroth_defines.h"
typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
// in Vulkan, f.ex. VkDevice
/** */
AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device);
/** */
AcResult acDeviceDestroy(Device device);
/** */
AcResult acDevicePrintInfo(const Device device);
/** */
AcResult acDeviceAutoOptimize(const Device device);
/** */
AcResult acDeviceSynchronizeStream(const Device device, const Stream stream);
/** */
AcResult acDeviceSwapBuffers(const Device device);
/** */
AcResult acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
const AcReal value);
/** */
AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream,
const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices);
/** Deprecated */
AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream,
const AcMesh host_mesh, const int3 src, const int3 dst,
const int num_vertices);
/** */
AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle);
/** Deprecated */
AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh);
/** */
AcResult acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices,
AcMesh* host_mesh);
/** Deprecated */
AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh);
/** */
AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
/** Deprecated */
AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh);
/** */
AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle,
const int3 src, const int3 dst,
const int num_vertices, Device dst_device);
/** Deprecated */
AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream,
const int3 src, const int3 dst, const int num_vertices,
Device* dst_device);
/** */
AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, Device dst_device);
/** Deprecated */
AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device);
/** */
AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt);
/** */
AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end);
/** */
AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end);
/** */
AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
#ifdef __cplusplus
} // extern "C"
#endif
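For per-GPU control, the Device handle above is used directly; a minimal sketch assuming the declarations in this header (the sequence and the demo function are my assumptions, not code from this commit):

#include "astaroth_device.h"

/* Single-substep round trip on one GPU. start/end bound the computational domain
 * and are assumed to be provided by the caller. */
static void device_demo(const AcMeshInfo config, AcMesh host_mesh,
                        const int3 start, const int3 end, const AcReal dt)
{
    Device device;
    acDeviceCreate(0, config, &device);                  /* GPU id 0 */
    acDeviceLoadMesh(device, STREAM_DEFAULT, host_mesh); /* marked deprecated above */
    acDeviceIntegrateSubstep(device, STREAM_DEFAULT, 0, start, end, dt);
    acDeviceSwapBuffers(device);                         /* in/out vertex buffers are double-buffered */
    acDeviceSynchronizeStream(device, STREAM_ALL);
    acDeviceStoreMesh(device, STREAM_DEFAULT, &host_mesh);
    acDeviceDestroy(device);
}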

include/astaroth_grid.h (new file, 99 additions)

@@ -0,0 +1,99 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include "astaroth_defines.h"
/** */
AcResult acGridInit(const AcMeshInfo node_config);
/** */
AcResult acGridQuit(void);
/** */
AcResult acGridSynchronizeStream(const Stream stream);
/** */
AcResult acGridSwapBuffers(void);
/** */
AcResult acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value);
/** */
AcResult acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices);
/** */
AcResult acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
const int3 dst, const int num_vertices);
/** */
AcResult acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle);
/** */
AcResult acGridLoadMesh(const Stream stream, const AcMesh host_mesh);
/** */
AcResult acGridStoreVertexBufferWithOffset(const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices,
AcMesh* host_mesh);
/** */
AcResult acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
const int num_vertices, AcMesh* host_mesh);
/** */
AcResult acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
AcMesh* host_mesh);
/** */
AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);
/** */
AcResult acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start,
const int3 end, const AcReal dt);
/** */
AcResult acGridIntegrateStep(const Stream stream, const AcReal dt);
/** */
AcResult acGridPeriodicBoundcondStep(const Stream stream);
/** */
/* acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result); */
AcReal acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle);
/** */
/*AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vec0, const VertexBufferHandle vec1,
const VertexBufferHandle vec2, AcReal* result);*/
AcReal acGridReduceVec(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vec0, const VertexBufferHandle vec1,
const VertexBufferHandle vec2);
#ifdef __cplusplus
} // extern "C"
#endif

include/astaroth_node.h (new file, 124 additions)

@@ -0,0 +1,124 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include "astaroth_defines.h"
typedef struct node_s* Node; // Opaque pointer to node_s.
typedef struct {
} DeviceConfiguration;
/** */
AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node);
/** */
AcResult acNodeDestroy(Node node);
/** */
AcResult acNodePrintInfo(const Node node);
/** */
AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config);
/** */
AcResult acNodeAutoOptimize(const Node node);
/** */
AcResult acNodeSynchronizeStream(const Node node, const Stream stream);
/** Deprecated ? */
AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle); // Not in Device
/** */
AcResult acNodeSynchronizeMesh(const Node node, const Stream stream); // Not in Device
/** */
AcResult acNodeSwapBuffers(const Node node);
/** */
AcResult acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
const AcReal value);
/** Deprecated ? Might be useful though if the user wants to load only one vtxbuf. But in this case
* the user should supply an AcReal* instead of a vtxbuf_handle */
AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream,
const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices);
/** */
AcResult acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
const int3 src, const int3 dst, const int num_vertices);
/** Deprecated ? */
AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle);
/** */
AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh);
/** Deprecated ? */
AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices,
AcMesh* host_mesh);
/** */
AcResult acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh);
/** Deprecated ? */
AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);
/** */
AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh);
/** */
AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt);
/** */
AcResult acNodeIntegrate(const Node node, const AcReal dt);
/** */
AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle);
/** */
AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);
/** */
AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result);
#ifdef __cplusplus
} // extern "C"
#endif


@@ -1,51 +0,0 @@
#!/bin/bash
# Run this in your build directory (cd build && ../scripts/auto_optimize.sh)
# Generates a ${BENCHMARK_FILE} which contains the threadblock dims and other
# constants used in the integration in addition to the time used.
MAX_THREADS=1024 # Max size of the thread block, depends on hardware
BENCHMARK_FILE="benchmark.out"
TBCONFCREATOR_SRC_PATH="../scripts/gen_rk3_threadblockconf.c"
TBCONFFILE_DST_PATH="../src/core/kernels"
C_COMPILER_NAME="gcc"
rm ${BENCHMARK_FILE}
for (( tz=2; tz<=8; tz*=2))
do
for (( ty=1; ty<=1; ty+=1))
do
for (( tx=16; tx<=64; tx*=2))
do
if ( (${tx}*${ty}*${tz}) > ${MAX_THREADS})
then break
fi
for (( launch_bound=1; launch_bound<=8; launch_bound*=2))
do
for (( elems_per_thread=1; elems_per_thread<=128; elems_per_thread*=2))
do
# Generate the threadblock configuration
${C_COMPILER_NAME} ${TBCONFCREATOR_SRC_PATH} -o gen_rk3_threadblockconf
./gen_rk3_threadblockconf ${tx} ${ty} ${tz} ${elems_per_thread} ${launch_bound}
rm gen_rk3_threadblockconf
mv rk3_threadblock.conf ${TBCONFFILE_DST_PATH}
# Compile and run the test build
cmake -DBUILD_DEBUG=OFF -DDOUBLE_PRECISION=OFF -DAUTO_OPTIMIZE=ON .. && make -j
#if ./ac_run -t; then
# echo Success
./ac_run -b
#else
# echo fail!
#fi
done
done
done
done
done


@@ -5,10 +5,13 @@
# source ./sourceme.sh
# autotest.sh
#
# If you need slurm or to pass something before ./ac_run, set the variable
# If you need slurm or to pass something before ./ac_run, export the variable
# SRUN_COMMAND before calling this script.
#
# F.ex. on Taito SRUN_COMMAND="srun --gres=gpu:k80:4 --mem=24000 -t 00:14:59 -p gputest --cpus-per-task 1 -n 1"
# export SRUN_COMMAND
# autotest.sh
echo "SRUN_COMMAND: " ${SRUN_COMMAND}
results=()


@@ -3,7 +3,8 @@
########################################
## Find packages
find_package(CUDA 9 REQUIRED)
find_package(CUDA REQUIRED)
# find_package(OpenMP REQUIRED)
## Architecture and optimization flags
set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
@@ -12,7 +13,8 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
-gencode arch=compute_61,code=sm_61
-lineinfo
-ftz=true # Flush denormalized floats to zero
-std=c++11)
-std=c++11
)# --compiler-options ${OpenMP_CXX_FLAGS})
#--maxrregcount=255
# -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
# =cg to opt out
@@ -32,6 +34,6 @@ else ()
endif ()
## Create and link the library
include_directories(.)
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu)
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
target_include_directories(astaroth_core PRIVATE .)
target_link_libraries(astaroth_core m)


@@ -16,119 +16,8 @@
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Multi-GPU implementation.
*
%JP: The old way of computing boundary conditions conflicts with the
way we have to do things with multiple GPUs.
The older approach relied on unified memory, which represented the whole
memory area as one huge mesh instead of several smaller ones. However, unified memory
in its current state is meant more for quick prototyping when performance is not an issue.
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
than managing the memory explicitly.
In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids,
respectively. To simplify things, we require that each GPU is assigned the same amount of
work; therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to
work with.
The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices.
A local index (i, j, k) in some subgrid can be mapped to the global grid with
global idx = (i, j, k + device_id * subgrid.n.z)
Terminology:
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
Changes required to this commented code block:
- The thread block dimensions (tpb) are no longer passed to the kernel here but in
device.cu instead. Same holds for any complex index calculations. Instead, the local
coordinates should be passed as an int3 type without having to consider how the data is
actually laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
handle of type "Device" which should be passed to single-GPU functions. In this file, all
devices are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we
can optimize Astaroth by executing memory transactions concurrently with
computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
as a parameter and commands executing in different streams can be processed
in parallel/concurrently.
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
With multiple GPUs, periodic boundary conditions applied on indices ranging from
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without
having to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use
here, we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
### Throughout this file we use the following notation and names for various index offsets
Global coordinates: coordinates with respect to the global grid (static Grid grid)
Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid)
s0, s1: source indices in global coordinates
d0, d1: destination indices in global coordinates
da = max(s0, d0);
db = min(s1, d1);
These are used in at least
acLoad()
acStore()
acSynchronizeHalos()
Here we decompose the host mesh and distribute it among the GPUs in
the node.
The host mesh is a huge contiguous block of data. Its dimensions are given by
the global variable named "grid". A "grid" is decomposed into "subgrids",
one for each GPU. Here we check which parts of the range s0...s1 maps
to the memory space stored by some GPU, ranging d0...d1, and transfer
the data if needed.
The index mapping is inherently quite involved, but here's a picture which
hopefully helps make sense out of all this.
Grid
|----num_vertices---|
xxx|....................................................|xxx
^ ^ ^ ^
d0 d1 s0 (src) s1
Subgrid
xxx|.............|xxx
^ ^
d0 d1
^ ^
db da
*
*/
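// Editor's sketch (not part of this commit): the index mapping described in the
// comment above, in code form, under the stated z-decomposition. max/min on int3
// are assumed to come from math_utils.h as used further below.
static int3 local_to_global(const int3 local, const int device_id, const Grid subgrid)
{
    return (int3){local.x, local.y, local.z + device_id * subgrid.n.z};
}

static void overlap(const int3 s0, const int3 s1, const int3 d0, const int3 d1, int3* da, int3* db)
{
    *da = max(s0, d0); /* first vertex of the range owned by this device        */
    *db = min(s1, d1); /* one-past-last; data is copied only if db.z >= da.z    */
}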
// #include "astaroth_defines.h"
#include "astaroth.h"
#include "errchk.h"
#include "device.cuh"
#include "math_utils.h" // sum for reductions
// #include "standalone/config_loader.h" // update_config
#define AC_GEN_STR(X) #X
const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
@@ -142,532 +31,80 @@ const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) /
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
#undef AC_GEN_STR
static const int MAX_NUM_DEVICES = 32;
static int num_devices = 0;
static Device devices[MAX_NUM_DEVICES] = {};
static Grid grid; // A grid consists of num_devices subgrids
static Grid subgrid;
static int
gridIdx(const Grid grid, const int3 idx)
{
return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
}
static int3
gridIdx3d(const Grid grid, const int idx)
{
return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
idx / (grid.m.x * grid.m.y)};
}
static void
printInt3(const int3 vec)
{
printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
}
static inline void
print(const AcMeshInfo config)
{
for (int i = 0; i < NUM_INT_PARAMS; ++i)
printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
for (int i = 0; i < NUM_REAL_PARAMS; ++i)
printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
}
static void
update_builtin_params(AcMeshInfo* config)
{
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
///////////// PAD TEST
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
///////////// PAD TEST
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
config->int_params[AC_nx_min] = NGHOST;
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
config->int_params[AC_ny_min] = NGHOST;
config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
config->int_params[AC_nz_min] = NGHOST;
config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;
/* Additional helper params */
// Int helpers
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
#if VERBOSE_PRINTING // Defined in astaroth.h
printf("###############################################################\n");
printf("Config dimensions recalculated:\n");
print(*config);
printf("###############################################################\n");
#endif
}
static Grid
createGrid(const AcMeshInfo config)
{
Grid grid;
grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
return grid;
}
static const int num_nodes = 1;
static Node nodes[num_nodes];
AcResult
acCheckDeviceAvailability(void)
acInit(const AcMeshInfo mesh_info)
{
int device_count; // Separate from num_devices to avoid side effects
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&device_count));
if (device_count > 0)
return AC_SUCCESS;
else
return AC_FAILURE;
}
AcResult
acSynchronizeStream(const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], stream);
}
return AC_SUCCESS;
}
static AcResult
synchronize_halos(const StreamType stream)
{
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> (grid.m.x, grid.m.y, NGHOST + grid.n.z)
// has been synchronized and transferred to appropriate subgrids
// We loop only to num_devices - 1 since the front and back plates of the grid are not
// transferred because their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
// I.e. the halos of subgrids must contain up-to-date data!
// #pragma omp parallel for
for (int i = 0; i < num_devices - 1; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acSynchronizeMesh(void)
{
acSynchronizeStream(STREAM_ALL);
synchronize_halos(STREAM_DEFAULT);
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acInit(const AcMeshInfo config)
{
// Get num_devices
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices));
if (num_devices < 1) {
ERROR("No CUDA devices found!");
return AC_FAILURE;
}
if (num_devices > MAX_NUM_DEVICES) {
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
num_devices = MAX_NUM_DEVICES;
}
if (!AC_MULTIGPU_ENABLED) {
WARNING("MULTIGPU_ENABLED was false. Using only one device");
num_devices = 1; // Use only one device if multi-GPU is not enabled
}
// Check that AC_nz is divisible by num_devices. This makes decomposing the
// problem domain across multiple GPUs much easier since we do not have to worry
// about remainders
ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
// Decompose the problem domain
// The main grid
grid = createGrid(config);
// Subgrids
AcMeshInfo subgrid_config = config;
subgrid_config.int_params[AC_nz] /= num_devices;
update_builtin_params(&subgrid_config);
subgrid = createGrid(subgrid_config);
// Periodic boundary conditions become weird if the system can "fold onto itself".
ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
#if VERBOSE_PRINTING
// clang-format off
printf("Grid m "); printInt3(grid.m); printf("\n");
printf("Grid n "); printInt3(grid.n); printf("\n");
printf("Subrid m "); printInt3(subgrid.m); printf("\n");
printf("Subrid n "); printInt3(subgrid.n); printf("\n");
// clang-format on
#endif
// Initialize the devices
for (int i = 0; i < num_devices; ++i) {
createDevice(i, subgrid_config, &devices[i]);
loadGlobalGrid(devices[i], grid);
printDeviceInfo(devices[i]);
}
// Enable peer access
for (int i = 0; i < num_devices; ++i) {
const int front = (i + 1) % num_devices;
const int back = (i - 1 + num_devices) % num_devices;
int can_access_front, can_access_back;
cudaDeviceCanAccessPeer(&can_access_front, i, front);
cudaDeviceCanAccessPeer(&can_access_back, i, back);
#if VERBOSE_PRINTING
printf(
"Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
i, front, can_access_front, back, can_access_back);
#endif
cudaSetDevice(i);
if (can_access_front) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
}
if (can_access_back) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
}
}
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
return acNodeCreate(0, mesh_info, &nodes[0]);
}
AcResult
acQuit(void)
{
acSynchronizeStream(STREAM_ALL);
for (int i = 0; i < num_devices; ++i) {
destroyDevice(devices[i]);
}
return AC_SUCCESS;
return acNodeDestroy(nodes[0]);
}
AcResult
acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start,
const int3 end, const StreamType stream)
acSynchronizeStream(const Stream stream)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
rkStep(devices[i], stream, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
return acNodeSynchronizeStream(nodes[0], stream);
}
AcResult
acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, const int3 end)
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT);
return acNodeLoadConstant(nodes[0], STREAM_DEFAULT, param, value);
}
AcResult
acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream)
acLoad(const AcMesh host_mesh)
{
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + grid.n;
return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream);
return acNodeLoadMesh(nodes[0], STREAM_DEFAULT, host_mesh);
}
AcResult
acIntegrateStep(const int isubstep, const AcReal dt)
acStore(AcMesh* host_mesh)
{
return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT);
}
static AcResult
local_boundcondstep(const StreamType stream)
{
if (num_devices == 1) {
boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], stream, d0, d1);
}
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(const StreamType stream)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acBoundcondStepAsync(const StreamType stream)
{
ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES);
local_boundcondstep(stream);
acSynchronizeStream(stream);
global_boundcondstep(stream);
synchronize_halos(stream);
acSynchronizeStream(stream);
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
return acBoundcondStepAsync(STREAM_DEFAULT);
}
static AcResult
swap_buffers(void)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
return acNodeStoreMesh(nodes[0], STREAM_DEFAULT, host_mesh);
}
AcResult
acIntegrate(const AcReal dt)
{
acSynchronizeStream(STREAM_ALL);
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
swap_buffers();
acBoundcondStep();
}
return AC_SUCCESS;
/*
acNodeIntegrate(nodes[0], dt);
return acBoundcondStep();
*/
return acNodeIntegrate(nodes[0], dt);
}
static AcReal
simple_final_reduce_scal(const ReductionType rtype, const AcReal* results, const int n)
AcResult
acBoundcondStep(void)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res);
}
return res;
return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT);
}
AcReal
acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuffer_handle)
acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
AcReal result;
acNodeReduceScal(nodes[0], STREAM_DEFAULT, rtype, vtxbuf_handle, &result);
return result;
}
AcReal
acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b,
const VertexBufferHandle c)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
AcReal result;
acNodeReduceVec(nodes[0], STREAM_DEFAULT, rtype, a, b, c, &result);
return result;
}
AcResult
acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 src, const int num_vertices,
const StreamType stream)
acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
const int3 s0 = src;
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);
const int3 da = max(s0, d0);
const int3 db = min(s1, d1);
/*
printf("Device %d\n", i);
printf("\ts0: "); printInt3(s0); printf("\n");
printf("\td0: "); printInt3(d0); printf("\n");
printf("\tda: "); printInt3(da); printf("\n");
printf("\tdb: "); printInt3(db); printf("\n");
printf("\td1: "); printInt3(d1); printf("\n");
printf("\ts1: "); printInt3(s1); printf("\n");
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
*/
if (db.z >= da.z) {
const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
// printf("\n");
copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells);
}
// printf("\n");
}
return AC_SUCCESS;
return acNodeStoreMeshWithOffset(nodes[0], STREAM_DEFAULT, dst, dst, num_vertices, host_mesh);
}
AcResult
acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices)
{
return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT);
}
AcResult
acLoad(const AcMesh host_mesh)
{
acLoadWithOffset(host_mesh, (int3){0, 0, 0}, acVertexBufferSize(host_mesh.info));
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acStoreWithOffsetAsync(const int3 src, const int num_vertices, AcMesh* host_mesh,
const StreamType stream)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
const int3 s0 = src;
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);
const int3 da = max(s0, d0);
const int3 db = min(s1, d1);
if (db.z >= da.z) {
const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh);
}
}
return AC_SUCCESS;
}
AcResult
acStoreWithOffset(const int3 src, const int num_vertices, AcMesh* host_mesh)
{
return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT);
}
AcResult
acStore(AcMesh* host_mesh)
{
acStoreWithOffset((int3){0, 0, 0}, acVertexBufferSize(host_mesh->info), host_mesh);
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
loadDeviceConstant(devices[i], stream, param, value);
}
return AC_SUCCESS;
}
AcResult
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT);
}
/*
* =============================================================================
* Revised interface
* =============================================================================
*/


@@ -24,7 +24,7 @@
* Detailed info.
*
*/
#include "device.cuh"
#include "astaroth_device.h"
#include "errchk.h"
@@ -40,19 +40,19 @@ typedef struct {
} VertexBufferArray;
__constant__ AcMeshInfo d_mesh_info;
__constant__ int3 d_multigpu_offset;
__constant__ Grid globalGrid;
#define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_INT3(X) (d_mesh_info.int3_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
#include "kernels/boundconds.cuh"
#include "kernels/integration.cuh"
#include "kernels/reductions.cuh"
static dim3 rk3_tpb = (dim3){32, 1, 4};
static dim3 rk3_tpb(32, 1, 4);
#if PACKED_DATA_TRANSFERS // Defined in device.cuh
// #include "kernels/pack_unpack.cuh"
@@ -76,8 +76,96 @@ struct device_s {
#endif
};
// clang-format off
static __global__ void dummy_kernel(void) {}
// clang-format on
AcResult
printDeviceInfo(const Device device)
acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
{
cudaSetDevice(id);
cudaDeviceReset();
// Create Device
struct device_s* device = (struct device_s*)malloc(sizeof(*device));
ERRCHK_ALWAYS(device);
device->id = id;
device->local_config = device_config;
// Check that the code was compiled for the proper GPU architecture
printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
"device supports the CUDA architecture you are compiling for.\n"
"Running dummy kernel... ");
fflush(stdout);
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
printf("Success!\n");
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
}
// Memory
const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
}
ERRCHK_CUDA_ALWAYS(
cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
#if PACKED_DATA_TRANSFERS
// Allocate data required for packed transfers here (cudaMalloc)
#endif
// Device constants
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
cudaMemcpyHostToDevice));
printf("Created device %d (%p)\n", device->id, device);
*device_handle = device;
// Auto-optimize
if (id == 0) {
acDeviceAutoOptimize(device);
}
return AC_SUCCESS;
}
AcResult
acDeviceDestroy(Device device)
{
cudaSetDevice(device->id);
printf("Destroying device %d (%p)\n", device->id, device);
// Memory
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
#if PACKED_DATA_TRANSFERS
// Free data required for packed transfers here (cudaFree)
#endif
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamDestroy(device->streams[i]);
}
// Destroy Device
free(device);
return AC_SUCCESS;
}
AcResult
acDevicePrintInfo(const Device device)
{
const int device_id = device->id;
@@ -136,305 +224,8 @@ printDeviceInfo(const Device device)
return AC_SUCCESS;
}
static __global__ void
dummy_kernel(void)
{
}
AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
{
cudaSetDevice(id);
cudaDeviceReset();
// Create Device
struct device_s* device = (struct device_s*)malloc(sizeof(*device));
ERRCHK_ALWAYS(device);
device->id = id;
device->local_config = device_config;
// Check that the code was compiled for the proper GPU architecture
printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
"device supports the CUDA architecture you are compiling for.\n"
"Running dummy kernel... ");
fflush(stdout);
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
printf("Success!\n");
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
}
// Memory
const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
}
ERRCHK_CUDA_ALWAYS(
cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
#if PACKED_DATA_TRANSFERS
// Allocate data required for packed transfers here (cudaMalloc)
#endif
// Device constants
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
cudaMemcpyHostToDevice));
// Multi-GPU offset. This is used to compute globalVertexIdx.
// Might be better to calculate this in astaroth.cu instead of here, s.t.
// everything related to the decomposition is limited to the multi-GPU layer
const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]};
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));
printf("Created device %d (%p)\n", device->id, device);
*device_handle = device;
// Auto-optimize
if (id == 0)
autoOptimize(device);
return AC_SUCCESS;
}
AcResult
destroyDevice(Device device)
{
cudaSetDevice(device->id);
printf("Destroying device %d (%p)\n", device->id, device);
// Memory
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
#if PACKED_DATA_TRANSFERS
// Free data required for packed transfers here (cudaFree)
#endif
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamDestroy(device->streams[i]);
}
// Destroy Device
free(device);
return AC_SUCCESS;
}
AcResult
boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]);
}
return AC_SUCCESS;
}
AcResult
reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_scal(device->streams[stream_type], rtype, start, end,
device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
device->reduce_result);
return AC_SUCCESS;
}
AcResult
reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
const int3& end, const AcReal dt)
{
cudaSetDevice(device->id);
// const dim3 tpb(32, 1, 4);
const dim3 tpb = rk3_tpb;
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
else
solve<2><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
synchronize(const Device device, const StreamType stream_type)
{
cudaSetDevice(device->id);
if (stream_type == STREAM_ALL) {
cudaDeviceSynchronize();
}
else {
cudaStreamSynchronize(device->streams[stream_type]);
}
return AC_SUCCESS;
}
static AcResult
loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
const size_t bytes, AcReal* dst)
{
cudaSetDevice(device->id);
ERRCHK_CUDA(
cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
return AC_SUCCESS;
}
static AcResult
storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
const size_t bytes, AcReal* dst)
{
cudaSetDevice(device->id);
ERRCHK_CUDA(
cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
const int3& src, const int3& dst, const int num_vertices)
{
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
}
return AC_SUCCESS;
}
AcResult
copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
const int num_vertices, AcMesh* host_mesh)
{
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
}
return AC_SUCCESS;
}
AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
Device dst_device, const int3& dst, const int num_vertices)
{
cudaSetDevice(src_device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id,
&src_device->vba.in[i][src_idx], src_device->id,
sizeof(src_device->vba.in[i][0]) * num_vertices,
src_device->streams[stream_type]));
}
return AC_SUCCESS;
}
AcResult
swapBuffers(const Device device)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp;
}
return AC_SUCCESS;
}
AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcIntParam param,
const int value)
{
cudaSetDevice(device->id);
// CUDA 10 apparently creates only a single name for a device constant (d_mesh_info here)
// and members such as d_mesh_info.int_params[] cannot be accessed directly.
// Therefore we have to obfuscate the code a bit and compute the offset address before
// invoking cudaMemcpyToSymbol.
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice,
device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcRealParam param,
const AcReal value)
{
cudaSetDevice(device->id);
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice,
device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
loadGlobalGrid(const Device device, const Grid grid)
{
cudaSetDevice(device->id);
ERRCHK_CUDA_ALWAYS(
cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
return AC_SUCCESS;
}
AcResult
autoOptimize(const Device device)
acDeviceAutoOptimize(const Device device)
{
cudaSetDevice(device->id);
@@ -513,12 +304,303 @@ autoOptimize(const Device device)
return AC_SUCCESS;
}
AcResult
acDeviceSynchronizeStream(const Device device, const Stream stream)
{
cudaSetDevice(device->id);
if (stream == STREAM_ALL) {
cudaDeviceSynchronize();
}
else {
cudaStreamSynchronize(device->streams[stream]);
}
return AC_SUCCESS;
}
AcResult
acDeviceSwapBuffers(const Device device)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp;
}
return AC_SUCCESS;
}
AcResult
acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
const AcReal value)
{
cudaSetDevice(device->id);
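// As in loadDeviceConstant above: CUDA exposes d_mesh_info only as a single device symbol,
// so we compute the member offset on the host and copy into the symbol at that offset.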
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice, device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices)
{
cudaSetDevice(device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);
const AcReal* src_ptr = &host_mesh.vertex_buffer[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &device->vba.in[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA( //
cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyHostToDevice, device->streams[stream]) //
);
return AC_SUCCESS;
}
AcResult
acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
const int3 src, const int3 dst, const int num_vertices)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src,
dst, num_vertices);
}
return AC_SUCCESS;
}
AcResult
acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle)
{
const int3 src = (int3){0, 0, 0};
const int3 dst = src;
const size_t num_vertices = acVertexBufferSize(device->local_config);
acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst,
num_vertices);
return AC_SUCCESS;
}
AcResult
acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i);
}
return AC_SUCCESS;
}
AcResult
acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
cudaSetDevice(device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);
const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA( //
cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) //
);
return AC_SUCCESS;
}
AcResult
acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst,
num_vertices, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acDeviceStoreVertexBuffer(const Device device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
{
int3 src = (int3){0, 0, 0};
int3 dst = src;
const size_t num_vertices = acVertexBufferSize(device->local_config);
acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices,
host_mesh);
return AC_SUCCESS;
}
AcResult
acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices, Device dst_device)
{
cudaSetDevice(src_device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);
const AcReal* src_ptr = &src_device->vba.in[vtxbuf_handle][src_idx];
AcReal* dst_ptr = &dst_device->vba.in[vtxbuf_handle][dst_idx];
const size_t bytes = num_vertices * sizeof(src_ptr[0]);
ERRCHK_CUDA(cudaMemcpyPeerAsync(dst_ptr, dst_device->id, src_ptr, src_device->id, bytes,
src_device->streams[stream]));
return AC_SUCCESS;
}
AcResult
acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src,
const int3 dst, const int num_vertices, Device dst_device)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceTransferVertexBufferWithOffset(src_device, stream, (VertexBufferHandle)i, src, dst,
num_vertices, dst_device);
}
return AC_SUCCESS;
}
AcResult
acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
const VertexBufferHandle vtxbuf_handle, Device dst_device)
{
int3 src = (int3){0, 0, 0};
int3 dst = src;
const size_t num_vertices = acVertexBufferSize(src_device->local_config);
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
return AC_SUCCESS;
}
AcResult
acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device)
{
WARNING("This function is deprecated");
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDeviceTransferVertexBuffer(src_device, stream, (VertexBufferHandle)i, dst_device);
}
return AC_SUCCESS;
}
AcResult
acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
const int3 start, const int3 end, const AcReal dt)
{
cudaSetDevice(device->id);
const dim3 tpb = rk3_tpb;
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
else
solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acDevicePeriodicBoundcondStep(const Device device, const Stream stream_type,
const VertexBufferHandle vtxbuf_handle, const int3 start,
const int3 end)
{
cudaSetDevice(device->id);
const cudaStream_t stream = device->streams[stream_type];
const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
(unsigned int)ceil((end.y - start.y) / (float)tpb.y),
(unsigned int)ceil((end.z - start.z) / (float)tpb.z));
kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, device->vba.in[vtxbuf_handle]);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
const int3 end)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acDevicePeriodicBoundcondStep(device, stream, (VertexBufferHandle)i, start, end);
}
return AC_SUCCESS;
}
AcResult
acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
AcResult
acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_vec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
#if PACKED_DATA_TRANSFERS
// Functions for calling packed data transfers
#endif
/*
* =============================================================================
* Revised interface
* =============================================================================
*/

View File

@@ -1,107 +0,0 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#pragma once
#include "astaroth.h"
typedef struct {
int3 m;
int3 n;
} Grid;
typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
// in Vulkan, f.ex. VkDevice
/** */
AcResult printDeviceInfo(const Device device);
/** */
AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device);
/** */
AcResult destroyDevice(Device device);
/** */
AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start,
const int3& end);
/** */
AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vec0, const VertexBufferHandle vec1,
const VertexBufferHandle vec2, AcReal* result);
/** */
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
const int3& start, const int3& end, const AcReal dt);
/** Synchronizes the device with respect to stream_type. If STREAM_ALL is given as
a StreamType, the function synchronizes all streams on the device. */
AcResult synchronize(const Device device, const StreamType stream_type);
/** */
AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
const AcMesh& host_mesh, const int3& src, const int3& dst,
const int num_vertices);
/** */
AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src,
const int3& dst, const int num_vertices, AcMesh* host_mesh);
/** */
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
Device dst, const int3& dst_idx, const int num_vertices);
/** Swaps the input/output buffers used in computations */
AcResult swapBuffers(const Device device);
/** */
AcResult loadDeviceConstant(const Device device, const StreamType stream_type,
const AcIntParam param, const int value);
/** */
AcResult loadDeviceConstant(const Device device, const StreamType stream_type,
const AcRealParam param, const AcReal value);
/** */
AcResult loadGlobalGrid(const Device device, const Grid grid);
/** */
AcResult autoOptimize(const Device device);
// #define PACKED_DATA_TRANSFERS (1) %JP: placeholder for optimized ghost zone packing and transfers
#if PACKED_DATA_TRANSFERS
// Declarations used for packed data transfers
#endif
/*
* =============================================================================
* Revised interface
* =============================================================================
*/

208
src/core/grid.cu Normal file
View File

@@ -0,0 +1,208 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#include "astaroth_grid.h"
#include "astaroth_node.h"
const size_t MAX_NUM_NODES = 32;
size_t num_nodes = 0;
static Node nodes[MAX_NUM_NODES];
/** */
AcResult
acGridInit(const AcMeshInfo node_config)
{
acNodeCreate(0, node_config, &nodes[0]);
++num_nodes;
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridQuit(void)
{
acNodeDestroy(nodes[0]);
--num_nodes;
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridSynchronizeStream(const Stream stream)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeSynchronizeStream(nodes[i], stream);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridSwapBuffers(void)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeSwapBuffers(nodes[i]);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeLoadConstant(nodes[i], stream, param, value);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeLoadVertexBufferWithOffset(nodes[i], stream, host_mesh, vtxbuf_handle, src, dst, num_vertices);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
const int3 dst, const int num_vertices)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeLoadMeshWithOffset(nodes[i], stream, host_mesh, src, dst, num_vertices);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle)
{
for (int i = 0; i < num_nodes; ++i) {
acNodeLoadVertexBuffer(nodes[i], stream, host_mesh, vtxbuf_handle);
}
WARNING("Proper multinode not yet implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridLoadMesh(const Stream stream, const AcMesh host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreVertexBufferWithOffset(const Stream stream, const VertexBufferHandle vtxbuf_handle,
const int3 src, const int3 dst, const int num_vertices,
AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
const int num_vertices, AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, const int3 end,
const AcReal dt)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridIntegrateStep(const Stream stream, const AcReal dt)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridPeriodicBoundcondStep(const Stream stream)
{
WARNING("Not implemented");
return AC_FAILURE;
}
#if 0
/** */
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vec0, const VertexBufferHandle vec1,
const VertexBufferHandle vec2, AcReal* result);
#endif
/** */
AcResult
acGridReduceScal(const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
WARNING("Not implemented");
return AC_FAILURE;
}
/** */
AcResult
acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0,
const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result)
{
WARNING("Not implemented");
return AC_FAILURE;
}

818
src/core/node.cu Normal file
View File

@@ -0,0 +1,818 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @file
* \brief Multi-GPU implementation.
*
%JP: The old way for computing boundary conditions conflicts with the
way we have to do things with multiple GPUs.
The older approach relied on unified memory, which represented the whole
memory area as one huge mesh instead of several smaller ones. However, unified memory
in its current state is meant more for quick prototyping when performance is not an issue.
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
than managing the memory explicitly.
In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids,
respectively. To simplify things, we require that each GPU is assigned the same amount of
work; therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to
work with.
The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices.
A local index (i, j, k) in some subgrid can be mapped to the global grid with
global idx = (i, j, k + device_id * subgrid.n.z)
Terminology:
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
Changes required to this commented code block:
- The thread block dimensions (tpb) are no longer passed to the kernel here but in
device.cu instead. Same holds for any complex index calculations. Instead, the local
coordinates should be passed as an int3 type without having to consider how the data is
actually laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
handle of type "Device" which should be passed to single-GPU functions. In this file, all
devices are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we
can optimize Astaroth by executing memory transactions concurrently with
computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
as a parameter and commands executing in different streams can be processed
in parallel/concurrently.
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
With multiple GPUs, periodic boundary conditions applied on indices ranging from
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without
having to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use
here, we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
### Throughout this file we use the following notation and names for various index offsets
Global coordinates: coordinates with respect to the global grid (static Grid grid)
Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid)
s0, s1: source indices in global coordinates
d0, d1: destination indices in global coordinates
da = max(s0, d0);
db = min(s1, d1);
These are used in at least
acLoad()
acStore()
acSynchronizeHalos()
Here we decompose the host mesh and distribute it among the GPUs in
the node.
The host mesh is a huge contiguous block of data. Its dimensions are given by
the global variable named "grid". A "grid" is decomposed into "subgrids",
one for each GPU. Here we check which parts of the range s0...s1 map
to the memory space stored by some GPU, ranging d0...d1, and transfer
the data if needed.
The index mapping is inherently quite involved, but here's a picture which
hopefully helps make sense out of all this.
Grid
|----num_vertices---|
xxx|....................................................|xxx
^ ^ ^ ^
d0 d1 s0 (src) s1
Subgrid
xxx|.............|xxx
^ ^
d0 d1
^ ^
db da
*
*/
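/* Illustrative example of the decomposition described above (numbers chosen purely for
   illustration): with grid.n = (128, 128, 128) and num_devices = 4, each subgrid has
   n = (128, 128, 32). A local index (i, j, k) on device 2 then maps to the global index
   (i, j, k + 2 * 32) = (i, j, k + 64). */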
#include "astaroth_node.h"
#include "astaroth_device.h"
#include "errchk.h"
#include "math_utils.h" // sum for reductions
static const int MAX_NUM_DEVICES = 32;
typedef struct {
int3 m;
int3 n;
} Grid;
struct node_s {
int id;
int num_devices;
Device devices[MAX_NUM_DEVICES];
Grid grid;
Grid subgrid;
AcMeshInfo config;
};
static int
gridIdx(const Grid grid, const int3 idx)
{
return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
}
static int3
gridIdx3d(const Grid grid, const int idx)
{
return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
idx / (grid.m.x * grid.m.y)};
}
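// Example (illustrative only): for grid.m = (8, 8, 8), gridIdx(grid, (int3){1, 2, 3}) ==
// 1 + 2*8 + 3*8*8 == 209, and gridIdx3d(grid, 209) == (1, 2, 3).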
static void
printInt3(const int3 vec)
{
printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
}
static inline void
print(const AcMeshInfo config)
{
for (int i = 0; i < NUM_INT_PARAMS; ++i)
printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
for (int i = 0; i < NUM_REAL_PARAMS; ++i)
printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
}
static void
update_builtin_params(AcMeshInfo* config)
{
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
///////////// PAD TEST
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
///////////// PAD TEST
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
config->int_params[AC_nx_min] = NGHOST;
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
config->int_params[AC_ny_min] = NGHOST;
config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
config->int_params[AC_nz_min] = NGHOST;
config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;
/* Additional helper params */
// Int helpers
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
}
static Grid
createGrid(const AcMeshInfo config)
{
Grid grid;
grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
return grid;
}
AcResult
acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
{
struct node_s* node = (struct node_s*)malloc(sizeof(*node));
node->id = id;
node->config = node_config;
// Get node->num_devices
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&node->num_devices));
if (node->num_devices < 1) {
ERROR("No CUDA devices found!");
return AC_FAILURE;
}
if (node->num_devices > MAX_NUM_DEVICES) {
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
node->num_devices = MAX_NUM_DEVICES;
}
if (!AC_MULTIGPU_ENABLED) {
WARNING("MULTIGPU_ENABLED was false. Using only one device");
node->num_devices = 1; // Use only one device if multi-GPU is not enabled
}
// Check that node->num_devices is divisible with AC_nz. This makes decomposing the
// problem domain to multiple GPUs much easier since we do not have to worry
// about remainders
ERRCHK_ALWAYS(node->config.int_params[AC_nz] % node->num_devices == 0);
// Decompose the problem domain
// The main grid
node->grid = createGrid(node->config);
// Subgrids
AcMeshInfo subgrid_config = node->config;
subgrid_config.int_params[AC_nz] /= node->num_devices;
update_builtin_params(&subgrid_config);
#if VERBOSE_PRINTING // Defined in astaroth.h
printf("###############################################################\n");
printf("Config dimensions recalculated:\n");
print(subgrid_config);
printf("###############################################################\n");
#endif
node->subgrid = createGrid(subgrid_config);
// Periodic boundary conditions become weird if the system can "fold unto itself".
ERRCHK_ALWAYS(node->subgrid.n.x >= STENCIL_ORDER);
ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER);
ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER);
#if VERBOSE_PRINTING
// clang-format off
printf("Grid m "); printInt3(node->grid.m); printf("\n");
printf("Grid n "); printInt3(node->grid.n); printf("\n");
printf("Subrid m "); printInt3(node->subgrid.m); printf("\n");
printf("Subrid n "); printInt3(node->subgrid.n); printf("\n");
// clang-format on
#endif
// Initialize the devices
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 multinode_offset = (int3){0, 0, 0}; // Placeholder
const int3 multigpu_offset = (int3){0, 0, i * node->subgrid.n.z};
subgrid_config.int3_params[AC_global_grid_n] = node->grid.n;
subgrid_config.int3_params[AC_multigpu_offset] = multinode_offset + multigpu_offset;
acDeviceCreate(i, subgrid_config, &node->devices[i]);
acDevicePrintInfo(node->devices[i]);
}
// Enable peer access
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int front = (i + 1) % node->num_devices;
const int back = (i - 1 + node->num_devices) % node->num_devices;
int can_access_front, can_access_back;
cudaDeviceCanAccessPeer(&can_access_front, i, front);
cudaDeviceCanAccessPeer(&can_access_back, i, back);
#if VERBOSE_PRINTING
printf(
"Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
i, front, can_access_front, back, can_access_back);
#endif
cudaSetDevice(i);
if (can_access_front) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
}
if (can_access_back) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
}
}
acNodeSynchronizeStream(node, STREAM_ALL);
*node_handle = node;
return AC_SUCCESS;
}
AcResult
acNodeDestroy(Node node)
{
acNodeSynchronizeStream(node, STREAM_ALL);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceDestroy(node->devices[i]);
}
free(node);
return AC_SUCCESS;
}
AcResult
acNodePrintInfo(const Node node)
{
(void)node;
WARNING("Not implemented");
return AC_FAILURE;
}
AcResult
acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config)
{
(void)node;
(void)config;
WARNING("Not implemented");
return AC_FAILURE;
}
AcResult
acNodeAutoOptimize(const Node node)
{
(void)node;
WARNING("Not implemented");
return AC_FAILURE;
}
AcResult
acNodeSynchronizeStream(const Node node, const Stream stream)
{
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceSynchronizeStream(node->devices[i], stream);
}
return AC_SUCCESS;
}
AcResult
acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle)
{
acNodeSynchronizeStream(node, stream);
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
// has been synchronized and transferred to appropriate subgrids
// We loop only to node->num_devices - 1 since the front and back plates of the grid are not
// transferred because their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before
// calling this function! I.e. the halos of subgrids must contain
// up-to-date data!
const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
// #pragma omp parallel for
for (int i = 0; i < node->num_devices - 1; ++i) {
// ...|ooooxxx|... -> xxx|ooooooo|...
const int3 src = (int3){0, 0, node->subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
const Device src_device = node->devices[i];
Device dst_device = node->devices[i + 1];
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
}
// #pragma omp parallel for
for (int i = 1; i < node->num_devices; ++i) {
// ...|ooooooo|xxx <- ...|xxxoooo|...
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};
const Device src_device = node->devices[i];
Device dst_device = node->devices[i - 1];
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
}
return AC_SUCCESS;
}
AcResult
acNodeSynchronizeMesh(const Node node, const Stream stream)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeSynchronizeVertexBuffer(node, stream, (VertexBufferHandle)i);
}
return AC_SUCCESS;
}
AcResult
acNodeSwapBuffers(const Node node)
{
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceSwapBuffers(node->devices[i]);
}
return AC_SUCCESS;
}
AcResult
acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
const AcReal value)
{
acNodeSynchronizeStream(node, stream);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceLoadConstant(node->devices[i], stream, param, value);
}
return AC_SUCCESS;
}
AcResult
acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices)
{
acNodeSynchronizeStream(node, stream);
// See the beginning of the file for an explanation of the index mapping
// // #pragma omp parallel for
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
const int3 s0 = src; // dst; // TODO fix
(void)dst; // TODO fix
const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);
const int3 da = max(s0, d0);
const int3 db = min(s1, d1);
/*
printf("Device %d\n", i);
printf("\ts0: "); printInt3(s0); printf("\n");
printf("\td0: "); printInt3(d0); printf("\n");
printf("\tda: "); printInt3(da); printf("\n");
printf("\tdb: "); printInt3(db); printf("\n");
printf("\td1: "); printInt3(d1); printf("\n");
printf("\ts1: "); printInt3(s1); printf("\n");
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
*/
if (db.z >= da.z) {
const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_global = da; // src + da - dst; // TODO fix
const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices};
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
// printf("\n");
acDeviceLoadVertexBufferWithOffset(node->devices[i], stream, host_mesh, vtxbuf_handle,
da_global, da_local, copy_cells);
}
// printf("\n");
}
return AC_SUCCESS;
}
AcResult
acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
const int3 src, const int3 dst, const int num_vertices)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, (VertexBufferHandle)i, src, dst,
num_vertices);
}
return AC_SUCCESS;
}
AcResult
acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
const VertexBufferHandle vtxbuf_handle)
{
const int3 src = (int3){0, 0, 0};
const int3 dst = src;
const size_t num_vertices = acVertexBufferSize(host_mesh.info);
acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, vtxbuf_handle, src, dst,
num_vertices);
return AC_SUCCESS;
}
AcResult
acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeLoadVertexBuffer(node, stream, host_mesh, (VertexBufferHandle)i);
}
return AC_SUCCESS;
}
AcResult
acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, const int3 src,
const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
acNodeSynchronizeStream(node, stream);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
const int3 s0 = src; // TODO fix
(void)dst; // TODO fix
const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);
const int3 da = max(s0, d0);
const int3 db = min(s1, d1);
if (db.z >= da.z) {
const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_local = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices};
const int3 da_global = da; // dst + da - src; // TODO fix
acDeviceStoreVertexBufferWithOffset(node->devices[i], stream, vtxbuf_handle, da_local,
da_global, copy_cells, host_mesh);
}
}
return AC_SUCCESS;
}
AcResult
acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst,
const int num_vertices, AcMesh* host_mesh)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeStoreVertexBufferWithOffset(node, stream, (VertexBufferHandle)i, src, dst,
num_vertices, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acNodeStoreVertexBuffer(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
{
const int3 src = (int3){0, 0, 0};
const int3 dst = src;
const size_t num_vertices = acVertexBufferSize(host_mesh->info);
acNodeStoreVertexBufferWithOffset(node, stream, vtxbuf_handle, src, dst, num_vertices,
host_mesh);
return AC_SUCCESS;
}
AcResult
acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodeStoreVertexBuffer(node, stream, (VertexBufferHandle)i, host_mesh);
}
return AC_SUCCESS;
}
AcResult
acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, const int3 start,
const int3 end, const AcReal dt)
{
acNodeSynchronizeStream(node, stream);
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z};
const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z};
acDeviceIntegrateSubstep(node->devices[i], stream, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
}
static AcResult
local_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf)
{
acNodeSynchronizeStream(node, stream);
if (node->num_devices > 1) {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z};
acDevicePeriodicBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1);
}
}
else {
acDevicePeriodicBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0},
node->subgrid.m);
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle)
{
acNodeSynchronizeStream(node, stream);
if (node->num_devices > 1) {
const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
{
// ...|ooooxxx|... -> xxx|ooooooo|...
const int3 src = (int3){0, 0, node->subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
const Device src_device = node->devices[node->num_devices - 1];
Device dst_device = node->devices[0];
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
}
{
// ...|ooooooo|xxx <- ...|xxxoooo|...
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};
const Device src_device = node->devices[0];
Device dst_device = node->devices[node->num_devices - 1];
acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
num_vertices, dst_device);
}
}
return AC_SUCCESS;
}
AcResult
acNodeIntegrate(const Node node, const AcReal dt)
{
acNodeSynchronizeStream(node, STREAM_ALL);
// xxx|OOO OOOOOOOOO OOO|xxx
// ^ ^ ^ ^
// n0 n1 n2 n3
// const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
// const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
// const int3 n2 = node->grid.n;
// const int3 n3 = n0 + node->grid.n;
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acNodeSynchronizeStream(node, STREAM_ALL);
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
local_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
}
acNodeSynchronizeStream(node, STREAM_ALL);
// Inner inner
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = node->subgrid.n;
acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt);
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
}
for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
acNodeSynchronizeStream(node, (Stream)vtxbuf);
}
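// The six segments below tile the outer shell of the computational domain (thickness NGHOST
// in each direction), which depends on the ghost-zone data exchanged and synchronized above,
// whereas the inner region could already be integrated concurrently with that exchange.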
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Front
const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Back
const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Bottom
const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Top
const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Left
const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
}
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) { // Right
const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
node->subgrid.n.z - 2 * NGHOST};
acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
}
acNodeSwapBuffers(node);
}
acNodeSynchronizeStream(node, STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acNodePeriodicBoundcondStep(const Node node, const Stream stream,
const VertexBufferHandle vtxbuf_handle)
{
local_boundcondstep(node, stream, vtxbuf_handle);
acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle);
// TODO NOTE GLOBAL BOUNDCONDS NOT DONE HERE IF MORE THAN 1 NODE
global_boundcondstep(node, stream, vtxbuf_handle);
// WARNING("Global boundconds should not be done here with multinode");
return AC_SUCCESS;
}
AcResult
acNodePeriodicBoundconds(const Node node, const Stream stream)
{
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i);
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results,
const int& n)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (node->grid.n.x * node->grid.n.y * node->grid.n.z);
res = sqrt(inv_n * res);
}
return res;
}
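// Illustrative example (numbers for illustration only, assuming each device returns its partial
// sum of squares for RTYPE_RMS): with two devices reporting 6.0 and 10.0 over a global grid of
// 4 points, the combined result is sqrt((6.0 + 10.0) / 4) == 2.0.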
AcResult
acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
acNodeSynchronizeStream(node, STREAM_ALL);
AcReal results[node->num_devices];
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]);
}
*result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
return AC_SUCCESS;
}
AcResult
acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype,
const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c,
AcReal* result)
{
acNodeSynchronizeStream(node, STREAM_ALL);
AcReal results[node->num_devices];
// #pragma omp parallel for
for (int i = 0; i < node->num_devices; ++i) {
acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]);
}
*result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
return AC_SUCCESS;
}

View File

@@ -29,13 +29,13 @@
#include <stdio.h>
#include "config_loader.h"
#include "src/core/math_utils.h"
#include "model/host_forcing.h"
#include "model/host_memory.h"
#include "model/host_timestep.h"
#include "model/model_boundconds.h"
#include "model/model_reduce.h"
#include "model/model_rk3.h"
#include "src/core/math_utils.h"
#include "src/core/errchk.h"
@@ -431,8 +431,9 @@ check_rk3(const AcMeshInfo& mesh_info)
acIntegrate(dt);
model_rk3(dt, model_mesh);
boundconds(model_mesh->info, model_mesh);
}
boundconds(model_mesh->info, model_mesh);
acBoundcondStep();
acStore(gpu_mesh);
bool is_acceptable = verify_meshes(*model_mesh, *gpu_mesh);
@@ -726,6 +727,7 @@ run_autotest(void)
// Device integration step
acIntegrate(dt);
acBoundcondStep();
acStore(candidate_mesh);
// Check fields

View File

@@ -197,7 +197,6 @@ run_benchmark(void)
double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100),
mesh_info.int_params[AC_nx]);
acStore(mesh);
acQuit();
acmesh_destroy(mesh);
}
@@ -269,7 +268,6 @@ run_benchmark(void)
write_result_to_file(wallclock * 1e3f / steps);
#endif
acStore(mesh);
acQuit();
acmesh_destroy(mesh);

View File

@@ -43,6 +43,9 @@ typedef struct {
ModelScalar value;
ModelVector gradient;
ModelMatrix hessian;
#if LUPWD
ModelVector upwind;
#endif
} ModelScalarData;
typedef struct {
@@ -273,6 +276,53 @@ derzz(const int i, const int j, const int k, const ModelScalar* arr)
return second_derivative(pencil, get(AC_inv_dsz));
}
#if LUPWD
static inline ModelScalar
der6x_upwd(const int i, const int j, const int k, const ModelScalar* arr)
{
ModelScalar inv_ds = get(AC_inv_dsx);
return ModelScalar(1.0/60.0)*inv_ds* (
-ModelScalar( 20.0)* arr[IDX(i, j, k)]
+ModelScalar( 15.0)*(arr[IDX(i+1, j, k)]
+ arr[IDX(i-1, j, k)])
-ModelScalar( 6.0)*(arr[IDX(i+2, j, k)]
+ arr[IDX(i-2, j, k)])
+ arr[IDX(i+3, j, k)]
+ arr[IDX(i-3, j, k)]);
}
static inline ModelScalar
der6y_upwd(const int i, const int j, const int k, const ModelScalar* arr)
{
ModelScalar inv_ds = get(AC_inv_dsy);
return ModelScalar(1.0/60.0)*inv_ds* (
-ModelScalar( 20.0)* arr[IDX(i, j, k)]
+ModelScalar( 15.0)*(arr[IDX(i, j+1, k)]
+ arr[IDX(i, j-1, k)])
-ModelScalar( 6.0)*(arr[IDX(i, j+2, k)]
+ arr[IDX(i, j-2, k)])
+ arr[IDX(i, j+3, k)]
+ arr[IDX(i, j-3, k)]);
}
static inline ModelScalar
der6z_upwd(const int i, const int j, const int k, const ModelScalar* arr)
{
ModelScalar inv_ds = get(AC_inv_dsz);
return ModelScalar(1.0/60.0)*inv_ds* (
-ModelScalar( 20.0)* arr[IDX(i, j, k )]
+ModelScalar( 15.0)*(arr[IDX(i, j, k+1)]
+ arr[IDX(i, j, k-1)])
-ModelScalar( 6.0)*(arr[IDX(i, j, k+2)]
+ arr[IDX(i, j, k-2)])
+ arr[IDX(i, j, k+3)]
+ arr[IDX(i, j, k-3)]);
}
#endif
static inline ModelScalar
compute_value(const int i, const int j, const int k, const ModelScalar* arr)
{
@@ -285,6 +335,14 @@ compute_gradient(const int i, const int j, const int k, const ModelScalar* arr)
return (ModelVector){derx(i, j, k, arr), dery(i, j, k, arr), derz(i, j, k, arr)};
}
#if LUPWD
static inline ModelVector
compute_upwind(const int i, const int j, const int k, const ModelScalar* arr)
{
return (ModelVector){der6x_upwd(i, j, k, arr), der6y_upwd(i, j, k, arr), der6z_upwd(i, j, k, arr)};
}
#endif
static inline ModelMatrix
compute_hessian(const int i, const int j, const int k, const ModelScalar* arr)
{
@@ -309,6 +367,10 @@ read_data(const int i, const int j, const int k, ModelScalar* buf[], const int h
// diagonals with all arrays
data.hessian = compute_hessian(i, j, k, buf[handle]);
#if LUPWD
data.upwind = compute_upwind(i, j, k, buf[handle]);
#endif
return data;
}
@@ -354,6 +416,8 @@ gradients(const ModelVectorData& data)
return (ModelMatrix){gradient(data.x), gradient(data.y), gradient(data.z)};
}
/*
* =============================================================================
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)
@@ -502,10 +566,27 @@ contract(const ModelMatrix& mat)
* Stencil Processing Stage (equations)
* =============================================================================
*/
#if LUPWD
ModelScalar
upwd_der6(const ModelVectorData& uu, const ModelScalarData& lnrho)
{
ModelScalar uux = fabs(value(uu).x);
ModelScalar uuy = fabs(value(uu).y);
ModelScalar uuz = fabs(value(uu).z);
return uux*lnrho.upwind.x + uuy*lnrho.upwind.y + uuz*lnrho.upwind.z;
}
#endif
static inline ModelScalar
continuity(const ModelVectorData& uu, const ModelScalarData& lnrho)
{
return -dot(value(uu), gradient(lnrho)) - divergence(uu);
return -dot(value(uu), gradient(lnrho))
#if LUPWD
//This is a corrective hyperdiffusion term for upwinding.
+ upwd_der6(uu, lnrho)
#endif
- divergence(uu);
}
static inline ModelScalar
@@ -679,23 +760,24 @@ helical_forcing(ModelScalar magnitude, ModelVector k_force, ModelVector xx, Mode
ModelVector ff_im, ModelScalar phi)
{
(void)magnitude; // WARNING: unused
xx.x = xx.x * (2.0l * M_PI / (get(AC_dsx) * (get(AC_ny_max) - get(AC_ny_min))));
xx.y = xx.y * (2.0l * M_PI / (get(AC_dsy) * (get(AC_ny_max) - get(AC_ny_min))));
xx.z = xx.z * (2.0l * M_PI / (get(AC_dsz) * (get(AC_ny_max) - get(AC_ny_min))));
xx.x = xx.x*(2.0*M_PI/(get(AC_dsx)*get(AC_nx)));
xx.y = xx.y*(2.0*M_PI/(get(AC_dsy)*get(AC_ny)));
xx.z = xx.z*(2.0*M_PI/(get(AC_dsz)*get(AC_nz)));
ModelScalar cosl_phi = cosl(phi);
ModelScalar sinl_phi = sinl(phi);
ModelScalar cosl_k_dox_x = cosl(dot(k_force, xx));
ModelScalar sinl_k_dox_x = sinl(dot(k_force, xx));
ModelScalar cos_phi = cosl(phi);
ModelScalar sin_phi = sinl(phi);
ModelScalar cos_k_dot_x = cosl(dot(k_force, xx));
ModelScalar sin_k_dot_x = sinl(dot(k_force, xx));
// Phase affects only the x-component
// ModelScalar real_comp = cosl_k_dox_x;
// ModelScalar imag_comp = sinl_k_dox_x;
ModelScalar real_comp_phase = cosl_k_dox_x * cosl_phi - sinl_k_dox_x * sinl_phi;
ModelScalar imag_comp_phase = cosl_k_dox_x * sinl_phi + sinl_k_dox_x * cosl_phi;
//Scalar real_comp = cos_k_dot_x;
//Scalar imag_comp = sin_k_dot_x;
ModelScalar real_comp_phase = cos_k_dot_x*cos_phi - sin_k_dot_x*sin_phi;
ModelScalar imag_comp_phase = cos_k_dot_x*sin_phi + sin_k_dot_x*cos_phi;
ModelVector force = (ModelVector){ff_re.x * real_comp_phase - ff_im.x * imag_comp_phase,
ff_re.y * real_comp_phase - ff_im.y * imag_comp_phase,
ff_re.z * real_comp_phase - ff_im.z * imag_comp_phase};
ModelVector force = (ModelVector){ ff_re.x*real_comp_phase - ff_im.x*imag_comp_phase,
ff_re.y*real_comp_phase - ff_im.y*imag_comp_phase,
ff_re.z*real_comp_phase - ff_im.z*imag_comp_phase};
return force;
}

View File

@@ -32,13 +32,13 @@
#include <string.h> // memcpy
#include "config_loader.h"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
#include "model/host_forcing.h"
#include "model/host_memory.h"
#include "model/host_timestep.h"
#include "model/model_reduce.h"
#include "model/model_rk3.h"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
#include "timer_hires.h"
// Window
@@ -106,6 +106,7 @@ renderer_init(const int& mx, const int& my)
// Setup window
window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
window_width, window_height, SDL_WINDOW_SHOWN);
ERRCHK_ALWAYS(window);
// Setup SDL renderer
renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
@@ -409,9 +410,10 @@ run_renderer(void)
/* Render */
const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
if (timer_diff_sec >= desired_frame_time) {
// acStore(mesh);
const int num_vertices = mesh->info.int_params[AC_mxy];
const int3 dst = (int3){0, 0, k_slice};
acBoundcondStep();
// acStore(mesh);
acStoreWithOffset(dst, num_vertices, mesh);
acSynchronizeStream(STREAM_ALL);
renderer_draw(*mesh); // Bottleneck is here
@@ -421,7 +423,6 @@ run_renderer(void)
}
printf("Wallclock time %f s\n", double(timer_diff_nsec(wallclock) / 1e9f));
acStore(mesh);
acQuit();
acmesh_destroy(mesh);

View File

@@ -27,13 +27,13 @@
#include "run.h"
#include "config_loader.h"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
#include "model/host_forcing.h"
#include "model/host_memory.h"
#include "model/host_timestep.h"
#include "model/model_reduce.h"
#include "model/model_rk3.h"
#include "src/core/errchk.h"
#include "src/core/math_utils.h"
#include "timer_hires.h"
#include <string.h>
@@ -216,6 +216,7 @@ run_simulation(void)
write_mesh_info(&mesh_info);
print_diagnostics(0, AcReal(.0), t_step, diag_file);
acBoundcondStep();
acStore(mesh);
save_mesh(*mesh, 0, t_step);
@@ -229,13 +230,13 @@ run_simulation(void)
/* initialize random seed: */
srand(312256655);
//TODO_SINK. init_sink_particle()
// TODO_SINK. init_sink_particle()
// Initialize the basic variables of the sink particle to a suitable initial value.
// 1. Location of the particle
// 2. Mass of the particle
// (3. Velocity of the particle)
// This at the level of Host in this case.
// acUpdate_sink_particle() will do the similar trick to the device.
// This at the level of Host in this case.
// acUpdate_sink_particle() will do the similar trick to the device.
/* Step the simulation */
for (int i = 1; i < max_steps; ++i) {
@@ -247,28 +248,28 @@ run_simulation(void)
loadForcingParamsToDevice(forcing_params);
#endif
//TODO_SINK acUpdate_sink_particle()
// Update properties of the sing particle for acIntegrate(). Essentially:
// 1. Location of the particle
// 2. Mass of the particle
// (3. Velocity of the particle)
// 3. Velocity of the particle)
// These can be used for calculating he gravitational field.
// This is my first comment! by Jack
// This is my second comment! by Jack
acIntegrate(dt);
//TODO_SINK acAdvect_sink_particle()
// THIS IS OPTIONAL. We will start from unmoving particle.
// 1. Calculate the equation of motion for the sink particle.
// NOTE: Might require embedding with acIntegrate(dt).
// TODO_SINK acAdvect_sink_particle()
// THIS IS OPTIONAL. We will start from unmoving particle.
// 1. Calculate the equation of motion for the sink particle.
// NOTE: Might require embedding with acIntegrate(dt).
//TODO_SINK acAccrete_sink_particle()
// Calculate accretion of the sink particle from the surrounding medium
// 1. Transfer density into sink particle mass
// 2. Transfer momentum into sink particle
// (OPTIONAL: Affection the motion of the particle)
// NOTE: Might require embedding with acIntegrate(dt).
// This is the hardest part. Please see Lee et al. ApJ 783 (2014) for reference.
// TODO_SINK acAccrete_sink_particle()
// Calculate accretion of the sink particle from the surrounding medium
// 1. Transfer density into sink particle mass
// 2. Transfer momentum into sink particle
// (OPTIONAL: Affection the motion of the particle)
// NOTE: Might require embedding with acIntegrate(dt).
// This is the hardest part. Please see Lee et al. ApJ 783 (2014) for reference.
t_step += dt;
@@ -311,9 +312,10 @@ run_simulation(void)
assumed to be asynchronous, so the meshes must be also synchronized
before transferring the data to the CPU. Like so:
acSynchronize();
acBoundcondStep();
acStore(mesh);
*/
acBoundcondStep();
acStore(mesh);
save_mesh(*mesh, i, t_step);