Merge branch 'master' into sink_20190723

Conflicts:
	acc/mhd_solver/stencil_defines.h
	src/standalone/simulation.cc
@@ -21,9 +21,12 @@ der6x_upwd(in Scalar vertex)
    return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
                   - Scalar(20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z]
                   + Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z])
                   - Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z])
                   +              vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z] + vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])};
                   + Scalar(15.0)*(vertex[vertexIdx.x+1, vertexIdx.y, vertexIdx.z]
                                 + vertex[vertexIdx.x-1, vertexIdx.y, vertexIdx.z])
                   - Scalar( 6.0)*(vertex[vertexIdx.x+2, vertexIdx.y, vertexIdx.z]
                                 + vertex[vertexIdx.x-2, vertexIdx.y, vertexIdx.z])
                   +              vertex[vertexIdx.x+3, vertexIdx.y, vertexIdx.z]
                   +              vertex[vertexIdx.x-3, vertexIdx.y, vertexIdx.z])};
}

Preprocessed Scalar
@@ -33,9 +36,12 @@ der6y_upwd(in Scalar vertex)
    return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
                   - Scalar(20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z]
                   + Scalar(15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z])
                   - Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z])
                   +              vertex[vertexIdx.x, vertexIdx.y+3, vertexIdx.z] + vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])};
                   + Scalar(15.0)*(vertex[vertexIdx.x, vertexIdx.y+1, vertexIdx.z]
                                 + vertex[vertexIdx.x, vertexIdx.y-1, vertexIdx.z])
                   - Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y+2, vertexIdx.z]
                                 + vertex[vertexIdx.x, vertexIdx.y-2, vertexIdx.z])
                   +              vertex[vertexIdx.x, vertexIdx.y+3, vertexIdx.z]
                   +              vertex[vertexIdx.x, vertexIdx.y-3, vertexIdx.z])};
}

Preprocessed Scalar
@@ -45,9 +51,12 @@ der6z_upwd(in Scalar vertex)
    return (Scalar){ Scalar(1.0/60.0)*inv_ds* (
                   - Scalar(20.0)* vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z]
                   + Scalar(15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1])
                   - Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2])
                   +              vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3] + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])};
                   + Scalar(15.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+1]
                                 + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-1])
                   - Scalar( 6.0)*(vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+2]
                                 + vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-2])
                   +              vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z+3]
                   +              vertex[vertexIdx.x, vertexIdx.y, vertexIdx.z-3])};
}

#endif
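
For reference, the operator above is the 6th-order "upwinding" derivative with coefficients (-20, 15, -6, 1)/60, whose weights sum to zero so it vanishes on constant fields. A minimal standalone C sketch of the same 1-D stencil (assumed semantics, not part of this commit):

#include <assert.h>
#include <math.h>

/* 1-D sketch of der6_upwd: (1/60) * inv_ds * (-20*f[i] + 15*(f[i+1]+f[i-1])
 *                                             - 6*(f[i+2]+f[i-2]) + (f[i+3]+f[i-3])) */
static double
der6_upwd_1d(const double* f, int i, double inv_ds)
{
    return (1.0 / 60.0) * inv_ds *
           (-20.0 * f[i]
            + 15.0 * (f[i + 1] + f[i - 1])
            -  6.0 * (f[i + 2] + f[i - 2])
            +        (f[i + 3] + f[i - 3]));
}

int
main(void)
{
    double f[7] = {1, 1, 1, 1, 1, 1, 1}; /* constant field */
    assert(fabs(der6_upwd_1d(f, 3, 1.0)) < 1e-12); /* -20 + 30 - 12 + 2 == 0 */
    return 0;
}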
@@ -309,23 +309,26 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
    // JP: This looks wrong:
    //     1) Should it be dsx * nx instead of dsx * ny?
    //     2) Should you also use globalGrid.n instead of the local n?
    // MV: You are right. Made a quickfix. I did not see the error because multigpu is split
    //     in the z direction, not the y direction.
    //     3) Also final point: can we do this with vectors/quaternions instead?
    //        Trigonometric functions are much more expensive and inaccurate.
    // MV: Good idea. Not an immediate priority.
    // Fun related article:
    // https://randomascii.wordpress.com/2014/10/09/intel-underestimates-error-bounds-by-1-3-quintillion/
    xx.x = xx.x*(2.0*M_PI/(dsx*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min))));
    xx.y = xx.y*(2.0*M_PI/(dsy*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min))));
    xx.z = xx.z*(2.0*M_PI/(dsz*(DCONST_INT(AC_ny_max) - DCONST_INT(AC_ny_min))));
    xx.x = xx.x*(2.0*M_PI/(dsx*globalGridN.x));
    xx.y = xx.y*(2.0*M_PI/(dsy*globalGridN.y));
    xx.z = xx.z*(2.0*M_PI/(dsz*globalGridN.z));

    Scalar cos_phi     = cos(phi);
    Scalar sin_phi     = sin(phi);
    Scalar cos_k_dox_x = cos(dot(k_force, xx));
    Scalar sin_k_dox_x = sin(dot(k_force, xx));
    Scalar cos_k_dot_x = cos(dot(k_force, xx));
    Scalar sin_k_dot_x = sin(dot(k_force, xx));
    // Phase affects only the x-component
    //Scalar real_comp = cos_k_dox_x;
    //Scalar imag_comp = sin_k_dox_x;
    Scalar real_comp_phase = cos_k_dox_x*cos_phi - sin_k_dox_x*sin_phi;
    Scalar imag_comp_phase = cos_k_dox_x*sin_phi + sin_k_dox_x*cos_phi;
    //Scalar real_comp = cos_k_dot_x;
    //Scalar imag_comp = sin_k_dot_x;
    Scalar real_comp_phase = cos_k_dot_x*cos_phi - sin_k_dot_x*sin_phi;
    Scalar imag_comp_phase = cos_k_dot_x*sin_phi + sin_k_dot_x*cos_phi;


    Vector force = (Vector){ ff_re.x*real_comp_phase - ff_im.x*imag_comp_phase,
@@ -338,9 +341,9 @@ helical_forcing(Scalar magnitude, Vector k_force, Vector xx, Vector ff_re, Vecto
 Vector
 forcing(int3 globalVertexIdx, Scalar dt)
 {
    Vector a = Scalar(.5) * (Vector){globalGrid.n.x * dsx,
                                     globalGrid.n.y * dsy,
                                     globalGrid.n.z * dsz}; // source (origin)
    Vector a = Scalar(.5) * (Vector){globalGridN.x * dsx,
                                     globalGridN.y * dsy,
                                     globalGridN.z * dsz}; // source (origin)
    Vector xx = (Vector){(globalVertexIdx.x - nx_min) * dsx,
                         (globalVertexIdx.y - ny_min) * dsy,
                         (globalVertexIdx.z - nz_min) * dsz}; // sink (current index)
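
The real_comp_phase/imag_comp_phase expressions above are the angle-addition identities cos(a + phi) and sin(a + phi) with a = dot(k_force, xx); in complex terms, one multiply e^{ia} * e^{i phi}. A small C check of that identity (illustrative values only):

#include <assert.h>
#include <math.h>

int
main(void)
{
    const double a = 1.234, phi = 0.567;
    const double re = cos(a) * cos(phi) - sin(a) * sin(phi); /* real_comp_phase */
    const double im = cos(a) * sin(phi) + sin(a) * cos(phi); /* imag_comp_phase */
    assert(fabs(re - cos(a + phi)) < 1e-12);
    assert(fabs(im - sin(a + phi)) < 1e-12);
    return 0;
}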
doc/manual/.gitignore (vendored, new file)
@@ -0,0 +1,2 @@
*.html
*.pdf
@@ -24,95 +24,62 @@ extern "C" {

#include "astaroth_defines.h"

/** Checks whether there are any CUDA devices available. Returns AC_SUCCESS if there is 1 or more,
 * AC_FAILURE otherwise. */
AcResult acCheckDeviceAvailability(void);
#include "astaroth_device.h"
#include "astaroth_node.h"

/** Synchronizes the stream shared by all GPUs in the node. Synchronizes all streams if STREAM_ALL
 * is passed as a parameter */
AcResult acSynchronizeStream(const StreamType stream);
/*
#include "astaroth_grid.h"
#define acInit(x) acGridInit(x)
#define acQuit() acGridQuit()
#define acLoad(x) acGridLoadMesh(STREAM_DEFAULT, x)
#define acReduceScal(x, y) acGridReduceScal(STREAM_DEFAULT, x, y)
#define acReduceVec(x, y, z, w) acGridReduceVec(STREAM_DEFAULT, x, y, z, w)
#define acBoundcondStep() acGridPeriodicBoundcondStep(STREAM_DEFAULT)
#define acIntegrate(x) acGridIntegrateStep(STREAM_DEFAULT, x)
#define acStore(x) acGridStoreMesh(STREAM_DEFAULT, x)
#define acSynchronizeStream(x) acGridSynchronizeStream(x)
#define acLoadDeviceConstant(x, y) acGridLoadConstant(STREAM_DEFAULT, x, y)
*/

/** Synchronizes the mesh distributed across multiple GPUs. Must be called if the data in the halos
 * of neighboring GPUs has been modified by an asynchronous function, f.ex. acBoundcondStep() */
AcResult acSynchronizeMesh(void);

/** Starting point of all GPU computation. Handles the allocation and
    initialization of *all memory needed on all GPUs in the node*. In other words,
    sets up everything GPU-side so that calling any other GPU interface function
    afterwards does not result in illegal memory accesses. */
/** Allocates all memory and initializes the devices visible to the caller. Should be
 * called before any other function in this interface. */
AcResult acInit(const AcMeshInfo mesh_info);

/** Frees all GPU allocations and resets all devices in the node. Should be
 * called at exit. */
AcResult acQuit(void);

/** Does all three substeps of the RK3 integration and computes the boundary
    conditions when necessary. The result is synchronized and the boundary conditions are applied
    after the final substep, after which the result can be fetched to CPU memory with acStore. */
/** Synchronizes a specific stream. All streams are synchronized if STREAM_ALL is passed as a
 * parameter */
AcResult acSynchronizeStream(const Stream stream);

/** Loads a constant to the memories of the devices visible to the caller */
AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);

/** Loads an AcMesh to the devices visible to the caller */
AcResult acLoad(const AcMesh host_mesh);

/** Stores the AcMesh distributed among the devices visible to the caller back to the host */
AcResult acStore(AcMesh* host_mesh);

/** Performs Runge-Kutta 3 integration. Note: Boundary conditions are not applied after the final
 * substep and the user is responsible for calling acBoundcondStep before reading the data. */
AcResult acIntegrate(const AcReal dt);

/** Performs a scalar reduction on all GPUs in the node and returns the result. Operates on the
 * whole computational domain, which must be up to date and synchronized before calling
 * acReduceScal.
 */
AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle a);
/** Applies periodic boundary conditions for the Mesh distributed among the devices visible to the
 * caller */
AcResult acBoundcondStep(void);

/** Performs a vector reduction on all GPUs in the node and returns the result. Operates on the
 * whole computational domain, which must be up to date and synchronized before calling
 * acReduceVec.
 */
/** Does a scalar reduction with the data stored in some vertex buffer */
AcReal acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle);

/** Does a vector reduction with vertex buffers where the vector components are (a, b, c) */
AcReal acReduceVec(const ReductionType rtype, const VertexBufferHandle a,
                   const VertexBufferHandle b, const VertexBufferHandle c);

/** Distributes the host mesh among the GPUs in the node. Synchronous. */
AcResult acLoad(const AcMesh host_mesh);

/** Gathers the mesh stored across GPUs in the node and stores it back to host memory. Synchronous.
 */
AcResult acStore(AcMesh* host_mesh);

/*
 * =============================================================================
 * Astaroth interface: Advanced functions. Asynchronous.
 * =============================================================================
 */
/** Loads a parameter to the constant memory of all GPUs in the node. Asynchronous. */
AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
AcResult acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value,
                                   const StreamType stream);

/** Splits a subset of the host_mesh and distributes it among the GPUs in the node. Asynchronous. */
AcResult acLoadWithOffset(const AcMesh host_mesh, const int3 start, const int num_vertices);
AcResult acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 start, const int num_vertices,
                               const StreamType stream);

/** Gathers a subset of the data distributed among the GPUs in the node and stores the mesh back to
 * CPU memory. Asynchronous.
 */
AcResult acStoreWithOffset(const int3 start, const int num_vertices, AcMesh* host_mesh);
AcResult acStoreWithOffsetAsync(const int3 start, const int num_vertices, AcMesh* host_mesh,
                                const StreamType stream);

/** Performs a single RK3 step without computing boundary conditions. Asynchronous. */
AcResult acIntegrateStep(const int isubstep, const AcReal dt);
AcResult acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream);

/** Performs a single RK3 step on a subset of the mesh without computing the boundary conditions.
 * Asynchronous. */
AcResult acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start,
                                   const int3 end);
AcResult acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start,
                                        const int3 end, const StreamType stream);

/** Performs the boundary condition step on the GPUs in the node. Asynchronous. */
AcResult acBoundcondStep(void);
AcResult acBoundcondStepAsync(const StreamType stream);

/*
 * =============================================================================
 * Revised interface
 * =============================================================================
/** Stores a subset of the mesh stored across the devices visible to the caller back to host memory.
 */
AcResult acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh);

#ifdef __cplusplus
} // extern "C"
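
Taken together, the synchronous declarations above suggest a call sequence like the following. This is a hypothetical sketch, not code from this commit: how AcMeshInfo and AcMesh are populated is not shown in this diff, and VTXBUF_UUX is an assumed vertex-buffer handle.

#include "astaroth.h"

AcReal
run_simulation(const AcMeshInfo info, AcMesh* mesh, const AcReal dt, const int nsteps)
{
    acInit(info);        /* allocate and initialize all visible devices */
    acLoad(*mesh);       /* distribute the host mesh among the GPUs */
    acBoundcondStep();   /* halos must be valid before integrating */
    for (int i = 0; i < nsteps; ++i)
        acIntegrate(dt); /* RK3 */
    acBoundcondStep();   /* per acIntegrate's contract, reapply before reading */
    const AcReal urms = acReduceScal(RTYPE_RMS, VTXBUF_UUX); /* VTXBUF_UUX assumed */
    acStore(mesh);       /* gather the result back to host memory */
    acQuit();
    return urms;
}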
@@ -86,7 +86,9 @@ typedef struct {
    FUNC(AC_nxy),\
    FUNC(AC_nxyz),\

#define AC_FOR_BUILTIN_INT3_PARAM_TYPES(FUNC)
#define AC_FOR_BUILTIN_INT3_PARAM_TYPES(FUNC)\
    FUNC(AC_global_grid_n),\
    FUNC(AC_multigpu_offset),

#define AC_FOR_BUILTIN_REAL_PARAM_TYPES(FUNC)

@@ -106,9 +108,26 @@ typedef enum {

typedef enum {
    STREAM_DEFAULT,
    NUM_STREAM_TYPES, //
    STREAM_ALL
} StreamType;
    STREAM_0,
    STREAM_1,
    STREAM_2,
    STREAM_3,
    STREAM_4,
    STREAM_5,
    STREAM_6,
    STREAM_7,
    STREAM_8,
    STREAM_9,
    STREAM_10,
    STREAM_11,
    STREAM_12,
    STREAM_13,
    STREAM_14,
    STREAM_15,
    STREAM_16,
    NUM_STREAM_TYPES
} Stream;
#define STREAM_ALL (NUM_STREAM_TYPES)

#define AC_GEN_ID(X) X
typedef enum {
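
With the new Stream enum, independent work can be issued on numbered streams and synchronized selectively; STREAM_ALL is now a define equal to NUM_STREAM_TYPES rather than an enumerator. A hypothetical overlap sketch using the Device functions declared later in this commit (VTXBUF_LNRHO and VTXBUF_UUX are assumed vertex-buffer handles):

void
overlap_example(const Device device, const AcMesh host_mesh)
{
    /* Issue two host->device transfers on distinct streams so they can overlap. */
    acDeviceLoadVertexBuffer(device, STREAM_0, host_mesh, VTXBUF_LNRHO); /* assumed handle */
    acDeviceLoadVertexBuffer(device, STREAM_1, host_mesh, VTXBUF_UUX);   /* assumed handle */

    /* Wait for one stream, or for all of them. */
    acDeviceSynchronizeStream(device, STREAM_0);
    acDeviceSynchronizeStream(device, STREAM_ALL); /* STREAM_ALL == NUM_STREAM_TYPES */
}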
include/astaroth_device.h (new file, 127 lines)
@@ -0,0 +1,127 @@
/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

#include "astaroth_defines.h"

typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
                                 // in Vulkan, f.ex. VkDevice

/** */
AcResult acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device);

/** */
AcResult acDeviceDestroy(Device device);

/** */
AcResult acDevicePrintInfo(const Device device);

/** */
AcResult acDeviceAutoOptimize(const Device device);

/** */
AcResult acDeviceSynchronizeStream(const Device device, const Stream stream);

/** */
AcResult acDeviceSwapBuffers(const Device device);

/** */
AcResult acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
                              const AcReal value);

/** */
AcResult acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream,
                                            const AcMesh host_mesh,
                                            const VertexBufferHandle vtxbuf_handle, const int3 src,
                                            const int3 dst, const int num_vertices);

/** Deprecated */
AcResult acDeviceLoadMeshWithOffset(const Device device, const Stream stream,
                                    const AcMesh host_mesh, const int3 src, const int3 dst,
                                    const int num_vertices);

/** */
AcResult acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
                                  const VertexBufferHandle vtxbuf_handle);

/** Deprecated */
AcResult acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh);

/** */
AcResult acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream,
                                             const VertexBufferHandle vtxbuf_handle, const int3 src,
                                             const int3 dst, const int num_vertices,
                                             AcMesh* host_mesh);

/** Deprecated */
AcResult acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
                                     const int3 dst, const int num_vertices, AcMesh* host_mesh);

/** */
AcResult acDeviceStoreVertexBuffer(const Device device, const Stream stream,
                                   const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);

/** Deprecated */
AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh);

/** */
AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
                                                const VertexBufferHandle vtxbuf_handle,
                                                const int3 src, const int3 dst,
                                                const int num_vertices, Device dst_device);

/** Deprecated */
AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream,
                                        const int3 src, const int3 dst, const int num_vertices,
                                        Device* dst_device);

/** */
AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
                                      const VertexBufferHandle vtxbuf_handle, Device dst_device);

/** Deprecated */
AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device);

/** */
AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
                                  const int3 start, const int3 end, const AcReal dt);
/** */
AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
                                       const VertexBufferHandle vtxbuf_handle, const int3 start,
                                       const int3 end);

/** */
AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
                                    const int3 end);

/** */
AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
                            const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype,
                           const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                           const VertexBufferHandle vtxbuf2, AcReal* result);

#ifdef __cplusplus
} // extern "C"
#endif
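
A plausible single-GPU round trip through the Device layer above; this is a sketch under assumptions, not code from this commit. `info`, `mesh`, `start`, and `end` are assumed to be prepared by the caller, and STREAM_DEFAULT is used the way other new code in this commit uses it.

#include "astaroth_device.h"

void
device_roundtrip(const AcMeshInfo info, AcMesh* mesh, const AcReal dt,
                 const int3 start, const int3 end)
{
    Device device;
    acDeviceCreate(0, info, &device);                /* GPU 0 */
    acDeviceLoadMesh(device, STREAM_DEFAULT, *mesh); /* host -> device (deprecated, but declared above) */

    for (int substep = 0; substep < 3; ++substep) {  /* RK3 */
        acDeviceIntegrateSubstep(device, STREAM_DEFAULT, substep, start, end, dt);
        acDeviceSwapBuffers(device);
        acDevicePeriodicBoundconds(device, STREAM_DEFAULT, start, end);
    }
    acDeviceSynchronizeStream(device, STREAM_ALL);
    acDeviceStoreMesh(device, STREAM_DEFAULT, mesh); /* device -> host */
    acDeviceDestroy(device);
}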
include/astaroth_grid.h (new file, 99 lines)
@@ -0,0 +1,99 @@
/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

#include "astaroth_defines.h"

/** */
AcResult acGridInit(const AcMeshInfo node_config);

/** */
AcResult acGridQuit(void);

/** */
AcResult acGridSynchronizeStream(const Stream stream);

/** */
AcResult acGridSwapBuffers(void);

/** */
AcResult acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value);

/** */
AcResult acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
                                          const VertexBufferHandle vtxbuf_handle, const int3 src,
                                          const int3 dst, const int num_vertices);

/** */
AcResult acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
                                  const int3 dst, const int num_vertices);

/** */
AcResult acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
                                const VertexBufferHandle vtxbuf_handle);

/** */
AcResult acGridLoadMesh(const Stream stream, const AcMesh host_mesh);

/** */
AcResult acGridStoreVertexBufferWithOffset(const Stream stream,
                                           const VertexBufferHandle vtxbuf_handle, const int3 src,
                                           const int3 dst, const int num_vertices,
                                           AcMesh* host_mesh);

/** */
AcResult acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
                                   const int num_vertices, AcMesh* host_mesh);

/** */
AcResult acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
                                 AcMesh* host_mesh);

/** */
AcResult acGridStoreMesh(const Stream stream, AcMesh* host_mesh);

/** */
AcResult acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start,
                                const int3 end, const AcReal dt);

/** */
AcResult acGridIntegrateStep(const Stream stream, const AcReal dt);

/** */
AcResult acGridPeriodicBoundcondStep(const Stream stream);
/** */
/* acGridReduceScal(const Stream stream, const ReductionType rtype,
                    const VertexBufferHandle vtxbuf_handle, AcReal* result); */
AcReal acGridReduceScal(const Stream stream, const ReductionType rtype,
                        const VertexBufferHandle vtxbuf_handle);
/** */
/*AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
                           const VertexBufferHandle vec0, const VertexBufferHandle vec1,
                           const VertexBufferHandle vec2, AcReal* result);*/
AcReal acGridReduceVec(const Stream stream, const ReductionType rtype,
                       const VertexBufferHandle vec0, const VertexBufferHandle vec1,
                       const VertexBufferHandle vec2);

#ifdef __cplusplus
} // extern "C"
#endif
include/astaroth_node.h (new file, 124 lines)
@@ -0,0 +1,124 @@
/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

#include "astaroth_defines.h"

typedef struct node_s* Node; // Opaque pointer to node_s.

typedef struct {

} DeviceConfiguration;

/** */
AcResult acNodeCreate(const int id, const AcMeshInfo node_config, Node* node);

/** */
AcResult acNodeDestroy(Node node);

/** */
AcResult acNodePrintInfo(const Node node);

/** */
AcResult acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config);

/** */
AcResult acNodeAutoOptimize(const Node node);

/** */
AcResult acNodeSynchronizeStream(const Node node, const Stream stream);

/** Deprecated ? */
AcResult acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
                                       const VertexBufferHandle vtxbuf_handle); // Not in Device

/** */
AcResult acNodeSynchronizeMesh(const Node node, const Stream stream); // Not in Device

/** */
AcResult acNodeSwapBuffers(const Node node);

/** */
AcResult acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
                            const AcReal value);

/** Deprecated ? Might be useful though if the user wants to load only one vtxbuf. But in this case
 * the user should supply an AcReal* instead of vtxbuf_handle */
AcResult acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream,
                                          const AcMesh host_mesh,
                                          const VertexBufferHandle vtxbuf_handle, const int3 src,
                                          const int3 dst, const int num_vertices);

/** */
AcResult acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
                                  const int3 src, const int3 dst, const int num_vertices);

/** Deprecated ? */
AcResult acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
                                const VertexBufferHandle vtxbuf_handle);

/** */
AcResult acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh);

/** Deprecated ? */
AcResult acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
                                           const VertexBufferHandle vtxbuf_handle, const int3 src,
                                           const int3 dst, const int num_vertices,
                                           AcMesh* host_mesh);

/** */
AcResult acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src,
                                   const int3 dst, const int num_vertices, AcMesh* host_mesh);

/** Deprecated ? */
AcResult acNodeStoreVertexBuffer(const Node node, const Stream stream,
                                 const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh);

/** */
AcResult acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh);

/** */
AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
                                const int3 start, const int3 end, const AcReal dt);

/** */
AcResult acNodeIntegrate(const Node node, const AcReal dt);

/** */
AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream,
                                     const VertexBufferHandle vtxbuf_handle);

/** */
AcResult acNodePeriodicBoundconds(const Node node, const Stream stream);

/** */
AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult acNodeReduceVec(const Node node, const Stream stream_type, const ReductionType rtype,
                         const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                         const VertexBufferHandle vtxbuf2, AcReal* result);

#ifdef __cplusplus
} // extern "C"
#endif
@@ -1,51 +0,0 @@
#!/bin/bash

# Run this in your build directory (cd build && ../scripts/auto_optimize.sh)
# Generates a ${BENCHMARK_FILE} which contains the threadblock dims and other
# constants used in the integration in addition to the time used.

MAX_THREADS=1024 # Max size of the thread block, depends on hardware

BENCHMARK_FILE="benchmark.out"
TBCONFCREATOR_SRC_PATH="../scripts/gen_rk3_threadblockconf.c"
TBCONFFILE_DST_PATH="../src/core/kernels"

C_COMPILER_NAME="gcc"

rm ${BENCHMARK_FILE}

for (( tz=2; tz<=8; tz*=2))
do
for (( ty=1; ty<=1; ty+=1))
do
for (( tx=16; tx<=64; tx*=2))
do

if ( (${tx}*${ty}*${tz}) > ${MAX_THREADS})
then break
fi

for (( launch_bound=1; launch_bound<=8; launch_bound*=2))
do
for (( elems_per_thread=1; elems_per_thread<=128; elems_per_thread*=2))
do
# Generate the threadblock configuration
${C_COMPILER_NAME} ${TBCONFCREATOR_SRC_PATH} -o gen_rk3_threadblockconf
./gen_rk3_threadblockconf ${tx} ${ty} ${tz} ${elems_per_thread} ${launch_bound}
rm gen_rk3_threadblockconf
mv rk3_threadblock.conf ${TBCONFFILE_DST_PATH}

# Compile and run the test build
cmake -DBUILD_DEBUG=OFF -DDOUBLE_PRECISION=OFF -DAUTO_OPTIMIZE=ON .. && make -j
#if ./ac_run -t; then
#    echo Success
./ac_run -b
#else
#    echo fail!
#fi
done
done
done
done
done
@@ -5,10 +5,13 @@
# source ./sourceme.sh
# autotest.sh
#
# If you need slurm or to pass something before ./ac_run, set the variable
# If you need slurm or to pass something before ./ac_run, export the variable
# SRUN_COMMAND before calling this script.
#
# F.ex. on Taito SRUN_COMMAND="srun --gres=gpu:k80:4 --mem=24000 -t 00:14:59 -p gputest --cpus-per-task 1 -n 1"
# export SRUN_COMMAND
# autotest.sh
echo "SRUN_COMMAND: " ${SRUN_COMMAND}

results=()
@@ -3,7 +3,8 @@
########################################

## Find packages
find_package(CUDA 9 REQUIRED)
find_package(CUDA REQUIRED)
# find_package(OpenMP REQUIRED)

## Architecture and optimization flags
set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
@@ -12,7 +13,8 @@ set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37
                    -gencode arch=compute_61,code=sm_61
                    -lineinfo
                    -ftz=true # Flush denormalized floats to zero
                    -std=c++11)
                    -std=c++11
                    )# --compiler-options ${OpenMP_CXX_FLAGS})
                    #--maxrregcount=255
                    # -Xptxas -dlcm=ca opt-in to cache all global loads to L1/texture cache
                    #         =cg to opt out
@@ -32,6 +34,6 @@ else ()
endif ()

## Create and link the library
include_directories(.)
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu)
cuda_add_library(astaroth_core STATIC astaroth.cu device.cu node.cu)
target_include_directories(astaroth_core PRIVATE .)
target_link_libraries(astaroth_core m)
@@ -16,119 +16,8 @@
    You should have received a copy of the GNU General Public License
    along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Multi-GPU implementation.
 *
%JP: The old way for computing boundary conditions conflicts with the
way we have to do things with multiple GPUs.

The older approach relied on unified memory, which represented the whole
memory area as one huge mesh instead of several smaller ones. However, unified memory
in its current state is more meant for quick prototyping when performance is not an issue.
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
than when managing the memory explicitly.

In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown:
    New struct: Grid. There are two global variables, "grid" and "subgrid", which
    contain the extents of the whole simulation domain and the decomposed grids,
    respectively. To simplify things, we require that each GPU is assigned the same amount of
    work, therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to
    work with.

    The whole simulation domain is decomposed with respect to the z dimension.
    For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
    contain (nx, ny, nz / num_devices) vertices.

    A local index (i, j, k) in some subgrid can be mapped to the global grid with
        global idx = (i, j, k + device_id * subgrid.n.z)

Terminology:
    - Single-GPU function: a function defined on the single-GPU layer (device.cu)

Changes required to this commented code block:
    - The thread block dimensions (tpb) are no longer passed to the kernel here but in
      device.cu instead. Same holds for any complex index calculations. Instead, the local
      coordinates should be passed as an int3 type without having to consider how the data is
      actually laid out in device memory
    - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
      handle of type "Device" which should be passed to single-GPU functions. In this file, all
      devices are stored in a global array "devices[num_devices]".
    - Every single-GPU function is executed asynchronously by default such that we
      can optimize Astaroth by executing memory transactions concurrently with
      computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
      Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
      as a parameter and commands executing in different streams can be processed
      in parallel/concurrently.

Note on periodic boundaries (might be helpful when implementing other boundary conditions):

With multiple GPUs, periodic boundary conditions applied on indices ranging from

    (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)

on a single device are "local", in the sense that they can be computed without
having to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use
here, we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
    device_id
    (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)

### Throughout this file we use the following notation and names for various index offsets

Global coordinates: coordinates with respect to the global grid (static Grid grid)
Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid)

    s0, s1: source indices in global coordinates
    d0, d1: destination indices in global coordinates
    da = max(s0, d0);
    db = min(s1, d1);

These are used in at least
    acLoad()
    acStore()
    acSynchronizeHalos()

Here we decompose the host mesh and distribute it among the GPUs in
the node.

The host mesh is a huge contiguous block of data. Its dimensions are given by
the global variable named "grid". A "grid" is decomposed into "subgrids",
one for each GPU. Here we check which parts of the range s0...s1 map
to the memory space stored by some GPU, ranging d0...d1, and transfer
the data if needed.

The index mapping is inherently quite involved, but here's a picture which
hopefully helps make sense out of all this.

Grid
                          |----num_vertices---|
xxx|....................................................|xxx
   ^              ^       ^                   ^
   d0             d1      s0 (src)            s1

Subgrid

xxx|.............|xxx
   ^             ^
   d0            d1

   ^             ^
   db            da
 *
 */
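
A standalone C sketch of the local-to-global mapping described in the comment above (z-split into equally sized subgrids; values are illustrative):

#include <assert.h>

typedef struct { int x, y, z; } Idx3; /* stand-in for int3 in plain C */

/* global idx = (i, j, k + device_id * subgrid.n.z) */
static Idx3
local_to_global(const Idx3 local, const int device_id, const int subgrid_nz)
{
    return (Idx3){local.x, local.y, local.z + device_id * subgrid_nz};
}

int
main(void)
{
    /* nz = 128 over 4 devices -> subgrid.n.z = 32: local (7, 9, 5) on device 2
     * maps to global (7, 9, 5 + 2*32) = (7, 9, 69). */
    const Idx3 g = local_to_global((Idx3){7, 9, 5}, 2, 32);
    assert(g.z == 69);
    return 0;
}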
// #include "astaroth_defines.h"
#include "astaroth.h"
#include "errchk.h"

#include "device.cuh"
#include "math_utils.h" // sum for reductions
// #include "standalone/config_loader.h" // update_config

#define AC_GEN_STR(X) #X
const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
@@ -142,532 +31,80 @@ const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) /
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
#undef AC_GEN_STR

static const int MAX_NUM_DEVICES       = 32;
static int num_devices                 = 0;
static Device devices[MAX_NUM_DEVICES] = {};

static Grid grid; // A grid consists of num_devices subgrids
static Grid subgrid;

static int
gridIdx(const Grid grid, const int3 idx)
{
    return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
}

static int3
gridIdx3d(const Grid grid, const int idx)
{
    return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
                  idx / (grid.m.x * grid.m.y)};
}
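
gridIdx and gridIdx3d above are inverses of each other for 0 <= idx < m.x*m.y*m.z. A quick standalone round-trip check (Grid reduced to just its `m` extents for the sketch):

#include <assert.h>

typedef struct { int x, y, z; } Idx3;
typedef struct { Idx3 m; } GridM;

static int  grid_idx(const GridM g, const Idx3 i) { return i.x + i.y * g.m.x + i.z * g.m.x * g.m.y; }
static Idx3 grid_idx3d(const GridM g, const int idx)
{
    return (Idx3){idx % g.m.x, (idx % (g.m.x * g.m.y)) / g.m.x, idx / (g.m.x * g.m.y)};
}

int
main(void)
{
    const GridM g = {{8, 4, 2}};
    for (int idx = 0; idx < 8 * 4 * 2; ++idx)
        assert(grid_idx(g, grid_idx3d(g, idx)) == idx); /* round trip holds */
    return 0;
}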
static void
printInt3(const int3 vec)
{
    printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
}

static inline void
print(const AcMeshInfo config)
{
    for (int i = 0; i < NUM_INT_PARAMS; ++i)
        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
}

static void
update_builtin_params(AcMeshInfo* config)
{
    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
    ///////////// PAD TEST
    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
    ///////////// PAD TEST
    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;

    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
    config->int_params[AC_nx_min] = NGHOST;
    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
    config->int_params[AC_ny_min] = NGHOST;
    config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
    config->int_params[AC_nz_min] = NGHOST;
    config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;

    /* Additional helper params */
    // Int helpers
    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];

#if VERBOSE_PRINTING // Defined in astaroth.h
    printf("###############################################################\n");
    printf("Config dimensions recalculated:\n");
    print(*config);
    printf("###############################################################\n");
#endif
}

static Grid
createGrid(const AcMeshInfo config)
{
    Grid grid;

    grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
    grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};

    return grid;
}
static const int num_nodes = 1;
static Node nodes[num_nodes];

AcResult
acCheckDeviceAvailability(void)
acInit(const AcMeshInfo mesh_info)
{
    int device_count; // Separate from num_devices to avoid side effects
    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&device_count));
    if (device_count > 0)
        return AC_SUCCESS;
    else
        return AC_FAILURE;
}

AcResult
acSynchronizeStream(const StreamType stream)
{
    // #pragma omp parallel for
    for (int i = 0; i < num_devices; ++i) {
        synchronize(devices[i], stream);
    }

    return AC_SUCCESS;
}

static AcResult
synchronize_halos(const StreamType stream)
{
    // Exchanges the halos of subgrids
    // After this step, the data within the main grid ranging from
    // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z
    // has been synchronized and transferred to appropriate subgrids

    // We loop only to num_devices - 1 since the front and back plate of the grid is not
    // transferred because their contents depend on the boundary conditions.

    // IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
    // I.e. the halos of subgrids must contain up-to-date data!
    // #pragma omp parallel for
    for (int i = 0; i < num_devices - 1; ++i) {
        const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
        // ...|ooooxxx|... -> xxx|ooooooo|...
        {
            const int3 src = (int3){0, 0, subgrid.n.z};
            const int3 dst = (int3){0, 0, 0};
            copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst,
                                   num_vertices);
        }
        // ...|ooooooo|xxx <- ...|xxxoooo|...
        {
            const int3 src = (int3){0, 0, NGHOST};
            const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
            copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst,
                                   num_vertices);
        }
    }
    return AC_SUCCESS;
}
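
A standalone sketch of the plate exchange performed by synchronize_halos above, with illustrative values NGHOST = 3 and subgrid.n.z = 32. Each neighboring pair trades subgrid.m.x * subgrid.m.y * NGHOST vertices; the wrap-around pair is deliberately left out, as the comments above explain.

#include <stdio.h>

int
main(void)
{
    const int nghost = 3, subgrid_nz = 32, num_devices = 4;
    for (int i = 0; i < num_devices - 1; ++i) {
        /* forward:  (0,0,subgrid.n.z) on device i -> (0,0,0) on device i+1 */
        printf("dev %d z=%d -> dev %d z=0\n", i, subgrid_nz, i + 1);
        /* backward: (0,0,NGHOST) on device i+1 -> (0,0,NGHOST+subgrid.n.z) on device i */
        printf("dev %d z=%d -> dev %d z=%d\n", i + 1, nghost, i, nghost + subgrid_nz);
    }
    /* The remaining pair (num_devices-1 <-> 0) is handled by the
     * boundary-condition step (see global_boundcondstep below). */
    return 0;
}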
AcResult
acSynchronizeMesh(void)
{
    acSynchronizeStream(STREAM_ALL);
    synchronize_halos(STREAM_DEFAULT);
    acSynchronizeStream(STREAM_ALL);

    return AC_SUCCESS;
}

AcResult
acInit(const AcMeshInfo config)
{
    // Get num_devices
    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices));
    if (num_devices < 1) {
        ERROR("No CUDA devices found!");
        return AC_FAILURE;
    }
    if (num_devices > MAX_NUM_DEVICES) {
        WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
        num_devices = MAX_NUM_DEVICES;
    }
    if (!AC_MULTIGPU_ENABLED) {
        WARNING("MULTIGPU_ENABLED was false. Using only one device");
        num_devices = 1; // Use only one device if multi-GPU is not enabled
    }
    // Check that AC_nz is divisible by num_devices. This makes decomposing the
    // problem domain to multiple GPUs much easier since we do not have to worry
    // about remainders
    ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);

    // Decompose the problem domain
    // The main grid
    grid = createGrid(config);

    // Subgrids
    AcMeshInfo subgrid_config = config;
    subgrid_config.int_params[AC_nz] /= num_devices;
    update_builtin_params(&subgrid_config);
    subgrid = createGrid(subgrid_config);

    // Periodic boundary conditions become weird if the system can "fold onto itself".
    ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
    ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
    ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);

#if VERBOSE_PRINTING
    // clang-format off
    printf("Grid m ");    printInt3(grid.m);    printf("\n");
    printf("Grid n ");    printInt3(grid.n);    printf("\n");
    printf("Subgrid m "); printInt3(subgrid.m); printf("\n");
    printf("Subgrid n "); printInt3(subgrid.n); printf("\n");
    // clang-format on
#endif

    // Initialize the devices
    for (int i = 0; i < num_devices; ++i) {
        createDevice(i, subgrid_config, &devices[i]);
        loadGlobalGrid(devices[i], grid);
        printDeviceInfo(devices[i]);
    }

    // Enable peer access
    for (int i = 0; i < num_devices; ++i) {
        const int front = (i + 1) % num_devices;
        const int back  = (i - 1 + num_devices) % num_devices;

        int can_access_front, can_access_back;
        cudaDeviceCanAccessPeer(&can_access_front, i, front);
        cudaDeviceCanAccessPeer(&can_access_back, i, back);
#if VERBOSE_PRINTING
        printf(
            "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
            i, front, can_access_front, back, can_access_back);
#endif

        cudaSetDevice(i);
        if (can_access_front) {
            ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
        }
        if (can_access_back) {
            ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
        }
    }

    acSynchronizeStream(STREAM_ALL);
    return AC_SUCCESS;
    return acNodeCreate(0, mesh_info, &nodes[0]);
}

AcResult
acQuit(void)
{
    acSynchronizeStream(STREAM_ALL);

    for (int i = 0; i < num_devices; ++i) {
        destroyDevice(devices[i]);
    }
    return AC_SUCCESS;
    return acNodeDestroy(nodes[0]);
}
AcResult
acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start,
                               const int3 end, const StreamType stream)
acSynchronizeStream(const Stream stream)
{
    // See the beginning of the file for an explanation of the index mapping
    // #pragma omp parallel for
    for (int i = 0; i < num_devices; ++i) {
        // DECOMPOSITION OFFSET HERE
        const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
        const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};

        const int3 da = max(start, d0);
        const int3 db = min(end, d1);

        if (db.z >= da.z) {
            const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
            const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
            rkStep(devices[i], stream, isubstep, da_local, db_local, dt);
        }
    }
    return AC_SUCCESS;
    return acNodeSynchronizeStream(nodes[0], stream);
}

AcResult
acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, const int3 end)
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
    return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT);
    return acNodeLoadConstant(nodes[0], STREAM_DEFAULT, param, value);
}

AcResult
acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream)
acLoad(const AcMesh host_mesh)
{
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
    const int3 end   = start + grid.n;
    return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream);
    return acNodeLoadMesh(nodes[0], STREAM_DEFAULT, host_mesh);
}

AcResult
acIntegrateStep(const int isubstep, const AcReal dt)
acStore(AcMesh* host_mesh)
{
    return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT);
}

static AcResult
local_boundcondstep(const StreamType stream)
{
    if (num_devices == 1) {
        boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m);
    }
    else {
        // Local boundary conditions
        // #pragma omp parallel for
        for (int i = 0; i < num_devices; ++i) {
            const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
            const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
            boundcondStep(devices[i], stream, d0, d1);
        }
    }
    return AC_SUCCESS;
}

static AcResult
global_boundcondstep(const StreamType stream)
{
    if (num_devices > 1) {
        // With periodic boundary conditions we exchange the front and back plates of the
        // grid. The exchange is done between the first and last device (0 and num_devices - 1).
        const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
        // ...|ooooxxx|... -> xxx|ooooooo|...
        {
            const int3 src = (int3){0, 0, subgrid.n.z};
            const int3 dst = (int3){0, 0, 0};
            copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst,
                                   num_vertices);
        }
        // ...|ooooooo|xxx <- ...|xxxoooo|...
        {
            const int3 src = (int3){0, 0, NGHOST};
            const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
            copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst,
                                   num_vertices);
        }
    }
    return AC_SUCCESS;
}

AcResult
acBoundcondStepAsync(const StreamType stream)
{
    ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES);

    local_boundcondstep(stream);
    acSynchronizeStream(stream);
    global_boundcondstep(stream);
    synchronize_halos(stream);
    acSynchronizeStream(stream);
    return AC_SUCCESS;
}

AcResult
acBoundcondStep(void)
{
    return acBoundcondStepAsync(STREAM_DEFAULT);
}

static AcResult
swap_buffers(void)
{
    // #pragma omp parallel for
    for (int i = 0; i < num_devices; ++i) {
        swapBuffers(devices[i]);
    }
    return AC_SUCCESS;
    return acNodeStoreMesh(nodes[0], STREAM_DEFAULT, host_mesh);
}

AcResult
acIntegrate(const AcReal dt)
{
    acSynchronizeStream(STREAM_ALL);
    for (int isubstep = 0; isubstep < 3; ++isubstep) {
        acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
        swap_buffers();
        acBoundcondStep();
    }
    return AC_SUCCESS;
    /*
    acNodeIntegrate(nodes[0], dt);
    return acBoundcondStep();
    */
    return acNodeIntegrate(nodes[0], dt);
}
static AcReal
simple_final_reduce_scal(const ReductionType rtype, const AcReal* results, const int n)
AcResult
acBoundcondStep(void)
{
    AcReal res = results[0];
    for (int i = 1; i < n; ++i) {
        if (rtype == RTYPE_MAX) {
            res = max(res, results[i]);
        }
        else if (rtype == RTYPE_MIN) {
            res = min(res, results[i]);
        }
        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
            res = sum(res, results[i]);
        }
        else {
            ERROR("Invalid rtype");
        }
    }

    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
        const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
        res = sqrt(inv_n * res);
    }
    return res;
    return acNodePeriodicBoundconds(nodes[0], STREAM_DEFAULT);
}
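
A standalone sketch of how the per-device partial results above combine: MAX/MIN keep a running extremum, SUM and the RMS variants accumulate, and the RMS variants divide by the global vertex count and take the square root only once, at the very end.

#include <assert.h>
#include <math.h>

static double
final_rms(const double* partial_sums_of_squares, int ndevices, long n_total)
{
    double s = 0;
    for (int i = 0; i < ndevices; ++i)
        s += partial_sums_of_squares[i]; /* the SUM branch above */
    return sqrt(s / (double)n_total);    /* the RTYPE_RMS finalization */
}

int
main(void)
{
    const double parts[2] = {10.0, 6.0};                 /* sums of squares from two devices */
    assert(fabs(final_rms(parts, 2, 4) - 2.0) < 1e-12);  /* sqrt(16/4) == 2 */
    return 0;
}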
AcReal
|
||||
acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuffer_handle)
|
||||
acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuf_handle)
|
||||
{
|
||||
acSynchronizeStream(STREAM_ALL);
|
||||
|
||||
AcReal results[num_devices];
|
||||
// #pragma omp parallel for
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]);
|
||||
}
|
||||
|
||||
return simple_final_reduce_scal(rtype, results, num_devices);
|
||||
AcReal result;
|
||||
acNodeReduceScal(nodes[0], STREAM_DEFAULT, rtype, vtxbuf_handle, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
AcReal
|
||||
acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b,
|
||||
const VertexBufferHandle c)
|
||||
{
|
||||
acSynchronizeStream(STREAM_ALL);
|
||||
|
||||
AcReal results[num_devices];
|
||||
// #pragma omp parallel for
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]);
|
||||
}
|
||||
|
||||
return simple_final_reduce_scal(rtype, results, num_devices);
|
||||
AcReal result;
|
||||
acNodeReduceVec(nodes[0], STREAM_DEFAULT, rtype, a, b, c, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 src, const int num_vertices,
|
||||
const StreamType stream)
|
||||
acStoreWithOffset(const int3 dst, const size_t num_vertices, AcMesh* host_mesh)
|
||||
{
|
||||
// See the beginning of the file for an explanation of the index mapping
|
||||
// #pragma omp parallel for
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
|
||||
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
|
||||
|
        const int3 s0 = src;
        const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);

        const int3 da = max(s0, d0);
        const int3 db = min(s1, d1);
        /*
        printf("Device %d\n", i);
        printf("\ts0: "); printInt3(s0); printf("\n");
        printf("\td0: "); printInt3(d0); printf("\n");
        printf("\tda: "); printInt3(da); printf("\n");
        printf("\tdb: "); printInt3(db); printf("\n");
        printf("\td1: "); printInt3(d1); printf("\n");
        printf("\ts1: "); printInt3(s1); printf("\n");
        printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
        */
        if (db.z >= da.z) {
            const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
            // DECOMPOSITION OFFSET HERE
            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
            // printf("\n");
            copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells);
        }
        // printf("\n");
    }
    return AC_SUCCESS;
}

AcResult
acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices)
{
    return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT);
}

AcResult
acLoad(const AcMesh host_mesh)
{
    acLoadWithOffset(host_mesh, (int3){0, 0, 0}, acVertexBufferSize(host_mesh.info));
    acSynchronizeStream(STREAM_ALL);
    return AC_SUCCESS;
}

AcResult
acStoreWithOffsetAsync(const int3 src, const int num_vertices, AcMesh* host_mesh,
                       const StreamType stream)
{
    // See the beginning of the file for an explanation of the index mapping
    // #pragma omp parallel for
    for (int i = 0; i < num_devices; ++i) {
        const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
        const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};

        const int3 s0 = src;
        const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);

        const int3 da = max(s0, d0);
        const int3 db = min(s1, d1);
        if (db.z >= da.z) {
            const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
            // DECOMPOSITION OFFSET HERE
            const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
            copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh);
        }
    }
    return AC_SUCCESS;
}

AcResult
acStoreWithOffset(const int3 src, const int num_vertices, AcMesh* host_mesh)
{
    return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT);
}

AcResult
acStore(AcMesh* host_mesh)
{
    acStoreWithOffset((int3){0, 0, 0}, acVertexBufferSize(host_mesh->info), host_mesh);
    acSynchronizeStream(STREAM_ALL);
    return AC_SUCCESS;
}

AcResult
acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream)
{
    // #pragma omp parallel for
    for (int i = 0; i < num_devices; ++i) {
        loadDeviceConstant(devices[i], stream, param, value);
    }
    return AC_SUCCESS;
}

AcResult
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
    return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT);
}
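
// For illustration: a minimal sketch of how a host application might drive the
// legacy multi-GPU interface above. AC_dt and the mesh setup are assumed names
// for this sketch, not part of this commit.
#if 0
AcMesh mesh = /* host buffers sized by acVertexBufferSize(info) */;
acLoad(mesh);                              // decompose and upload to all devices
acLoadDeviceConstant(AC_dt, AcReal(1e-3)); // broadcast a scalar to every device
/* ... integrate ... */
acStore(&mesh);                            // gather the subgrids back to the host
#endif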

/*
 * =============================================================================
 * Revised interface
 * =============================================================================
 */
@@ -24,7 +24,7 @@
 * Detailed info.
 *
 */
#include "device.cuh"
#include "astaroth_device.h"

#include "errchk.h"

@@ -40,19 +40,19 @@ typedef struct {
} VertexBufferArray;

__constant__ AcMeshInfo d_mesh_info;
__constant__ int3 d_multigpu_offset;
__constant__ Grid globalGrid;
#define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_INT3(X) (d_mesh_info.int3_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
#define globalGridN (d_mesh_info.int3_params[AC_global_grid_n])
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
#include "kernels/boundconds.cuh"
#include "kernels/integration.cuh"
#include "kernels/reductions.cuh"

static dim3 rk3_tpb = (dim3){32, 1, 4};
static dim3 rk3_tpb(32, 1, 4);

#if PACKED_DATA_TRANSFERS // Defined in device.cuh
// #include "kernels/pack_unpack.cuh"
@@ -76,8 +76,96 @@ struct device_s {
#endif
};

// clang-format off
static __global__ void dummy_kernel(void) {}
// clang-format on

AcResult
printDeviceInfo(const Device device)
acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
{
    cudaSetDevice(id);
    cudaDeviceReset();

    // Create Device
    struct device_s* device = (struct device_s*)malloc(sizeof(*device));
    ERRCHK_ALWAYS(device);

    device->id           = id;
    device->local_config = device_config;

    // Check that the code was compiled for the proper GPU architecture
    printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
           "device supports the CUDA architecture you are compiling for.\n"
           "Running dummy kernel... ");
    fflush(stdout);
    dummy_kernel<<<1, 1>>>();
    ERRCHK_CUDA_KERNEL_ALWAYS();
    printf("Success!\n");

    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
        cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
    }

    // Memory
    const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
    }
    ERRCHK_CUDA_ALWAYS(
        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));

#if PACKED_DATA_TRANSFERS
    // Allocate data required for packed transfers here (cudaMalloc)
#endif

    // Device constants
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
                                          cudaMemcpyHostToDevice));

    printf("Created device %d (%p)\n", device->id, device);
    *device_handle = device;

    // Auto-optimize
    if (id == 0) {
        acDeviceAutoOptimize(device);
    }

    return AC_SUCCESS;
}

AcResult
acDeviceDestroy(Device device)
{
    cudaSetDevice(device->id);
    printf("Destroying device %d (%p)\n", device->id, device);

    // Memory
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        cudaFree(device->vba.in[i]);
        cudaFree(device->vba.out[i]);
    }
    cudaFree(device->reduce_scratchpad);
    cudaFree(device->reduce_result);

#if PACKED_DATA_TRANSFERS
    // Free data required for packed transfers here (cudaFree)
#endif

    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
        cudaStreamDestroy(device->streams[i]);
    }

    // Destroy Device
    free(device);
    return AC_SUCCESS;
}
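
// For illustration: a minimal usage sketch of the revised per-device API above.
// Error handling via the returned AcResult values is omitted; the AcMeshInfo
// setup (info) is assumed to happen elsewhere.
#if 0
Device device;
acDeviceCreate(0, info, &device); // allocates vertex buffers, streams, constants on GPU 0
acDevicePrintInfo(device);
/* ... work ... */
acDeviceDestroy(device);          // frees all per-device resources
#endif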

AcResult
acDevicePrintInfo(const Device device)
{
    const int device_id = device->id;

@@ -136,305 +224,8 @@ printDeviceInfo(const Device device)
    return AC_SUCCESS;
}

static __global__ void
dummy_kernel(void)
{
}

AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
{
    cudaSetDevice(id);
    cudaDeviceReset();

    // Create Device
    struct device_s* device = (struct device_s*)malloc(sizeof(*device));
    ERRCHK_ALWAYS(device);

    device->id           = id;
    device->local_config = device_config;

    // Check that the code was compiled for the proper GPU architecture
    printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
           "device supports the CUDA architecture you are compiling for.\n"
           "Running dummy kernel... ");
    fflush(stdout);
    dummy_kernel<<<1, 1>>>();
    ERRCHK_CUDA_KERNEL_ALWAYS();
    printf("Success!\n");

    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
        cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
    }

    // Memory
    const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
    }
    ERRCHK_CUDA_ALWAYS(
        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));

#if PACKED_DATA_TRANSFERS
    // Allocate data required for packed transfers here (cudaMalloc)
#endif

    // Device constants
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
                                          cudaMemcpyHostToDevice));

    // Multi-GPU offset. This is used to compute globalVertexIdx.
    // Might be better to calculate this in astaroth.cu instead of here, s.t.
    // everything related to the decomposition is limited to the multi-GPU layer
    const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]};
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
                                          sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));

    printf("Created device %d (%p)\n", device->id, device);
    *device_handle = device;

    // Auto-optimize
    if (id == 0)
        autoOptimize(device);

    return AC_SUCCESS;
}

AcResult
destroyDevice(Device device)
{
    cudaSetDevice(device->id);
    printf("Destroying device %d (%p)\n", device->id, device);

    // Memory
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        cudaFree(device->vba.in[i]);
        cudaFree(device->vba.out[i]);
    }
    cudaFree(device->reduce_scratchpad);
    cudaFree(device->reduce_result);

#if PACKED_DATA_TRANSFERS
    // Free data required for packed transfers here (cudaFree)
#endif

    // Concurrency
    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
        cudaStreamDestroy(device->streams[i]);
    }

    // Destroy Device
    free(device);
    return AC_SUCCESS;
}

AcResult
boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end)
{
    cudaSetDevice(device->id);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]);
    }
    return AC_SUCCESS;
}

AcResult
reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
           const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
    cudaSetDevice(device->id);

    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
                              device->local_config.int_params[AC_ny_min],
                              device->local_config.int_params[AC_nz_min]};

    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
                            device->local_config.int_params[AC_ny_max],
                            device->local_config.int_params[AC_nz_max]};

    *result = reduce_scal(device->streams[stream_type], rtype, start, end,
                          device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
                          device->reduce_result);
    return AC_SUCCESS;
}

AcResult
reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
          const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
          const VertexBufferHandle vtxbuf2, AcReal* result)
{
    cudaSetDevice(device->id);

    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
                              device->local_config.int_params[AC_ny_min],
                              device->local_config.int_params[AC_nz_min]};

    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
                            device->local_config.int_params[AC_ny_max],
                            device->local_config.int_params[AC_nz_max]};

    *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
                         device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
                         device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;
}

AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
       const int3& end, const AcReal dt)
{
    cudaSetDevice(device->id);

    // const dim3 tpb(32, 1, 4);
    const dim3 tpb = rk3_tpb;

    const int3 n = end - start;
    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                   (unsigned int)ceil(n.z / AcReal(tpb.z)));

    if (step_number == 0)
        solve<0><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
    else if (step_number == 1)
        solve<1><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
    else
        solve<2><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);

    ERRCHK_CUDA_KERNEL();

    return AC_SUCCESS;
}

AcResult
synchronize(const Device device, const StreamType stream_type)
{
    cudaSetDevice(device->id);
    if (stream_type == STREAM_ALL) {
        cudaDeviceSynchronize();
    }
    else {
        cudaStreamSynchronize(device->streams[stream_type]);
    }
    return AC_SUCCESS;
}

static AcResult
loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
               const size_t bytes, AcReal* dst)
{
    cudaSetDevice(device->id);
    ERRCHK_CUDA(
        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
    return AC_SUCCESS;
}

static AcResult
storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
                const size_t bytes, AcReal* dst)
{
    cudaSetDevice(device->id);
    ERRCHK_CUDA(
        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
    return AC_SUCCESS;
}

AcResult
copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
                 const int3& src, const int3& dst, const int num_vertices)
{
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
                       num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
    }
    return AC_SUCCESS;
}

AcResult
copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
               const int num_vertices, AcMesh* host_mesh)
{
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
                        num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
    }
    return AC_SUCCESS;
}

AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
                       Device dst_device, const int3& dst, const int num_vertices)
{
    cudaSetDevice(src_device->id);
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);

    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id,
                                        &src_device->vba.in[i][src_idx], src_device->id,
                                        sizeof(src_device->vba.in[i][0]) * num_vertices,
                                        src_device->streams[stream_type]));
    }
    return AC_SUCCESS;
}

AcResult
swapBuffers(const Device device)
{
    cudaSetDevice(device->id);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        AcReal* tmp        = device->vba.in[i];
        device->vba.in[i]  = device->vba.out[i];
        device->vba.out[i] = tmp;
    }
    return AC_SUCCESS;
}

AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcIntParam param,
                   const int value)
{
    cudaSetDevice(device->id);
    // CUDA 10 apparently creates only a single name for a device constant (d_mesh_info here)
    // and something like d_mesh_info.real_params[] cannot be directly accessed.
    // Therefore we have to obfuscate the code a bit and compute the offset address before
    // invoking cudaMemcpyToSymbol.
    const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                               cudaMemcpyHostToDevice,
                                               device->streams[stream_type]));
    return AC_SUCCESS;
}
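
// The offset trick above generalizes to any member of d_mesh_info, since the
// host-side struct and the device symbol share the same layout. A sketch for an
// int3 parameter (AcInt3Param is an assumed enum name, mirroring AcIntParam):
#if 0
static AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcInt3Param param,
                   const int3 value)
{
    cudaSetDevice(device->id);
    // Measure the member offset on the host mirror of the struct
    const size_t offset = (size_t)&d_mesh_info.int3_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                               cudaMemcpyHostToDevice,
                                               device->streams[stream_type]));
    return AC_SUCCESS;
}
#endif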

AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcRealParam param,
                   const AcReal value)
{
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                               cudaMemcpyHostToDevice,
                                               device->streams[stream_type]));
    return AC_SUCCESS;
}

AcResult
loadGlobalGrid(const Device device, const Grid grid)
{
    cudaSetDevice(device->id);
    ERRCHK_CUDA_ALWAYS(
        cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
    return AC_SUCCESS;
}

AcResult
autoOptimize(const Device device)
acDeviceAutoOptimize(const Device device)
{
    cudaSetDevice(device->id);

@@ -513,12 +304,303 @@ autoOptimize(const Device device)
    return AC_SUCCESS;
}

AcResult
acDeviceSynchronizeStream(const Device device, const Stream stream)
{
    cudaSetDevice(device->id);
    if (stream == STREAM_ALL) {
        cudaDeviceSynchronize();
    }
    else {
        cudaStreamSynchronize(device->streams[stream]);
    }
    return AC_SUCCESS;
}

AcResult
acDeviceSwapBuffers(const Device device)
{
    cudaSetDevice(device->id);
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        AcReal* tmp        = device->vba.in[i];
        device->vba.in[i]  = device->vba.out[i];
        device->vba.out[i] = tmp;
    }
    return AC_SUCCESS;
}

AcResult
acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
                     const AcReal value)
{
    cudaSetDevice(device->id);
    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
                                        cudaMemcpyHostToDevice, device->streams[stream]));
    return AC_SUCCESS;
}

AcResult
acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
                                   const VertexBufferHandle vtxbuf_handle, const int3 src,
                                   const int3 dst, const int num_vertices)
{
    cudaSetDevice(device->id);
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);

    const AcReal* src_ptr = &host_mesh.vertex_buffer[vtxbuf_handle][src_idx];
    AcReal* dst_ptr       = &device->vba.in[vtxbuf_handle][dst_idx];
    const size_t bytes    = num_vertices * sizeof(src_ptr[0]);

    ERRCHK_CUDA( //
        cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyHostToDevice, device->streams[stream]) //
    );

    return AC_SUCCESS;
}

AcResult
acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMesh host_mesh,
                           const int3 src, const int3 dst, const int num_vertices)
{
    WARNING("This function is deprecated");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src,
                                           dst, num_vertices);
    }
    return AC_SUCCESS;
}

AcResult
acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
                         const VertexBufferHandle vtxbuf_handle)
{
    const int3 src            = (int3){0, 0, 0};
    const int3 dst            = src;
    const size_t num_vertices = acVertexBufferSize(device->local_config);
    acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst,
                                       num_vertices);

    return AC_SUCCESS;
}

AcResult
acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh)
{
    WARNING("This function is deprecated");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i);
    }

    return AC_SUCCESS;
}

AcResult
acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream,
                                    const VertexBufferHandle vtxbuf_handle, const int3 src,
                                    const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
    cudaSetDevice(device->id);
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);

    const AcReal* src_ptr = &device->vba.in[vtxbuf_handle][src_idx];
    AcReal* dst_ptr       = &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx];
    const size_t bytes    = num_vertices * sizeof(src_ptr[0]);

    ERRCHK_CUDA( //
        cudaMemcpyAsync(dst_ptr, src_ptr, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) //
    );

    return AC_SUCCESS;
}

AcResult
acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
                            const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
    WARNING("This function is deprecated");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst,
                                            num_vertices, host_mesh);
    }

    return AC_SUCCESS;
}

AcResult
acDeviceStoreVertexBuffer(const Device device, const Stream stream,
                          const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
{
    int3 src                  = (int3){0, 0, 0};
    int3 dst                  = src;
    const size_t num_vertices = acVertexBufferSize(device->local_config);

    acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices,
                                        host_mesh);

    return AC_SUCCESS;
}

AcResult
acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh)
{
    WARNING("This function is deprecated");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh);
    }

    return AC_SUCCESS;
}

AcResult
acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
                                       const VertexBufferHandle vtxbuf_handle, const int3 src,
                                       const int3 dst, const int num_vertices, Device dst_device)
{
    cudaSetDevice(src_device->id);
    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);

    const AcReal* src_ptr = &src_device->vba.in[vtxbuf_handle][src_idx];
    AcReal* dst_ptr       = &dst_device->vba.in[vtxbuf_handle][dst_idx];
    const size_t bytes    = num_vertices * sizeof(src_ptr[0]);

    ERRCHK_CUDA(cudaMemcpyPeerAsync(dst_ptr, dst_device->id, src_ptr, src_device->id, bytes,
                                    src_device->streams[stream]));
    return AC_SUCCESS;
}

AcResult
acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3 src,
                               const int3 dst, const int num_vertices, Device dst_device)
{
    WARNING("This function is deprecated");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceTransferVertexBufferWithOffset(src_device, stream, (VertexBufferHandle)i, src, dst,
                                               num_vertices, dst_device);
    }
    return AC_SUCCESS;
}

AcResult
acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
                             const VertexBufferHandle vtxbuf_handle, Device dst_device)
{
    int3 src                  = (int3){0, 0, 0};
    int3 dst                  = src;
    const size_t num_vertices = acVertexBufferSize(src_device->local_config);

    acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                           num_vertices, dst_device);
    return AC_SUCCESS;
}

AcResult
acDeviceTransferMesh(const Device src_device, const Stream stream, Device dst_device)
{
    WARNING("This function is deprecated");
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDeviceTransferVertexBuffer(src_device, stream, (VertexBufferHandle)i, dst_device);
    }
    return AC_SUCCESS;
}

AcResult
acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
                         const int3 start, const int3 end, const AcReal dt)
{
    cudaSetDevice(device->id);

    const dim3 tpb = rk3_tpb;

    const int3 n = end - start;
    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
                   (unsigned int)ceil(n.z / AcReal(tpb.z)));

    if (step_number == 0)
        solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
    else if (step_number == 1)
        solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
    else
        solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);

    ERRCHK_CUDA_KERNEL();

    return AC_SUCCESS;
}
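
// A full RK3 timestep runs all three substeps with boundary conditions and a
// buffer swap in between. A hedged sketch of a single-device driver loop
// (start/end/dt and the full-mesh extent m are assumed to be set by the caller):
#if 0
for (int substep = 0; substep < 3; ++substep) {
    acDevicePeriodicBoundconds(device, STREAM_DEFAULT, (int3){0, 0, 0}, m);
    acDeviceSynchronizeStream(device, STREAM_DEFAULT);
    acDeviceIntegrateSubstep(device, STREAM_DEFAULT, substep, start, end, dt);
    acDeviceSwapBuffers(device);
}
#endif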

AcResult
acDevicePeriodicBoundcondStep(const Device device, const Stream stream_type,
                              const VertexBufferHandle vtxbuf_handle, const int3 start,
                              const int3 end)
{
    cudaSetDevice(device->id);
    const cudaStream_t stream = device->streams[stream_type];

    const dim3 tpb(8, 2, 8);
    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));

    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, device->vba.in[vtxbuf_handle]);
    ERRCHK_CUDA_KERNEL();

    return AC_SUCCESS;
}

AcResult
acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
                           const int3 end)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acDevicePeriodicBoundcondStep(device, stream, (VertexBufferHandle)i, start, end);
    }
    return AC_SUCCESS;
}

AcResult
acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
                   const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
    cudaSetDevice(device->id);

    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
                              device->local_config.int_params[AC_ny_min],
                              device->local_config.int_params[AC_nz_min]};

    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
                            device->local_config.int_params[AC_ny_max],
                            device->local_config.int_params[AC_nz_max]};

    *result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
                          device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;
}

AcResult
acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype,
                  const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
                  const VertexBufferHandle vtxbuf2, AcReal* result)
{
    cudaSetDevice(device->id);

    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
                              device->local_config.int_params[AC_ny_min],
                              device->local_config.int_params[AC_nz_min]};

    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
                            device->local_config.int_params[AC_ny_max],
                            device->local_config.int_params[AC_nz_max]};

    *result = reduce_vec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
                         device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
                         device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;
}

#if PACKED_DATA_TRANSFERS
// Functions for calling packed data transfers
#endif

/*
 * =============================================================================
 * Revised interface
 * =============================================================================
 */
@@ -1,107 +0,0 @@
/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Brief info.
 *
 * Detailed info.
 *
 */
#pragma once
#include "astaroth.h"

typedef struct {
    int3 m;
    int3 n;
} Grid;

typedef struct device_s* Device; // Opaque pointer to device_s. Analogous to dispatchable handles
                                 // in Vulkan, e.g. VkDevice

/** */
AcResult printDeviceInfo(const Device device);

/** */
AcResult createDevice(const int id, const AcMeshInfo device_config, Device* device);

/** */
AcResult destroyDevice(Device device);

/** */
AcResult boundcondStep(const Device device, const StreamType stream_type, const int3& start,
                       const int3& end);

/** */
AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
                    const VertexBufferHandle vtxbuf_handle, AcReal* result);

/** */
AcResult reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
                   const VertexBufferHandle vec0, const VertexBufferHandle vec1,
                   const VertexBufferHandle vec2, AcReal* result);

/** */
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,
                const int3& start, const int3& end, const AcReal dt);

/** Synchronizes the device with respect to stream_type. If STREAM_ALL is given as
    a StreamType, the function synchronizes all streams on the device. */
AcResult synchronize(const Device device, const StreamType stream_type);

/** */
AcResult copyMeshToDevice(const Device device, const StreamType stream_type,
                          const AcMesh& host_mesh, const int3& src, const int3& dst,
                          const int num_vertices);

/** */
AcResult copyMeshToHost(const Device device, const StreamType stream_type, const int3& src,
                        const int3& dst, const int num_vertices, AcMesh* host_mesh);

/** */
AcResult copyMeshDeviceToDevice(const Device src, const StreamType stream_type, const int3& src_idx,
                                Device dst, const int3& dst_idx, const int num_vertices);

/** Swaps the input/output buffers used in computations */
AcResult swapBuffers(const Device device);

/** */
AcResult loadDeviceConstant(const Device device, const StreamType stream_type,
                            const AcIntParam param, const int value);

/** */
AcResult loadDeviceConstant(const Device device, const StreamType stream_type,
                            const AcRealParam param, const AcReal value);

/** */
AcResult loadGlobalGrid(const Device device, const Grid grid);

/** */
AcResult autoOptimize(const Device device);

// #define PACKED_DATA_TRANSFERS (1) %JP: placeholder for optimized ghost zone packing and transfers
#if PACKED_DATA_TRANSFERS
// Declarations used for packed data transfers
#endif

/*
 * =============================================================================
 * Revised interface
 * =============================================================================
 */
208  src/core/grid.cu  Normal file
@@ -0,0 +1,208 @@
/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
*/
#include "astaroth_grid.h"

#include "astaroth_node.h"

const size_t MAX_NUM_NODES = 32;
size_t num_nodes           = 0;
static Node nodes[MAX_NUM_NODES];

/** */
AcResult
acGridInit(const AcMeshInfo node_config)
{
    acNodeCreate(0, node_config, &nodes[0]);
    ++num_nodes;
    WARNING("Proper multinode not yet implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridQuit(void)
{
    acNodeDestroy(nodes[0]);
    --num_nodes;
    WARNING("Proper multinode not yet implemented");
    return AC_FAILURE;
}
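
// For illustration: the intended call pattern of the grid layer once it is
// implemented (most functions below are still stubs that warn and return
// AC_FAILURE; all values here are placeholders).
#if 0
acGridInit(info);                            // currently wraps acNodeCreate(0, ...)
acGridLoadMesh(STREAM_DEFAULT, host_mesh);   // stub for now
acGridIntegrateStep(STREAM_DEFAULT, dt);     // stub for now
acGridStoreMesh(STREAM_DEFAULT, &host_mesh); // stub for now
acGridQuit();
#endif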

/** */
AcResult
acGridSynchronizeStream(const Stream stream)
{
    for (size_t i = 0; i < num_nodes; ++i) {
        acNodeSynchronizeStream(nodes[i], stream);
    }
    WARNING("Proper multinode not yet implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridSwapBuffers(void)
{
    for (size_t i = 0; i < num_nodes; ++i) {
        acNodeSwapBuffers(nodes[i]);
    }
    WARNING("Proper multinode not yet implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridLoadConstant(const Stream stream, const AcRealParam param, const AcReal value)
{
    for (size_t i = 0; i < num_nodes; ++i) {
        acNodeLoadConstant(nodes[i], stream, param, value);
    }
    WARNING("Proper multinode not yet implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridLoadVertexBufferWithOffset(const Stream stream, const AcMesh host_mesh,
                                 const VertexBufferHandle vtxbuf_handle, const int3 src,
                                 const int3 dst, const int num_vertices)
{
    for (size_t i = 0; i < num_nodes; ++i) {
        acNodeLoadVertexBufferWithOffset(nodes[i], stream, host_mesh, vtxbuf_handle, src, dst,
                                         num_vertices);
    }
    WARNING("Proper multinode not yet implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridLoadMeshWithOffset(const Stream stream, const AcMesh host_mesh, const int3 src,
                         const int3 dst, const int num_vertices)
{
    for (size_t i = 0; i < num_nodes; ++i) {
        acNodeLoadMeshWithOffset(nodes[i], stream, host_mesh, src, dst, num_vertices);
    }
    WARNING("Proper multinode not yet implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridLoadVertexBuffer(const Stream stream, const AcMesh host_mesh,
                       const VertexBufferHandle vtxbuf_handle)
{
    for (size_t i = 0; i < num_nodes; ++i) {
        acNodeLoadVertexBuffer(nodes[i], stream, host_mesh, vtxbuf_handle);
    }
    WARNING("Proper multinode not yet implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridLoadMesh(const Stream stream, const AcMesh host_mesh)
{
    WARNING("Not implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridStoreVertexBufferWithOffset(const Stream stream, const VertexBufferHandle vtxbuf_handle,
                                  const int3 src, const int3 dst, const int num_vertices,
                                  AcMesh* host_mesh)
{
    WARNING("Not implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridStoreMeshWithOffset(const Stream stream, const int3 src, const int3 dst,
                          const int num_vertices, AcMesh* host_mesh)
{
    WARNING("Not implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridStoreVertexBuffer(const Stream stream, const VertexBufferHandle vtxbuf_handle,
                        AcMesh* host_mesh)
{
    WARNING("Not implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridStoreMesh(const Stream stream, AcMesh* host_mesh)
{
    WARNING("Not implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, const int3 end,
                       const AcReal dt)
{
    WARNING("Not implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridIntegrateStep(const Stream stream, const AcReal dt)
{
    WARNING("Not implemented");
    return AC_FAILURE;
}

/** */
AcResult
acGridPeriodicBoundcondStep(const Stream stream)
{
    WARNING("Not implemented");
    return AC_FAILURE;
}

#if 0
/** */
AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
                         const VertexBufferHandle vec0, const VertexBufferHandle vec1,
                         const VertexBufferHandle vec2, AcReal* result);
#endif
/** */
AcResult
acGridReduceScal(const Stream stream, const ReductionType rtype,
                 const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
    WARNING("Not implemented");
    return AC_FAILURE;
}
/** */
AcResult
acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0,
                const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result)
{
    WARNING("Not implemented");
    return AC_FAILURE;
}
818  src/core/node.cu  Normal file
@@ -0,0 +1,818 @@
/*
    Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.

    This file is part of Astaroth.

    Astaroth is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Astaroth is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * @file
 * \brief Multi-GPU implementation.
 *
 %JP: The old way of computing boundary conditions conflicts with the
 way we have to do things with multiple GPUs.

 The older approach relied on unified memory, which represented the whole
 memory area as one huge mesh instead of several smaller ones. However, unified memory
 in its current state is more meant for quick prototyping when performance is not an issue.
 Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
 than managing the memory explicitly.

 In this new approach, I have simplified the multi- and single-GPU layers significantly.
 Quick rundown:
        New struct: Grid. There are two global variables, "grid" and "subgrid", which
        contain the extents of the whole simulation domain and the decomposed grids,
        respectively. To simplify things, we require that each GPU is assigned the same
        amount of work, therefore each GPU in the node is assigned a "subgrid.m"-sized
        block of data to work with.

        The whole simulation domain is decomposed with respect to the z dimension.
        For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
        contain (nx, ny, nz / num_devices) vertices.

        A local index (i, j, k) in some subgrid can be mapped to the global grid with
                global idx = (i, j, k + device_id * subgrid.n.z)

 Terminology:
        - Single-GPU function: a function defined on the single-GPU layer (device.cu)

 Changes required to this commented code block:
        - The thread block dimensions (tpb) are no longer passed to the kernel here but in
          device.cu instead. Same holds for any complex index calculations. Instead, the local
          coordinates should be passed as an int3 type without having to consider how the data
          is actually laid out in device memory
        - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
          handle of type "Device" which should be passed to single-GPU functions. In this file,
          all devices are stored in a global array "devices[num_devices]".
        - Every single-GPU function is executed asynchronously by default such that we
          can optimize Astaroth by executing memory transactions concurrently with
          computation. Therefore a StreamType should be passed as a parameter to single-GPU
          functions. Refresher: CUDA function calls are non-blocking when a stream is
          explicitly passed as a parameter and commands executing in different streams can be
          processed in parallel/concurrently.

 Note on periodic boundaries (might be helpful when implementing other boundary conditions):

 With multiple GPUs, periodic boundary conditions applied on indices ranging from

        (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)

 on a single device are "local", in the sense that they can be computed without
 having to exchange data with neighboring GPUs. Special care is needed only for transferring
 the data to the front and back plates outside this range. In the solution we use
 here, we solve the local boundaries first, and then just exchange the front and back plates
 in a "ring", like so
                        device_id
    (n) <-> 0 <-> 1 <-> ... <-> n <-> (0)

 ### Throughout this file we use the following notation and names for various index offsets

 Global coordinates: coordinates with respect to the global grid (static Grid grid)
 Local coordinates: coordinates with respect to the local subgrid (static Grid subgrid)

        s0, s1: source indices in global coordinates
        d0, d1: destination indices in global coordinates
        da = max(s0, d0);
        db = min(s1, d1);

 These are used in at least
        acLoad()
        acStore()
        acSynchronizeHalos()

 Here we decompose the host mesh and distribute it among the GPUs in
 the node.

 The host mesh is a huge contiguous block of data. Its dimensions are given by
 the global variable named "grid". A "grid" is decomposed into "subgrids",
 one for each GPU. Here we check which parts of the range s0...s1 map
 to the memory space stored by some GPU, ranging d0...d1, and transfer
 the data if needed.

 The index mapping is inherently quite involved, but here's a picture which
 hopefully helps make sense of all this.

 Grid
                             |----num_vertices---|
 xxx|....................................................|xxx
    ^   ^                    ^                   ^
    d0  d1                   s0 (src)            s1

 Subgrid

            xxx|.............|xxx
               ^             ^
               d0            d1

                ^            ^
                db           da
 *
 */
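// For illustration: the local-to-global mapping described above, as a small
// standalone helper (the name localToGlobal is local to this sketch).
#if 0
static int3
localToGlobal(const int3 local_idx, const int device_id, const int3 subgrid_n)
{
    // The domain is decomposed along z only, so x and y map through unchanged
    return (int3){local_idx.x, local_idx.y, local_idx.z + device_id * subgrid_n.z};
}
// E.g. with subgrid.n.z == 64, local (i, j, k) on device 2 maps to (i, j, k + 128).
#endif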
#include "astaroth_node.h"

#include "astaroth_device.h"
#include "errchk.h"
#include "math_utils.h" // sum for reductions

static const int MAX_NUM_DEVICES = 32;

typedef struct {
    int3 m;
    int3 n;
} Grid;

struct node_s {
    int id;

    int num_devices;
    Device devices[MAX_NUM_DEVICES];

    Grid grid;
    Grid subgrid;

    AcMeshInfo config;
};

static int
gridIdx(const Grid grid, const int3 idx)
{
    return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
}

static int3
gridIdx3d(const Grid grid, const int idx)
{
    return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
                  idx / (grid.m.x * grid.m.y)};
}

static void
printInt3(const int3 vec)
{
    printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
}

static inline void
print(const AcMeshInfo config)
{
    for (int i = 0; i < NUM_INT_PARAMS; ++i)
        printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
    for (int i = 0; i < NUM_REAL_PARAMS; ++i)
        printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
}

static void
update_builtin_params(AcMeshInfo* config)
{
    config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
    ///////////// PAD TEST
    // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
    ///////////// PAD TEST
    config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
    config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;

    // Bounds for the computational domain, i.e. nx_min <= i < nx_max
    config->int_params[AC_nx_min] = NGHOST;
    config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
    config->int_params[AC_ny_min] = NGHOST;
    config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
    config->int_params[AC_nz_min] = NGHOST;
    config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;

    /* Additional helper params */
    // Int helpers
    config->int_params[AC_mxy]  = config->int_params[AC_mx] * config->int_params[AC_my];
    config->int_params[AC_nxy]  = config->int_params[AC_nx] * config->int_params[AC_ny];
    config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
}
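
// For concreteness: with STENCIL_ORDER == 6 (so NGHOST == 3, as the defaults
// elsewhere suggest) and nx == 128, the helper above produces the following
// values for a single axis. Illustrative only.
#if 0
const int nx     = 128;
const int mx     = nx + STENCIL_ORDER; // 134: interior plus ghost zones
const int nx_min = NGHOST;             // 3
const int nx_max = nx_min + nx;        // 131; valid interior: nx_min <= i < nx_max
#endif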

static Grid
createGrid(const AcMeshInfo config)
{
    Grid grid;

    grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
    grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};

    return grid;
}

AcResult
acNodeCreate(const int id, const AcMeshInfo node_config, Node* node_handle)
{
    struct node_s* node = (struct node_s*)malloc(sizeof(*node));
    node->id            = id;
    node->config        = node_config;

    // Get node->num_devices
    ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&node->num_devices));
    if (node->num_devices < 1) {
        ERROR("No CUDA devices found!");
        return AC_FAILURE;
    }
    if (node->num_devices > MAX_NUM_DEVICES) {
        WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
        node->num_devices = MAX_NUM_DEVICES;
    }
    if (!AC_MULTIGPU_ENABLED) {
        WARNING("MULTIGPU_ENABLED was false. Using only one device");
        node->num_devices = 1; // Use only one device if multi-GPU is not enabled
    }
    // Check that AC_nz is divisible by node->num_devices. This makes decomposing the
    // problem domain to multiple GPUs much easier since we do not have to worry
    // about remainders
    ERRCHK_ALWAYS(node->config.int_params[AC_nz] % node->num_devices == 0);

    // Decompose the problem domain
    // The main grid
    node->grid = createGrid(node->config);

    // Subgrids
    AcMeshInfo subgrid_config = node->config;
    subgrid_config.int_params[AC_nz] /= node->num_devices;
    update_builtin_params(&subgrid_config);
#if VERBOSE_PRINTING // Defined in astaroth.h
    printf("###############################################################\n");
    printf("Config dimensions recalculated:\n");
    print(subgrid_config);
    printf("###############################################################\n");
#endif
    node->subgrid = createGrid(subgrid_config);

    // Periodic boundary conditions become weird if the system can "fold onto itself".
    ERRCHK_ALWAYS(node->subgrid.n.x >= STENCIL_ORDER);
    ERRCHK_ALWAYS(node->subgrid.n.y >= STENCIL_ORDER);
    ERRCHK_ALWAYS(node->subgrid.n.z >= STENCIL_ORDER);

#if VERBOSE_PRINTING
    // clang-format off
    printf("Grid m ");    printInt3(node->grid.m);    printf("\n");
    printf("Grid n ");    printInt3(node->grid.n);    printf("\n");
    printf("Subgrid m "); printInt3(node->subgrid.m); printf("\n");
    printf("Subgrid n "); printInt3(node->subgrid.n); printf("\n");
    // clang-format on
#endif

    // Initialize the devices
    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
        const int3 multinode_offset = (int3){0, 0, 0}; // Placeholder
        const int3 multigpu_offset  = (int3){0, 0, i * node->subgrid.n.z};
        subgrid_config.int3_params[AC_global_grid_n]   = node->grid.n;
        subgrid_config.int3_params[AC_multigpu_offset] = multinode_offset + multigpu_offset;

        acDeviceCreate(i, subgrid_config, &node->devices[i]);
        acDevicePrintInfo(node->devices[i]);
    }

    // Enable peer access
    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
        const int front = (i + 1) % node->num_devices;
        const int back  = (i - 1 + node->num_devices) % node->num_devices;

        int can_access_front, can_access_back;
        cudaDeviceCanAccessPeer(&can_access_front, i, front);
        cudaDeviceCanAccessPeer(&can_access_back, i, back);
#if VERBOSE_PRINTING
        printf(
            "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
            i, front, can_access_front, back, can_access_back);
#endif

        cudaSetDevice(i);
        if (can_access_front) {
            ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
        }
        if (can_access_back) {
            ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
        }
    }
    acNodeSynchronizeStream(node, STREAM_ALL);

    *node_handle = node;
    return AC_SUCCESS;
}
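
// A worked example of the decomposition above, assuming a 256^3 domain on
// 4 GPUs with STENCIL_ORDER == 6 (numbers illustrative):
//   grid.n    = (256, 256, 256), grid.m    = (262, 262, 262)
//   subgrid.n = (256, 256,  64), subgrid.m = (262, 262,  70)
// The divisibility check passes (256 % 4 == 0) and device i owns the global
// z-slab [i * 64, (i + 1) * 64) of the computational domain.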

AcResult
acNodeDestroy(Node node)
{
    acNodeSynchronizeStream(node, STREAM_ALL);

    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
        acDeviceDestroy(node->devices[i]);
    }
    free(node);

    return AC_SUCCESS;
}

AcResult
acNodePrintInfo(const Node node)
{
    (void)node;
    WARNING("Not implemented");
    return AC_FAILURE;
}

AcResult
acNodeQueryDeviceConfiguration(const Node node, DeviceConfiguration* config)
{
    (void)node;
    (void)config;
    WARNING("Not implemented");
    return AC_FAILURE;
}

AcResult
acNodeAutoOptimize(const Node node)
{
    (void)node;
    WARNING("Not implemented");
    return AC_FAILURE;
}

AcResult
acNodeSynchronizeStream(const Node node, const Stream stream)
{
    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
        acDeviceSynchronizeStream(node->devices[i], stream);
    }

    return AC_SUCCESS;
}

AcResult
acNodeSynchronizeVertexBuffer(const Node node, const Stream stream,
                              const VertexBufferHandle vtxbuf_handle)
{
    acNodeSynchronizeStream(node, stream);
    // Exchanges the halos of subgrids
    // After this step, the data within the main grid ranging from
    // (0, 0, NGHOST) -> (grid.m.x, grid.m.y, NGHOST + grid.n.z)
    // has been synchronized and transferred to the appropriate subgrids

    // We loop only to node->num_devices - 1 since the front and back plates of the grid are
    // not transferred because their contents depend on the boundary conditions.

    // IMPORTANT NOTE: the boundary conditions must be applied before calling
    // this function! I.e. the halos of subgrids must contain up-to-date data!

    const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;

    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices - 1; ++i) {
        // ...|ooooxxx|... -> xxx|ooooooo|...
        const int3 src = (int3){0, 0, node->subgrid.n.z};
        const int3 dst = (int3){0, 0, 0};

        const Device src_device = node->devices[i];
        Device dst_device       = node->devices[i + 1];

        acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                               num_vertices, dst_device);
    }
    // #pragma omp parallel for
    for (int i = 1; i < node->num_devices; ++i) {
        // ...|ooooooo|xxx <- ...|xxxoooo|...
        const int3 src = (int3){0, 0, NGHOST};
        const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};

        const Device src_device = node->devices[i];
        Device dst_device       = node->devices[i - 1];

        acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                               num_vertices, dst_device);
    }
    return AC_SUCCESS;
}
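
// Concretely, with three devices the two loops above perform the exchanges
// 0 -> 1, 1 -> 2 (forward leg) and 1 -> 0, 2 -> 1 (backward leg); the
// outermost plates are left to the boundary conditions. Each plate is
// NGHOST full xy-slices of the subgrid, matching num_vertices above:
//   forward leg:  src (0, 0, subgrid.n.z) -> dst (0, 0, 0)
//   backward leg: src (0, 0, NGHOST)      -> dst (0, 0, NGHOST + subgrid.n.z)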
|
||||
|
||||
AcResult
|
||||
acNodeSynchronizeMesh(const Node node, const Stream stream)
|
||||
{
|
||||
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
|
||||
acNodeSynchronizeVertexBuffer(node, stream, (VertexBufferHandle)i);
|
||||
}
|
||||
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeSwapBuffers(const Node node)
|
||||
{
|
||||
// #pragma omp parallel for
|
||||
for (int i = 0; i < node->num_devices; ++i) {
|
||||
acDeviceSwapBuffers(node->devices[i]);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
AcResult
|
||||
acNodeLoadConstant(const Node node, const Stream stream, const AcRealParam param,
|
||||
const AcReal value)
|
||||
{
|
||||
acNodeSynchronizeStream(node, stream);
|
||||
// #pragma omp parallel for
|
||||
for (int i = 0; i < node->num_devices; ++i) {
|
||||
acDeviceLoadConstant(node->devices[i], stream, param, value);
|
||||
}
|
||||
return AC_SUCCESS;
|
||||
}

AcResult
acNodeLoadVertexBufferWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
                                 const VertexBufferHandle vtxbuf_handle, const int3 src,
                                 const int3 dst, const int num_vertices)
{
    acNodeSynchronizeStream(node, stream);
    // See the beginning of the file for an explanation of the index mapping
    // // #pragma omp parallel for
    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
        const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
        const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};

        const int3 s0 = src; // dst; // TODO fix
        (void)dst;           // TODO fix
        const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);

        const int3 da = max(s0, d0);
        const int3 db = min(s1, d1);
        /*
        printf("Device %d\n", i);
        printf("\ts0: "); printInt3(s0); printf("\n");
        printf("\td0: "); printInt3(d0); printf("\n");
        printf("\tda: "); printInt3(da); printf("\n");
        printf("\tdb: "); printInt3(db); printf("\n");
        printf("\td1: "); printInt3(d1); printf("\n");
        printf("\ts1: "); printInt3(s1); printf("\n");
        printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
        */
        if (db.z >= da.z) {
            const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da);
            // DECOMPOSITION OFFSET HERE
            const int3 da_global = da; // src + da - dst; // TODO fix
            const int3 da_local  = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices};
            // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
            // printf("\n");
            acDeviceLoadVertexBufferWithOffset(node->devices[i], stream, host_mesh, vtxbuf_handle,
                                               da_global, da_local, copy_cells);
        }
        // printf("\n");
    }
    return AC_SUCCESS;
}
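
/* [Editor's note] A sketch (not part of the source) of the clamp above: the
 * requested vertex range and each device's slab are intersected as intervals
 * along z, and a device copies only if the intersection is non-empty,
 * mirroring the db.z >= da.z test. */
static int intersect_slab_sketch(const int s0, const int s1, // requested range
                                 const int d0, const int d1, // device slab
                                 int* da, int* db)           // out: overlap
{
    *da = s0 > d0 ? s0 : d0; // da = max(s0, d0)
    *db = s1 < d1 ? s1 : d1; // db = min(s1, d1)
    return *db >= *da;       // this device participates only on overlap
}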

AcResult
acNodeLoadMeshWithOffset(const Node node, const Stream stream, const AcMesh host_mesh,
                         const int3 src, const int3 dst, const int num_vertices)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, (VertexBufferHandle)i, src, dst,
                                         num_vertices);
    }
    return AC_SUCCESS;
}

AcResult
acNodeLoadVertexBuffer(const Node node, const Stream stream, const AcMesh host_mesh,
                       const VertexBufferHandle vtxbuf_handle)
{
    const int3 src = (int3){0, 0, 0};
    const int3 dst = src;
    const size_t num_vertices = acVertexBufferSize(host_mesh.info);

    acNodeLoadVertexBufferWithOffset(node, stream, host_mesh, vtxbuf_handle, src, dst,
                                     num_vertices);
    return AC_SUCCESS;
}

AcResult
acNodeLoadMesh(const Node node, const Stream stream, const AcMesh host_mesh)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodeLoadVertexBuffer(node, stream, host_mesh, (VertexBufferHandle)i);
    }
    return AC_SUCCESS;
}

AcResult
acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
                                  const VertexBufferHandle vtxbuf_handle, const int3 src,
                                  const int3 dst, const int num_vertices, AcMesh* host_mesh)
{
    acNodeSynchronizeStream(node, stream);
    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
        const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
        const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};

        const int3 s0 = src; // TODO fix
        (void)dst;           // TODO fix
        const int3 s1 = gridIdx3d(node->grid, gridIdx(node->grid, s0) + num_vertices);

        const int3 da = max(s0, d0);
        const int3 db = min(s1, d1);
        if (db.z >= da.z) {
            const int copy_cells = gridIdx(node->subgrid, db) - gridIdx(node->subgrid, da);
            // DECOMPOSITION OFFSET HERE
            const int3 da_local  = (int3){da.x, da.y, da.z - i * node->grid.n.z / node->num_devices};
            const int3 da_global = da; // dst + da - src; // TODO fix
            acDeviceStoreVertexBufferWithOffset(node->devices[i], stream, vtxbuf_handle, da_local,
                                                da_global, copy_cells, host_mesh);
        }
    }
    return AC_SUCCESS;
}

AcResult
acNodeStoreMeshWithOffset(const Node node, const Stream stream, const int3 src, const int3 dst,
                          const int num_vertices, AcMesh* host_mesh)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodeStoreVertexBufferWithOffset(node, stream, (VertexBufferHandle)i, src, dst,
                                          num_vertices, host_mesh);
    }
    return AC_SUCCESS;
}

AcResult
acNodeStoreVertexBuffer(const Node node, const Stream stream,
                        const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
{
    const int3 src = (int3){0, 0, 0};
    const int3 dst = src;
    const size_t num_vertices = acVertexBufferSize(host_mesh->info);

    acNodeStoreVertexBufferWithOffset(node, stream, vtxbuf_handle, src, dst, num_vertices,
                                      host_mesh);

    return AC_SUCCESS;
}

AcResult
acNodeStoreMesh(const Node node, const Stream stream, AcMesh* host_mesh)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodeStoreVertexBuffer(node, stream, (VertexBufferHandle)i, host_mesh);
    }
    return AC_SUCCESS;
}

AcResult
acNodeIntegrateSubstep(const Node node, const Stream stream, const int isubstep, const int3 start,
                       const int3 end, const AcReal dt)
{
    acNodeSynchronizeStream(node, stream);

    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
        // DECOMPOSITION OFFSET HERE
        const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * node->subgrid.n.z};
        const int3 d1 = d0 + (int3){node->subgrid.n.x, node->subgrid.n.y, node->subgrid.n.z};

        const int3 da = max(start, d0);
        const int3 db = min(end, d1);

        if (db.z >= da.z) {
            const int3 da_local = da - (int3){0, 0, i * node->subgrid.n.z};
            const int3 db_local = db - (int3){0, 0, i * node->subgrid.n.z};
            acDeviceIntegrateSubstep(node->devices[i], stream, isubstep, da_local, db_local, dt);
        }
    }
    return AC_SUCCESS;
}

static AcResult
local_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf)
{
    acNodeSynchronizeStream(node, stream);

    if (node->num_devices > 1) {
        // Local boundary conditions
        // #pragma omp parallel for
        for (int i = 0; i < node->num_devices; ++i) {
            const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
            const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.n.z};
            acDevicePeriodicBoundcondStep(node->devices[i], stream, vtxbuf, d0, d1);
        }
    }
    else {
        acDevicePeriodicBoundcondStep(node->devices[0], stream, vtxbuf, (int3){0, 0, 0},
                                      node->subgrid.m);
    }
    return AC_SUCCESS;
}

static AcResult
global_boundcondstep(const Node node, const Stream stream, const VertexBufferHandle vtxbuf_handle)
{
    acNodeSynchronizeStream(node, stream);

    if (node->num_devices > 1) {
        const size_t num_vertices = node->subgrid.m.x * node->subgrid.m.y * NGHOST;
        {
            // ...|ooooxxx|... -> xxx|ooooooo|...
            const int3 src = (int3){0, 0, node->subgrid.n.z};
            const int3 dst = (int3){0, 0, 0};

            const Device src_device = node->devices[node->num_devices - 1];
            Device dst_device       = node->devices[0];

            acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                                   num_vertices, dst_device);
        }
        {
            // ...|ooooooo|xxx <- ...|xxxoooo|...
            const int3 src = (int3){0, 0, NGHOST};
            const int3 dst = (int3){0, 0, NGHOST + node->subgrid.n.z};

            const Device src_device = node->devices[0];
            Device dst_device       = node->devices[node->num_devices - 1];

            acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
                                                   num_vertices, dst_device);
        }
    }
    return AC_SUCCESS;
}
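
/* [Editor's note] A sketch (not part of the source) of the wrap-around above:
 * with periodic boundaries in z, the pair (last device, first device) is
 * treated like any neighboring pair in acNodeSynchronizeVertexBuffer().
 * Hypothetical sizes NGHOST = 3, nz = 32: */
static void periodic_wrap_sketch(void)
{
    const int nghost = 3, nz = 32;
    const int src_top    = nz;          // top computational planes of the last device
    const int dst_bottom = 0;           // fill the first device's bottom ghost zone
    const int src_bottom = nghost;      // bottom computational planes of the first device
    const int dst_top    = nghost + nz; // fill the last device's top ghost zone
    (void)src_top; (void)dst_bottom; (void)src_bottom; (void)dst_top;
}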

AcResult
acNodeIntegrate(const Node node, const AcReal dt)
{
    acNodeSynchronizeStream(node, STREAM_ALL);
    // xxx|OOO OOOOOOOOO OOO|xxx
    //     ^   ^         ^   ^
    //     n0  n1        n2  n3
    // const int3 n0 = (int3){NGHOST, NGHOST, NGHOST};
    // const int3 n1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
    // const int3 n2 = node->grid.n;
    // const int3 n3 = n0 + node->grid.n;

    for (int isubstep = 0; isubstep < 3; ++isubstep) {
        acNodeSynchronizeStream(node, STREAM_ALL);
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
            local_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
        }
        acNodeSynchronizeStream(node, STREAM_ALL);

        // Inner inner
        // #pragma omp parallel for
        for (int i = 0; i < node->num_devices; ++i) {
            const int3 m1 = (int3){2 * NGHOST, 2 * NGHOST, 2 * NGHOST};
            const int3 m2 = node->subgrid.n;
            acDeviceIntegrateSubstep(node->devices[i], STREAM_16, isubstep, m1, m2, dt);
        }

        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
            acNodeSynchronizeVertexBuffer(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
            global_boundcondstep(node, (Stream)vtxbuf, (VertexBufferHandle)vtxbuf);
        }
        for (int vtxbuf = 0; vtxbuf < NUM_VTXBUF_HANDLES; ++vtxbuf) {
            acNodeSynchronizeStream(node, (Stream)vtxbuf);
        }

        // #pragma omp parallel for
        for (int i = 0; i < node->num_devices; ++i) { // Front
            const int3 m1 = (int3){NGHOST, NGHOST, NGHOST};
            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
            acDeviceIntegrateSubstep(node->devices[i], STREAM_0, isubstep, m1, m2, dt);
        }
        // #pragma omp parallel for
        for (int i = 0; i < node->num_devices; ++i) { // Back
            const int3 m1 = (int3){NGHOST, NGHOST, node->subgrid.n.z};
            const int3 m2 = m1 + (int3){node->subgrid.n.x, node->subgrid.n.y, NGHOST};
            acDeviceIntegrateSubstep(node->devices[i], STREAM_1, isubstep, m1, m2, dt);
        }
        // #pragma omp parallel for
        for (int i = 0; i < node->num_devices; ++i) { // Bottom
            const int3 m1 = (int3){NGHOST, NGHOST, 2 * NGHOST};
            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
            acDeviceIntegrateSubstep(node->devices[i], STREAM_2, isubstep, m1, m2, dt);
        }
        // #pragma omp parallel for
        for (int i = 0; i < node->num_devices; ++i) { // Top
            const int3 m1 = (int3){NGHOST, node->subgrid.n.y, 2 * NGHOST};
            const int3 m2 = m1 + (int3){node->subgrid.n.x, NGHOST, node->subgrid.n.z - 2 * NGHOST};
            acDeviceIntegrateSubstep(node->devices[i], STREAM_3, isubstep, m1, m2, dt);
        }
        // #pragma omp parallel for
        for (int i = 0; i < node->num_devices; ++i) { // Left
            const int3 m1 = (int3){NGHOST, 2 * NGHOST, 2 * NGHOST};
            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
                                        node->subgrid.n.z - 2 * NGHOST};
            acDeviceIntegrateSubstep(node->devices[i], STREAM_4, isubstep, m1, m2, dt);
        }
        // #pragma omp parallel for
        for (int i = 0; i < node->num_devices; ++i) { // Right
            const int3 m1 = (int3){node->subgrid.n.x, 2 * NGHOST, 2 * NGHOST};
            const int3 m2 = m1 + (int3){NGHOST, node->subgrid.n.y - 2 * NGHOST,
                                        node->subgrid.n.z - 2 * NGHOST};
            acDeviceIntegrateSubstep(node->devices[i], STREAM_5, isubstep, m1, m2, dt);
        }
        acNodeSwapBuffers(node);
    }
    acNodeSynchronizeStream(node, STREAM_ALL);
    return AC_SUCCESS;
}
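
/* [Editor's note] A sketch (not part of the source) checking that the seven
 * launches above -- inner core plus front/back/bottom/top/left/right shells --
 * tile each device's computational box exactly once. Hypothetical cubic sizes
 * n = 128, g = NGHOST = 3: */
#include <assert.h>
static void integration_tiling_sketch(void)
{
    const long n = 128, g = 3;
    const long inner      = (n - 2 * g) * (n - 2 * g) * (n - 2 * g);
    const long front_back = 2 * n * n * g;
    const long bottom_top = 2 * n * g * (n - 2 * g);
    const long left_right = 2 * g * (n - 2 * g) * (n - 2 * g);
    assert(inner + front_back + bottom_top + left_right == n * n * n);
}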

AcResult
acNodePeriodicBoundcondStep(const Node node, const Stream stream,
                            const VertexBufferHandle vtxbuf_handle)
{
    local_boundcondstep(node, stream, vtxbuf_handle);
    acNodeSynchronizeVertexBuffer(node, stream, vtxbuf_handle);

    // TODO NOTE GLOBAL BOUNDCONDS NOT DONE HERE IF MORE THAN 1 NODE
    global_boundcondstep(node, stream, vtxbuf_handle);
    // WARNING("Global boundconds should not be done here with multinode");

    return AC_SUCCESS;
}

AcResult
acNodePeriodicBoundconds(const Node node, const Stream stream)
{
    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
        acNodePeriodicBoundcondStep(node, stream, (VertexBufferHandle)i);
    }
    return AC_SUCCESS;
}

static AcReal
simple_final_reduce_scal(const Node node, const ReductionType& rtype, const AcReal* results,
                         const int& n)
{
    AcReal res = results[0];
    for (int i = 1; i < n; ++i) {
        if (rtype == RTYPE_MAX) {
            res = max(res, results[i]);
        }
        else if (rtype == RTYPE_MIN) {
            res = min(res, results[i]);
        }
        else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
            res = sum(res, results[i]);
        }
        else {
            ERROR("Invalid rtype");
        }
    }

    if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
        const AcReal inv_n = AcReal(1.) / (node->grid.n.x * node->grid.n.y * node->grid.n.z);
        res = sqrt(inv_n * res);
    }
    return res;
}
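
/* [Editor's note] For RTYPE_RMS / RTYPE_RMS_EXP each device returns a partial
 * sum of squares over its slab, so the partials combine by addition and only
 * the final result is normalized and rooted. A hedged arithmetic sketch with
 * hypothetical partials on a 128^3 grid: */
#include <assert.h>
#include <math.h>
static void rms_combine_sketch(void)
{
    const double partial[2] = {3.0e6, 5.0e6};          // per-device sums of squares
    const double res        = partial[0] + partial[1]; // combine by summation
    const double inv_n      = 1.0 / (128.0 * 128.0 * 128.0);
    const double rms        = sqrt(inv_n * res);       // normalize once, at the end
    assert(rms > 1.95 && rms < 1.96);                  // ~1.953
}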

AcResult
acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
                 const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
    acNodeSynchronizeStream(node, STREAM_ALL);

    AcReal results[node->num_devices];
    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
        acDeviceReduceScal(node->devices[i], stream, rtype, vtxbuf_handle, &results[i]);
    }

    *result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
    return AC_SUCCESS;
}

AcResult
acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype,
                const VertexBufferHandle a, const VertexBufferHandle b, const VertexBufferHandle c,
                AcReal* result)
{
    acNodeSynchronizeStream(node, STREAM_ALL);

    AcReal results[node->num_devices];
    // #pragma omp parallel for
    for (int i = 0; i < node->num_devices; ++i) {
        acDeviceReduceVec(node->devices[i], stream, rtype, a, b, c, &results[i]);
    }

    *result = simple_final_reduce_scal(node, rtype, results, node->num_devices);
    return AC_SUCCESS;
}

@@ -29,13 +29,13 @@
#include <stdio.h>

#include "config_loader.h"
-#include "src/core/math_utils.h"
#include "model/host_forcing.h"
#include "model/host_memory.h"
#include "model/host_timestep.h"
#include "model/model_boundconds.h"
#include "model/model_reduce.h"
#include "model/model_rk3.h"
+#include "src/core/math_utils.h"

#include "src/core/errchk.h"


@@ -431,8 +431,9 @@ check_rk3(const AcMeshInfo& mesh_info)
acIntegrate(dt);

model_rk3(dt, model_mesh);
+boundconds(model_mesh->info, model_mesh);
}
-boundconds(model_mesh->info, model_mesh);
+acBoundcondStep();
acStore(gpu_mesh);

bool is_acceptable = verify_meshes(*model_mesh, *gpu_mesh);

@@ -726,6 +727,7 @@ run_autotest(void)

// Device integration step
acIntegrate(dt);
+acBoundcondStep();
acStore(candidate_mesh);

// Check fields


@@ -197,7 +197,6 @@ run_benchmark(void)
double(results[int(NTH_PERCENTILE * NUM_ITERS)]), int(NTH_PERCENTILE * 100),
mesh_info.int_params[AC_nx]);

-acStore(mesh);
acQuit();
acmesh_destroy(mesh);
}

@@ -269,7 +268,6 @@ run_benchmark(void)
write_result_to_file(wallclock * 1e3f / steps);
#endif

-acStore(mesh);
acQuit();
acmesh_destroy(mesh);


@@ -43,6 +43,9 @@ typedef struct {
ModelScalar value;
ModelVector gradient;
ModelMatrix hessian;
+#if LUPWD
+ModelVector upwind;
+#endif
} ModelScalarData;

typedef struct {

@@ -273,6 +276,53 @@ derzz(const int i, const int j, const int k, const ModelScalar* arr)
return second_derivative(pencil, get(AC_inv_dsz));
}

+#if LUPWD
+static inline ModelScalar
+der6x_upwd(const int i, const int j, const int k, const ModelScalar* arr)
+{
+    ModelScalar inv_ds = get(AC_inv_dsx);
+
+    return ModelScalar(1.0/60.0)*inv_ds* (
+            -ModelScalar( 20.0)* arr[IDX(i, j, k)]
+            +ModelScalar( 15.0)*(arr[IDX(i+1, j, k)]
+                               + arr[IDX(i-1, j, k)])
+            -ModelScalar(  6.0)*(arr[IDX(i+2, j, k)]
+                               + arr[IDX(i-2, j, k)])
+            + arr[IDX(i+3, j, k)]
+            + arr[IDX(i-3, j, k)]);
+}
+
+static inline ModelScalar
+der6y_upwd(const int i, const int j, const int k, const ModelScalar* arr)
+{
+    ModelScalar inv_ds = get(AC_inv_dsy);
+
+    return ModelScalar(1.0/60.0)*inv_ds* (
+            -ModelScalar( 20.0)* arr[IDX(i, j, k)]
+            +ModelScalar( 15.0)*(arr[IDX(i, j+1, k)]
+                               + arr[IDX(i, j-1, k)])
+            -ModelScalar(  6.0)*(arr[IDX(i, j+2, k)]
+                               + arr[IDX(i, j-2, k)])
+            + arr[IDX(i, j+3, k)]
+            + arr[IDX(i, j-3, k)]);
+}
+
+static inline ModelScalar
+der6z_upwd(const int i, const int j, const int k, const ModelScalar* arr)
+{
+    ModelScalar inv_ds = get(AC_inv_dsz);
+
+    return ModelScalar(1.0/60.0)*inv_ds* (
+            -ModelScalar( 20.0)* arr[IDX(i, j, k)]
+            +ModelScalar( 15.0)*(arr[IDX(i, j, k+1)]
+                               + arr[IDX(i, j, k-1)])
+            -ModelScalar(  6.0)*(arr[IDX(i, j, k+2)]
+                               + arr[IDX(i, j, k-2)])
+            + arr[IDX(i, j, k+3)]
+            + arr[IDX(i, j, k-3)]);
+}
+#endif
+
static inline ModelScalar
compute_value(const int i, const int j, const int k, const ModelScalar* arr)
{
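
/* [Editor's note, between hunks] The der6{x,y,z}_upwd functions added above use
 * the classical centered sixth-derivative stencil
 *   (1/60) * (-20 f_0 + 15 (f_{+1} + f_{-1}) - 6 (f_{+2} + f_{-2}) + f_{+3} + f_{-3}),
 * scaled by a single 1/ds. A quick property check (not part of the diff): the
 * weights sum to zero, so constants are annihilated, and applying the stencil
 * to f(x) = x^6 at unit spacing recovers 6! = 720 exactly. */
#include <assert.h>
static void der6_stencil_sketch(void)
{
    const double w[7] = {1, -6, 15, -20, 15, -6, 1}; // offsets -3..+3, before the 1/60 factor
    double wsum = 0, d6 = 0;
    for (int k = -3; k <= 3; ++k) {
        const double x = (double)k;
        wsum += w[k + 3];
        d6 += w[k + 3] * x * x * x * x * x * x; // f(x) = x^6
    }
    assert(wsum == 0.0); // constant fields produce no correction
    assert(d6 == 720.0); // the exact sixth derivative of x^6
}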

@@ -285,6 +335,14 @@ compute_gradient(const int i, const int j, const int k, const ModelScalar* arr)
return (ModelVector){derx(i, j, k, arr), dery(i, j, k, arr), derz(i, j, k, arr)};
}

+#if LUPWD
+static inline ModelVector
+compute_upwind(const int i, const int j, const int k, const ModelScalar* arr)
+{
+    return (ModelVector){der6x_upwd(i, j, k, arr), der6y_upwd(i, j, k, arr), der6z_upwd(i, j, k, arr)};
+}
+#endif
+
static inline ModelMatrix
compute_hessian(const int i, const int j, const int k, const ModelScalar* arr)
{

@@ -309,6 +367,10 @@ read_data(const int i, const int j, const int k, ModelScalar* buf[], const int handle)
// diagonals with all arrays
data.hessian = compute_hessian(i, j, k, buf[handle]);

+#if LUPWD
+data.upwind = compute_upwind(i, j, k, buf[handle]);
+#endif
+
return data;
}


@@ -354,6 +416,8 @@ gradients(const ModelVectorData& data)
return (ModelMatrix){gradient(data.x), gradient(data.y), gradient(data.z)};
}

+
+
/*
* =============================================================================
* Level 0.3 (Built-in functions available during the Stencil Processing Stage)

@@ -502,10 +566,27 @@ contract(const ModelMatrix& mat)
* Stencil Processing Stage (equations)
* =============================================================================
*/

+#if LUPWD
+ModelScalar
+upwd_der6(const ModelVectorData& uu, const ModelScalarData& lnrho)
+{
+    ModelScalar uux = fabs(value(uu).x);
+    ModelScalar uuy = fabs(value(uu).y);
+    ModelScalar uuz = fabs(value(uu).z);
+    return uux*lnrho.upwind.x + uuy*lnrho.upwind.y + uuz*lnrho.upwind.z;
+}
+#endif
+
static inline ModelScalar
continuity(const ModelVectorData& uu, const ModelScalarData& lnrho)
{
-    return -dot(value(uu), gradient(lnrho)) - divergence(uu);
+    return -dot(value(uu), gradient(lnrho))
+#if LUPWD
+           // This is a corrective hyperdiffusion term for upwinding.
+           + upwd_der6(uu, lnrho)
+#endif
+           - divergence(uu);
}

static inline ModelScalar
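
/* [Editor's note, between hunks] The LUPWD branch above augments the advection
 * term with upwd_der6(uu, lnrho) = |u_x| der6x(lnrho) + |u_y| der6y(lnrho)
 * + |u_z| der6z(lnrho): hyperdiffusion proportional to the local advection
 * speed. A minimal 1D sketch with hypothetical names and unit spacing: */
#include <math.h>
static double upwind_correction_1d_sketch(const double u, const double* f, const int i)
{
    // centered sixth-derivative stencil, same weights as der6x_upwd above
    const double der6 = (1.0 / 60.0) * (-20.0 * f[i]
                                        + 15.0 * (f[i + 1] + f[i - 1])
                                        - 6.0 * (f[i + 2] + f[i - 2])
                                        + f[i + 3] + f[i - 3]);
    return fabs(u) * der6; // added to -u * df/dx in the continuity equation
}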

@@ -679,23 +760,24 @@ helical_forcing(ModelScalar magnitude, ModelVector k_force, ModelVector xx, ModelVector ff_re,
                ModelVector ff_im, ModelScalar phi)
{
(void)magnitude; // WARNING: unused
-xx.x = xx.x * (2.0l * M_PI / (get(AC_dsx) * (get(AC_ny_max) - get(AC_ny_min))));
-xx.y = xx.y * (2.0l * M_PI / (get(AC_dsy) * (get(AC_ny_max) - get(AC_ny_min))));
-xx.z = xx.z * (2.0l * M_PI / (get(AC_dsz) * (get(AC_ny_max) - get(AC_ny_min))));
+xx.x = xx.x*(2.0*M_PI/(get(AC_dsx)*get(AC_nx)));
+xx.y = xx.y*(2.0*M_PI/(get(AC_dsy)*get(AC_ny)));
+xx.z = xx.z*(2.0*M_PI/(get(AC_dsz)*get(AC_nz)));

-ModelScalar cosl_phi = cosl(phi);
-ModelScalar sinl_phi = sinl(phi);
-ModelScalar cosl_k_dox_x = cosl(dot(k_force, xx));
-ModelScalar sinl_k_dox_x = sinl(dot(k_force, xx));
+ModelScalar cos_phi = cosl(phi);
+ModelScalar sin_phi = sinl(phi);
+ModelScalar cos_k_dot_x = cosl(dot(k_force, xx));
+ModelScalar sin_k_dot_x = sinl(dot(k_force, xx));
// Phase affects only the x-component
-// ModelScalar real_comp = cosl_k_dox_x;
-// ModelScalar imag_comp = sinl_k_dox_x;
-ModelScalar real_comp_phase = cosl_k_dox_x * cosl_phi - sinl_k_dox_x * sinl_phi;
-ModelScalar imag_comp_phase = cosl_k_dox_x * sinl_phi + sinl_k_dox_x * cosl_phi;
+//Scalar real_comp = cos_k_dot_x;
+//Scalar imag_comp = sin_k_dot_x;
+ModelScalar real_comp_phase = cos_k_dot_x*cos_phi - sin_k_dot_x*sin_phi;
+ModelScalar imag_comp_phase = cos_k_dot_x*sin_phi + sin_k_dot_x*cos_phi;

-ModelVector force = (ModelVector){ff_re.x * real_comp_phase - ff_im.x * imag_comp_phase,
-                                  ff_re.y * real_comp_phase - ff_im.y * imag_comp_phase,
-                                  ff_re.z * real_comp_phase - ff_im.z * imag_comp_phase};
+ModelVector force = (ModelVector){ ff_re.x*real_comp_phase - ff_im.x*imag_comp_phase,
+                                   ff_re.y*real_comp_phase - ff_im.y*imag_comp_phase,
+                                   ff_re.z*real_comp_phase - ff_im.z*imag_comp_phase};

return force;
}
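
/* [Editor's note, between hunks] The real/imag pair above is the angle-addition
 * identity: with theta = dot(k_force, xx),
 *   real_comp_phase = cos(theta) cos(phi) - sin(theta) sin(phi) = cos(theta + phi)
 *   imag_comp_phase = cos(theta) sin(phi) + sin(theta) cos(phi) = sin(theta + phi),
 * i.e. the complex product e^{i theta} e^{i phi}. A hypothetical sketch: */
#include <complex.h>
static double complex phased_wave_sketch(const double k_dot_x, const double phi)
{
    // equal to real_comp_phase + I * imag_comp_phase in helical_forcing()
    return cexp(I * (k_dot_x + phi));
}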

@@ -32,13 +32,13 @@
#include <string.h> // memcpy

#include "config_loader.h"
-#include "src/core/errchk.h"
-#include "src/core/math_utils.h"
#include "model/host_forcing.h"
#include "model/host_memory.h"
#include "model/host_timestep.h"
#include "model/model_reduce.h"
#include "model/model_rk3.h"
+#include "src/core/errchk.h"
+#include "src/core/math_utils.h"
#include "timer_hires.h"

// Window

@@ -106,6 +106,7 @@ renderer_init(const int& mx, const int& my)
// Setup window
window = SDL_CreateWindow("Astaroth", SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED,
                          window_width, window_height, SDL_WINDOW_SHOWN);
+ERRCHK_ALWAYS(window);

// Setup SDL renderer
renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);

@@ -409,9 +410,10 @@ run_renderer(void)
/* Render */
const float timer_diff_sec = timer_diff_nsec(frame_timer) / 1e9f;
if (timer_diff_sec >= desired_frame_time) {
-// acStore(mesh);
const int num_vertices = mesh->info.int_params[AC_mxy];
const int3 dst = (int3){0, 0, k_slice};
+acBoundcondStep();
+// acStore(mesh);
acStoreWithOffset(dst, num_vertices, mesh);
acSynchronizeStream(STREAM_ALL);
renderer_draw(*mesh); // Bottleneck is here

@@ -421,7 +423,6 @@ run_renderer(void)
}
printf("Wallclock time %f s\n", double(timer_diff_nsec(wallclock) / 1e9f));

-acStore(mesh);
acQuit();
acmesh_destroy(mesh);


@@ -27,13 +27,13 @@
#include "run.h"

#include "config_loader.h"
-#include "src/core/errchk.h"
-#include "src/core/math_utils.h"
#include "model/host_forcing.h"
#include "model/host_memory.h"
#include "model/host_timestep.h"
#include "model/model_reduce.h"
#include "model/model_rk3.h"
+#include "src/core/errchk.h"
+#include "src/core/math_utils.h"
#include "timer_hires.h"

#include <string.h>

@@ -216,6 +216,7 @@ run_simulation(void)
write_mesh_info(&mesh_info);
print_diagnostics(0, AcReal(.0), t_step, diag_file);

+acBoundcondStep();
acStore(mesh);
save_mesh(*mesh, 0, t_step);


@@ -229,13 +230,13 @@ run_simulation(void)
/* initialize random seed: */
srand(312256655);

-//TODO_SINK. init_sink_particle()
+// TODO_SINK. init_sink_particle()
// Initialize the basic variables of the sink particle to a suitable initial value.
// 1. Location of the particle
// 2. Mass of the particle
// (3. Velocity of the particle)
-// This is done at the level of the Host in this case.
-// acUpdate_sink_particle() will do a similar trick on the device.
+// This is done at the level of the Host in this case.
+// acUpdate_sink_particle() will do a similar trick on the device.

/* Step the simulation */
for (int i = 1; i < max_steps; ++i) {

@@ -247,28 +248,28 @@ run_simulation(void)
loadForcingParamsToDevice(forcing_params);
#endif


//TODO_SINK acUpdate_sink_particle()
// Update properties of the sink particle for acIntegrate(). Essentially:
// 1. Location of the particle
// 2. Mass of the particle
-// (3. Velocity of the particle)
+// 3. Velocity of the particle)
// These can be used for calculating the gravitational field.
+// This is my first comment! by Jack
+// This is my second comment! by Jack

acIntegrate(dt);
-//TODO_SINK acAdvect_sink_particle()
-// THIS IS OPTIONAL. We will start from an unmoving particle.
-// 1. Calculate the equation of motion for the sink particle.
-// NOTE: Might require embedding with acIntegrate(dt).
+// TODO_SINK acAdvect_sink_particle()
+// THIS IS OPTIONAL. We will start from an unmoving particle.
+// 1. Calculate the equation of motion for the sink particle.
+// NOTE: Might require embedding with acIntegrate(dt).

-//TODO_SINK acAccrete_sink_particle()
-// Calculate accretion of the sink particle from the surrounding medium
-// 1. Transfer density into sink particle mass
-// 2. Transfer momentum into sink particle
-// (OPTIONAL: Affecting the motion of the particle)
-// NOTE: Might require embedding with acIntegrate(dt).
-// This is the hardest part. Please see Lee et al. ApJ 783 (2014) for reference.
+// TODO_SINK acAccrete_sink_particle()
+// Calculate accretion of the sink particle from the surrounding medium
+// 1. Transfer density into sink particle mass
+// 2. Transfer momentum into sink particle
+// (OPTIONAL: Affecting the motion of the particle)
+// NOTE: Might require embedding with acIntegrate(dt).
+// This is the hardest part. Please see Lee et al. ApJ 783 (2014) for reference.

t_step += dt;


@@ -311,9 +312,10 @@ run_simulation(void)
assumed to be asynchronous, so the meshes must also be synchronized
before transferring the data to the CPU. Like so:

acSynchronize();
acBoundcondStep();
acStore(mesh);
*/
+acBoundcondStep();
acStore(mesh);

save_mesh(*mesh, i, t_step);