Reformatted
This commit is contained in:
@@ -26,7 +26,6 @@ extern "C" {
|
||||
#include <stdlib.h> // size_t
|
||||
//#include <vector_types.h> // CUDA vector types (float4, etc)
|
||||
|
||||
//#ifndef __CUDACC__
|
||||
#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
|
||||
#include <cuda_runtime_api.h>
|
||||
#else
|
||||
@@ -46,7 +45,6 @@ typedef struct {
|
||||
double x, y, z;
|
||||
} double3;
|
||||
#endif
|
||||
//#endif // __CUDACC__
|
||||
|
||||
// Library flags
|
||||
#define STENCIL_ORDER (6)
|
||||
|
@@ -26,8 +26,8 @@
|
||||
*/
|
||||
#include "astaroth_device.h"
|
||||
|
||||
#include "math_utils.h"
|
||||
#include "errchk.h"
|
||||
#include "math_utils.h"
|
||||
|
||||
#include "kernels/common.cuh"
|
||||
|
||||
@@ -105,8 +105,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
|
||||
}
|
||||
|
||||
// Reductions
|
||||
ERRCHK_CUDA_ALWAYS(
|
||||
cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
|
||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_scratchpad,
|
||||
acVertexBufferCompdomainSizeBytes(device_config)));
|
||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
|
||||
|
||||
#if AC_MPI_ENABLED
|
||||
@@ -242,7 +242,7 @@ acDeviceAutoOptimize(const Device device)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
|
||||
const int3 end = (int3){device->local_config.int_params[AC_mx], //
|
||||
const int3 end = (int3){device->local_config.int_params[AC_mx], //
|
||||
device->local_config.int_params[AC_my], //
|
||||
device->local_config.int_params[AC_mz]};
|
||||
return acKernelAutoOptimizeIntegration(start, end, device->vba);
|
||||
@@ -528,7 +528,8 @@ acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
|
||||
const int3 end)
|
||||
{
|
||||
cudaSetDevice(device->id);
|
||||
return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]);
|
||||
return acKernelPeriodicBoundconds(device->streams[stream], start, end,
|
||||
device->vba.in[vtxbuf_handle]);
|
||||
}
|
||||
|
||||
AcResult
|
||||
@@ -555,8 +556,9 @@ acDeviceReduceScal(const Device device, const Stream stream, const ReductionType
|
||||
device->local_config.int_params[AC_ny_max],
|
||||
device->local_config.int_params[AC_nz_max]};
|
||||
|
||||
*result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
|
||||
device->reduce_scratchpad, device->reduce_result);
|
||||
*result = acKernelReduceScal(device->streams[stream], rtype, start, end,
|
||||
device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
|
||||
device->reduce_result);
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -576,8 +578,8 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
|
||||
device->local_config.int_params[AC_nz_max]};
|
||||
|
||||
*result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
|
||||
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
|
||||
device->reduce_scratchpad, device->reduce_result);
|
||||
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
|
||||
device->reduce_scratchpad, device->reduce_result);
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
|
@@ -76,7 +76,8 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
||||
}
|
||||
|
||||
AcResult
|
||||
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
|
||||
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
|
||||
AcReal* vtxbuf)
|
||||
{
|
||||
const dim3 tpb(8, 2, 8);
|
||||
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
|
||||
|
@@ -28,5 +28,5 @@
|
||||
|
||||
#include "astaroth.h"
|
||||
|
||||
AcResult
|
||||
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf);
|
||||
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
|
||||
AcReal* vtxbuf);
|
||||
|
@@ -72,9 +72,6 @@ DCONST(const VertexBufferHandle handle)
|
||||
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
|
||||
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
|
||||
|
||||
|
||||
|
||||
|
||||
static __device__ constexpr int
|
||||
IDX(const int i)
|
||||
{
|
||||
@@ -93,19 +90,9 @@ IDX(const int3 idx)
|
||||
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//#include <thrust/complex.h>
|
||||
// using namespace thrust;
|
||||
|
||||
|
||||
|
||||
|
||||
#include <cuComplex.h>
|
||||
#if AC_DOUBLE_PRECISION == 1
|
||||
typedef cuDoubleComplex acComplex;
|
||||
|
@@ -128,11 +128,11 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
|
||||
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
|
||||
return; \
|
||||
\
|
||||
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
|
||||
vertexIdx.z < DCONST(AC_nz_max)); \
|
||||
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
|
||||
vertexIdx.z < DCONST(AC_nz_max)); \
|
||||
\
|
||||
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
|
||||
vertexIdx.z >= DCONST(AC_nz_min)); \
|
||||
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
|
||||
vertexIdx.z >= DCONST(AC_nz_min)); \
|
||||
\
|
||||
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
|
||||
|
||||
@@ -293,4 +293,3 @@ acKernelDummy(void)
|
||||
ERRCHK_CUDA_KERNEL_ALWAYS();
|
||||
return AC_SUCCESS;
|
||||
}
|
||||
|
||||
|
@@ -26,11 +26,9 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
AcResult
|
||||
acKernelDummy(void);
|
||||
AcResult acKernelDummy(void);
|
||||
|
||||
AcResult
|
||||
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
|
||||
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
|
||||
|
||||
AcResult
|
||||
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba);
|
||||
AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
|
||||
const int3 start, const int3 end, VertexBufferArray vba);
|
||||
|
@@ -174,7 +174,7 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
|
||||
|
||||
AcReal
|
||||
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
@@ -227,9 +227,9 @@ acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const i
|
||||
}
|
||||
|
||||
AcReal
|
||||
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
|
||||
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
|
||||
AcReal* reduce_result)
|
||||
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||
const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
||||
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
|
||||
{
|
||||
const unsigned nx = end.x - start.x;
|
||||
const unsigned ny = end.y - start.y;
|
||||
|
@@ -27,11 +27,10 @@
|
||||
#pragma once
|
||||
#include <astaroth.h>
|
||||
|
||||
AcReal
|
||||
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result);
|
||||
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad,
|
||||
AcReal* reduce_result);
|
||||
|
||||
AcReal
|
||||
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
|
||||
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
|
||||
AcReal* reduce_result);
|
||||
AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||
const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
||||
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
|
||||
|
@@ -547,13 +547,14 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
|
||||
for (int i = 0; i < node->num_devices; ++i) {
|
||||
// OLD: ambiguous behaviour, transferred also halos between devices and assumed
|
||||
// that halos are in sync
|
||||
//const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
|
||||
//const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
|
||||
// const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
|
||||
// const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
|
||||
|
||||
// New: Transfer ghost zones, but do not transfer overlapping halos.
|
||||
// DECOMPOSITION OFFSET HERE (d0 & d1)
|
||||
int3 d0 = (int3){0, 0, NGHOST + i * node->subgrid.n.z};
|
||||
int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, NGHOST + (i + 1) * node->subgrid.n.z};
|
||||
int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y,
|
||||
NGHOST + (i + 1) * node->subgrid.n.z};
|
||||
if (i == 0)
|
||||
d0.z = 0;
|
||||
if (i == node->num_devices - 1)
|
||||
|
Reference in New Issue
Block a user