Reformatted

This commit is contained in:
jpekkila
2019-12-03 15:14:26 +02:00
parent f14e35620c
commit 5a6a3110df
10 changed files with 37 additions and 52 deletions

View File

@@ -26,7 +26,6 @@ extern "C" {
#include <stdlib.h> // size_t
//#include <vector_types.h> // CUDA vector types (float4, etc)
//#ifndef __CUDACC__
#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
#include <cuda_runtime_api.h>
#else
@@ -46,7 +45,6 @@ typedef struct {
double x, y, z;
} double3;
#endif
//#endif // __CUDACC__
// Library flags
#define STENCIL_ORDER (6)

View File

@@ -26,8 +26,8 @@
*/
#include "astaroth_device.h"
#include "math_utils.h"
#include "errchk.h"
#include "math_utils.h"
#include "kernels/common.cuh"
@@ -105,8 +105,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
}
// Reductions
ERRCHK_CUDA_ALWAYS(
cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_scratchpad,
acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
#if AC_MPI_ENABLED
@@ -242,7 +242,7 @@ acDeviceAutoOptimize(const Device device)
{
cudaSetDevice(device->id);
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = (int3){device->local_config.int_params[AC_mx], //
const int3 end = (int3){device->local_config.int_params[AC_mx], //
device->local_config.int_params[AC_my], //
device->local_config.int_params[AC_mz]};
return acKernelAutoOptimizeIntegration(start, end, device->vba);
@@ -528,7 +528,8 @@ acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
const int3 end)
{
cudaSetDevice(device->id);
return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]);
return acKernelPeriodicBoundconds(device->streams[stream], start, end,
device->vba.in[vtxbuf_handle]);
}
AcResult
@@ -555,8 +556,9 @@ acDeviceReduceScal(const Device device, const Stream stream, const ReductionType
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
device->reduce_scratchpad, device->reduce_result);
*result = acKernelReduceScal(device->streams[stream], rtype, start, end,
device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
device->reduce_result);
return AC_SUCCESS;
}
@@ -576,8 +578,8 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
device->local_config.int_params[AC_nz_max]};
*result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}

View File

@@ -76,7 +76,8 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
}
AcResult
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
AcReal* vtxbuf)
{
const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),

View File

@@ -28,5 +28,5 @@
#include "astaroth.h"
AcResult
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf);
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
AcReal* vtxbuf);

View File

@@ -72,9 +72,6 @@ DCONST(const VertexBufferHandle handle)
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
static __device__ constexpr int
IDX(const int i)
{
@@ -93,19 +90,9 @@ IDX(const int3 idx)
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
}
//#include <thrust/complex.h>
// using namespace thrust;
#include <cuComplex.h>
#if AC_DOUBLE_PRECISION == 1
typedef cuDoubleComplex acComplex;

View File

@@ -128,11 +128,11 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
return; \
\
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
vertexIdx.z < DCONST(AC_nz_max)); \
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
vertexIdx.z < DCONST(AC_nz_max)); \
\
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
vertexIdx.z >= DCONST(AC_nz_min)); \
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
vertexIdx.z >= DCONST(AC_nz_min)); \
\
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
@@ -293,4 +293,3 @@ acKernelDummy(void)
ERRCHK_CUDA_KERNEL_ALWAYS();
return AC_SUCCESS;
}

View File

@@ -26,11 +26,9 @@
*/
#pragma once
AcResult
acKernelDummy(void);
AcResult acKernelDummy(void);
AcResult
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
AcResult
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba);
AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
const int3 start, const int3 end, VertexBufferArray vba);

View File

@@ -174,7 +174,7 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
AcReal
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{
const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y;
@@ -227,9 +227,9 @@ acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const i
}
AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
AcReal* reduce_result)
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
{
const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y;

View File

@@ -27,11 +27,10 @@
#pragma once
#include <astaroth.h>
AcReal
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result);
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad,
AcReal* reduce_result);
AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
AcReal* reduce_result);
AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);

View File

@@ -547,13 +547,14 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
for (int i = 0; i < node->num_devices; ++i) {
// OLD: ambiguous behaviour, transferred also halos between devices and assumed
// that halos are in sync
//const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
//const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
// const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
// const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
// New: Transfer ghost zones, but do not transfer overlapping halos.
// DECOMPOSITION OFFSET HERE (d0 & d1)
int3 d0 = (int3){0, 0, NGHOST + i * node->subgrid.n.z};
int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, NGHOST + (i + 1) * node->subgrid.n.z};
int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y,
NGHOST + (i + 1) * node->subgrid.n.z};
if (i == 0)
d0.z = 0;
if (i == node->num_devices - 1)