Reformatted

This commit is contained in:
jpekkila
2019-12-03 15:14:26 +02:00
parent f14e35620c
commit 5a6a3110df
10 changed files with 37 additions and 52 deletions

View File

@@ -26,7 +26,6 @@ extern "C" {
#include <stdlib.h> // size_t #include <stdlib.h> // size_t
//#include <vector_types.h> // CUDA vector types (float4, etc) //#include <vector_types.h> // CUDA vector types (float4, etc)
//#ifndef __CUDACC__
#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__) #if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#else #else
@@ -46,7 +45,6 @@ typedef struct {
double x, y, z; double x, y, z;
} double3; } double3;
#endif #endif
//#endif // __CUDACC__
// Library flags // Library flags
#define STENCIL_ORDER (6) #define STENCIL_ORDER (6)

View File

@@ -26,8 +26,8 @@
*/ */
#include "astaroth_device.h" #include "astaroth_device.h"
#include "math_utils.h"
#include "errchk.h" #include "errchk.h"
#include "math_utils.h"
#include "kernels/common.cuh" #include "kernels/common.cuh"
@@ -105,8 +105,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
} }
// Reductions // Reductions
ERRCHK_CUDA_ALWAYS( ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_scratchpad,
cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config))); acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal))); ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
#if AC_MPI_ENABLED #if AC_MPI_ENABLED
@@ -242,7 +242,7 @@ acDeviceAutoOptimize(const Device device)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
const int3 start = (int3){NGHOST, NGHOST, NGHOST}; const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = (int3){device->local_config.int_params[AC_mx], // const int3 end = (int3){device->local_config.int_params[AC_mx], //
device->local_config.int_params[AC_my], // device->local_config.int_params[AC_my], //
device->local_config.int_params[AC_mz]}; device->local_config.int_params[AC_mz]};
return acKernelAutoOptimizeIntegration(start, end, device->vba); return acKernelAutoOptimizeIntegration(start, end, device->vba);
@@ -528,7 +528,8 @@ acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
const int3 end) const int3 end)
{ {
cudaSetDevice(device->id); cudaSetDevice(device->id);
return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]); return acKernelPeriodicBoundconds(device->streams[stream], start, end,
device->vba.in[vtxbuf_handle]);
} }
AcResult AcResult
@@ -555,8 +556,9 @@ acDeviceReduceScal(const Device device, const Stream stream, const ReductionType
device->local_config.int_params[AC_ny_max], device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]}; device->local_config.int_params[AC_nz_max]};
*result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle], *result = acKernelReduceScal(device->streams[stream], rtype, start, end,
device->reduce_scratchpad, device->reduce_result); device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
device->reduce_result);
return AC_SUCCESS; return AC_SUCCESS;
} }
@@ -576,8 +578,8 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
device->local_config.int_params[AC_nz_max]}; device->local_config.int_params[AC_nz_max]};
*result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0], *result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result); device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS; return AC_SUCCESS;
} }

View File

@@ -76,7 +76,8 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
} }
AcResult AcResult
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf) acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
AcReal* vtxbuf)
{ {
const dim3 tpb(8, 2, 8); const dim3 tpb(8, 2, 8);
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x), const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),

View File

@@ -28,5 +28,5 @@
#include "astaroth.h" #include "astaroth.h"
AcResult AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf); AcReal* vtxbuf);

View File

@@ -72,9 +72,6 @@ DCONST(const VertexBufferHandle handle)
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset]) #define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder //#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
static __device__ constexpr int static __device__ constexpr int
IDX(const int i) IDX(const int i)
{ {
@@ -93,19 +90,9 @@ IDX(const int3 idx)
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z); return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
} }
//#include <thrust/complex.h> //#include <thrust/complex.h>
// using namespace thrust; // using namespace thrust;
#include <cuComplex.h> #include <cuComplex.h>
#if AC_DOUBLE_PRECISION == 1 #if AC_DOUBLE_PRECISION == 1
typedef cuDoubleComplex acComplex; typedef cuDoubleComplex acComplex;

View File

@@ -128,11 +128,11 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \ if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z) \
return; \ return; \
\ \
assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \ assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) && \
vertexIdx.z < DCONST(AC_nz_max)); \ vertexIdx.z < DCONST(AC_nz_max)); \
\ \
assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \ assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) && \
vertexIdx.z >= DCONST(AC_nz_min)); \ vertexIdx.z >= DCONST(AC_nz_min)); \
\ \
const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z); const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
@@ -293,4 +293,3 @@ acKernelDummy(void)
ERRCHK_CUDA_KERNEL_ALWAYS(); ERRCHK_CUDA_KERNEL_ALWAYS();
return AC_SUCCESS; return AC_SUCCESS;
} }

View File

@@ -26,11 +26,9 @@
*/ */
#pragma once #pragma once
AcResult AcResult acKernelDummy(void);
acKernelDummy(void);
AcResult AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
AcResult AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba); const int3 start, const int3 end, VertexBufferArray vba);

View File

@@ -174,7 +174,7 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
AcReal AcReal
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start, acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result) const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
{ {
const unsigned nx = end.x - start.x; const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y; const unsigned ny = end.y - start.y;
@@ -227,9 +227,9 @@ acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const i
} }
AcReal AcReal
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end, acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad, const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
AcReal* reduce_result) const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
{ {
const unsigned nx = end.x - start.x; const unsigned nx = end.x - start.x;
const unsigned ny = end.y - start.y; const unsigned ny = end.y - start.y;

View File

@@ -27,11 +27,10 @@
#pragma once #pragma once
#include <astaroth.h> #include <astaroth.h>
AcReal AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end, const AcReal* vtxbuf, AcReal* scratchpad,
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result); AcReal* reduce_result);
AcReal AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end, const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad, const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
AcReal* reduce_result);

View File

@@ -547,13 +547,14 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
for (int i = 0; i < node->num_devices; ++i) { for (int i = 0; i < node->num_devices; ++i) {
// OLD: ambiguous behaviour, transferred also halos between devices and assumed // OLD: ambiguous behaviour, transferred also halos between devices and assumed
// that halos are in sync // that halos are in sync
//const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE // const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
//const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z}; // const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
// New: Transfer ghost zones, but do not transfer overlapping halos. // New: Transfer ghost zones, but do not transfer overlapping halos.
// DECOMPOSITION OFFSET HERE (d0 & d1) // DECOMPOSITION OFFSET HERE (d0 & d1)
int3 d0 = (int3){0, 0, NGHOST + i * node->subgrid.n.z}; int3 d0 = (int3){0, 0, NGHOST + i * node->subgrid.n.z};
int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, NGHOST + (i + 1) * node->subgrid.n.z}; int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y,
NGHOST + (i + 1) * node->subgrid.n.z};
if (i == 0) if (i == 0)
d0.z = 0; d0.z = 0;
if (i == node->num_devices - 1) if (i == node->num_devices - 1)