Reformatted
This commit is contained in:
@@ -26,7 +26,6 @@ extern "C" {
|
|||||||
#include <stdlib.h> // size_t
|
#include <stdlib.h> // size_t
|
||||||
//#include <vector_types.h> // CUDA vector types (float4, etc)
|
//#include <vector_types.h> // CUDA vector types (float4, etc)
|
||||||
|
|
||||||
//#ifndef __CUDACC__
|
|
||||||
#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
|
#if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
|
||||||
#include <cuda_runtime_api.h>
|
#include <cuda_runtime_api.h>
|
||||||
#else
|
#else
|
||||||
@@ -46,7 +45,6 @@ typedef struct {
|
|||||||
double x, y, z;
|
double x, y, z;
|
||||||
} double3;
|
} double3;
|
||||||
#endif
|
#endif
|
||||||
//#endif // __CUDACC__
|
|
||||||
|
|
||||||
// Library flags
|
// Library flags
|
||||||
#define STENCIL_ORDER (6)
|
#define STENCIL_ORDER (6)
|
||||||
|
@@ -26,8 +26,8 @@
|
|||||||
*/
|
*/
|
||||||
#include "astaroth_device.h"
|
#include "astaroth_device.h"
|
||||||
|
|
||||||
#include "math_utils.h"
|
|
||||||
#include "errchk.h"
|
#include "errchk.h"
|
||||||
|
#include "math_utils.h"
|
||||||
|
|
||||||
#include "kernels/common.cuh"
|
#include "kernels/common.cuh"
|
||||||
|
|
||||||
@@ -105,8 +105,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Reductions
|
// Reductions
|
||||||
ERRCHK_CUDA_ALWAYS(
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_scratchpad,
|
||||||
cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
|
acVertexBufferCompdomainSizeBytes(device_config)));
|
||||||
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
|
ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
|
||||||
|
|
||||||
#if AC_MPI_ENABLED
|
#if AC_MPI_ENABLED
|
||||||
@@ -528,7 +528,8 @@ acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
|
|||||||
const int3 end)
|
const int3 end)
|
||||||
{
|
{
|
||||||
cudaSetDevice(device->id);
|
cudaSetDevice(device->id);
|
||||||
return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]);
|
return acKernelPeriodicBoundconds(device->streams[stream], start, end,
|
||||||
|
device->vba.in[vtxbuf_handle]);
|
||||||
}
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
@@ -555,8 +556,9 @@ acDeviceReduceScal(const Device device, const Stream stream, const ReductionType
|
|||||||
device->local_config.int_params[AC_ny_max],
|
device->local_config.int_params[AC_ny_max],
|
||||||
device->local_config.int_params[AC_nz_max]};
|
device->local_config.int_params[AC_nz_max]};
|
||||||
|
|
||||||
*result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
|
*result = acKernelReduceScal(device->streams[stream], rtype, start, end,
|
||||||
device->reduce_scratchpad, device->reduce_result);
|
device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
|
||||||
|
device->reduce_result);
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -76,7 +76,8 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
|
|||||||
}
|
}
|
||||||
|
|
||||||
AcResult
|
AcResult
|
||||||
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
|
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
|
||||||
|
AcReal* vtxbuf)
|
||||||
{
|
{
|
||||||
const dim3 tpb(8, 2, 8);
|
const dim3 tpb(8, 2, 8);
|
||||||
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
|
const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
|
||||||
|
@@ -28,5 +28,5 @@
|
|||||||
|
|
||||||
#include "astaroth.h"
|
#include "astaroth.h"
|
||||||
|
|
||||||
AcResult
|
AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
|
||||||
acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf);
|
AcReal* vtxbuf);
|
||||||
|
@@ -72,9 +72,6 @@ DCONST(const VertexBufferHandle handle)
|
|||||||
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
|
#define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
|
||||||
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
|
//#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static __device__ constexpr int
|
static __device__ constexpr int
|
||||||
IDX(const int i)
|
IDX(const int i)
|
||||||
{
|
{
|
||||||
@@ -93,19 +90,9 @@ IDX(const int3 idx)
|
|||||||
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
|
return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//#include <thrust/complex.h>
|
//#include <thrust/complex.h>
|
||||||
// using namespace thrust;
|
// using namespace thrust;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#include <cuComplex.h>
|
#include <cuComplex.h>
|
||||||
#if AC_DOUBLE_PRECISION == 1
|
#if AC_DOUBLE_PRECISION == 1
|
||||||
typedef cuDoubleComplex acComplex;
|
typedef cuDoubleComplex acComplex;
|
||||||
|
@@ -293,4 +293,3 @@ acKernelDummy(void)
|
|||||||
ERRCHK_CUDA_KERNEL_ALWAYS();
|
ERRCHK_CUDA_KERNEL_ALWAYS();
|
||||||
return AC_SUCCESS;
|
return AC_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -26,11 +26,9 @@
|
|||||||
*/
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
AcResult
|
AcResult acKernelDummy(void);
|
||||||
acKernelDummy(void);
|
|
||||||
|
|
||||||
AcResult
|
AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
|
||||||
acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
|
|
||||||
|
|
||||||
AcResult
|
AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
|
||||||
acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba);
|
const int3 start, const int3 end, VertexBufferArray vba);
|
||||||
|
@@ -227,9 +227,9 @@ acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const i
|
|||||||
}
|
}
|
||||||
|
|
||||||
AcReal
|
AcReal
|
||||||
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
|
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||||
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
|
const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
||||||
AcReal* reduce_result)
|
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
|
||||||
{
|
{
|
||||||
const unsigned nx = end.x - start.x;
|
const unsigned nx = end.x - start.x;
|
||||||
const unsigned ny = end.y - start.y;
|
const unsigned ny = end.y - start.y;
|
||||||
|
@@ -27,11 +27,10 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <astaroth.h>
|
#include <astaroth.h>
|
||||||
|
|
||||||
AcReal
|
AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||||
acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad,
|
||||||
const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result);
|
|
||||||
|
|
||||||
AcReal
|
|
||||||
acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
|
|
||||||
const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
|
|
||||||
AcReal* reduce_result);
|
AcReal* reduce_result);
|
||||||
|
|
||||||
|
AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
|
||||||
|
const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
|
||||||
|
const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
|
||||||
|
@@ -553,7 +553,8 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
|
|||||||
// New: Transfer ghost zones, but do not transfer overlapping halos.
|
// New: Transfer ghost zones, but do not transfer overlapping halos.
|
||||||
// DECOMPOSITION OFFSET HERE (d0 & d1)
|
// DECOMPOSITION OFFSET HERE (d0 & d1)
|
||||||
int3 d0 = (int3){0, 0, NGHOST + i * node->subgrid.n.z};
|
int3 d0 = (int3){0, 0, NGHOST + i * node->subgrid.n.z};
|
||||||
int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, NGHOST + (i + 1) * node->subgrid.n.z};
|
int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y,
|
||||||
|
NGHOST + (i + 1) * node->subgrid.n.z};
|
||||||
if (i == 0)
|
if (i == 0)
|
||||||
d0.z = 0;
|
d0.z = 0;
|
||||||
if (i == node->num_devices - 1)
|
if (i == node->num_devices - 1)
|
||||||
|
Reference in New Issue
Block a user