Reformatted

2019-12-03 15:14:26 +02:00
parent f14e35620c
commit 5a6a3110df
10 changed files with 37 additions and 52 deletions
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -26,7 +26,6 @@ extern "C" {
 #include <stdlib.h> // size_t
 //#include <vector_types.h> // CUDA vector types (float4, etc)
 //#ifndef __CUDACC__
 #if defined(AC_USE_CUDA_RUNTIME_API) || defined(__CUDACC__)
 #include <cuda_runtime_api.h>
 #else
@@ -46,7 +45,6 @@ typedef struct {
    double x, y, z;
 } double3;
 #endif
 //#endif // __CUDACC__
 // Library flags
 #define STENCIL_ORDER (6)
--- a/src/core/device.cc
+++ b/src/core/device.cc
@@ -26,8 +26,8 @@
 */
 #include "astaroth_device.h"
 #include "math_utils.h"
 #include "errchk.h"
 #include "math_utils.h"
 #include "kernels/common.cuh"
@@ -105,8 +105,8 @@ acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_hand
    }
    // Reductions
-    ERRCHK_CUDA_ALWAYS(
+    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_scratchpad,
-        cudaMalloc((void**)&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
+                                  acVertexBufferCompdomainSizeBytes(device_config)));
    ERRCHK_CUDA_ALWAYS(cudaMalloc((void**)&device->reduce_result, sizeof(AcReal)));
 #if AC_MPI_ENABLED
@@ -242,7 +242,7 @@ acDeviceAutoOptimize(const Device device)
 {
    cudaSetDevice(device->id);
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
-    const int3 end = (int3){device->local_config.int_params[AC_mx], //
+    const int3 end   = (int3){device->local_config.int_params[AC_mx], //
                            device->local_config.int_params[AC_my], //
                            device->local_config.int_params[AC_mz]};
    return acKernelAutoOptimizeIntegration(start, end, device->vba);
@@ -528,7 +528,8 @@ acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
                              const int3 end)
 {
    cudaSetDevice(device->id);
-    return acKernelPeriodicBoundconds(device->streams[stream], start, end, device->vba.in[vtxbuf_handle]);
+    return acKernelPeriodicBoundconds(device->streams[stream], start, end,
                                      device->vba.in[vtxbuf_handle]);
 }
 AcResult
@@ -555,8 +556,9 @@ acDeviceReduceScal(const Device device, const Stream stream, const ReductionType
                            device->local_config.int_params[AC_ny_max],
                            device->local_config.int_params[AC_nz_max]};
-    *result = acKernelReduceScal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
+    *result = acKernelReduceScal(device->streams[stream], rtype, start, end,
-                          device->reduce_scratchpad, device->reduce_result);
+                                 device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
                                 device->reduce_result);
    return AC_SUCCESS;
 }
@@ -576,8 +578,8 @@ acDeviceReduceVec(const Device device, const Stream stream, const ReductionType
                            device->local_config.int_params[AC_nz_max]};
    *result = acKernelReduceVec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
-                         device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
+                                device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
-                         device->reduce_scratchpad, device->reduce_result);
+                                device->reduce_scratchpad, device->reduce_result);
    return AC_SUCCESS;
 }
--- a/src/core/kernels/boundconds.cu
+++ b/src/core/kernels/boundconds.cu
@@ -76,7 +76,8 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
 }
 AcResult
-acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
+acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
                           AcReal* vtxbuf)
 {
    const dim3 tpb(8, 2, 8);
    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
--- a/src/core/kernels/boundconds.cuh
+++ b/src/core/kernels/boundconds.cuh
@@ -28,5 +28,5 @@
 #include "astaroth.h"
-AcResult
+AcResult acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end,
-acKernelPeriodicBoundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf);
+                                    AcReal* vtxbuf);
--- a/src/core/kernels/common.cuh
+++ b/src/core/kernels/common.cuh
@@ -72,9 +72,6 @@ DCONST(const VertexBufferHandle handle)
 #define d_multigpu_offset (d_mesh_info.int3_params[AC_multigpu_offset])
 //#define d_multinode_offset (d_mesh_info.int3_params[AC_multinode_offset]) // Placeholder
 static __device__ constexpr int
 IDX(const int i)
 {
@@ -93,19 +90,9 @@ IDX(const int3 idx)
    return DEVICE_VTXBUF_IDX(idx.x, idx.y, idx.z);
 }
 //#include <thrust/complex.h>
 // using namespace thrust;
 #include <cuComplex.h>
 #if AC_DOUBLE_PRECISION == 1
 typedef cuDoubleComplex acComplex;
--- a/src/core/kernels/integration.cu
+++ b/src/core/kernels/integration.cu
@@ -128,11 +128,11 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
    if (vertexIdx.x >= end.x || vertexIdx.y >= end.y || vertexIdx.z >= end.z)                      \
        return;                                                                                    \
                                                                                                   \
-    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&           \
+    assert(vertexIdx.x < DCONST(AC_nx_max) && vertexIdx.y < DCONST(AC_ny_max) &&                   \
-           vertexIdx.z < DCONST(AC_nz_max));                                                   \
+           vertexIdx.z < DCONST(AC_nz_max));                                                       \
                                                                                                   \
-    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&         \
+    assert(vertexIdx.x >= DCONST(AC_nx_min) && vertexIdx.y >= DCONST(AC_ny_min) &&                 \
-           vertexIdx.z >= DCONST(AC_nz_min));                                                  \
+           vertexIdx.z >= DCONST(AC_nz_min));                                                      \
                                                                                                   \
    const int idx = IDX(vertexIdx.x, vertexIdx.y, vertexIdx.z);
@@ -293,4 +293,3 @@ acKernelDummy(void)
    ERRCHK_CUDA_KERNEL_ALWAYS();
    return AC_SUCCESS;
 }
--- a/src/core/kernels/integration.cuh
+++ b/src/core/kernels/integration.cuh
@@ -26,11 +26,9 @@
 */
 #pragma once
-AcResult
+AcResult acKernelDummy(void);
 acKernelDummy(void);
-AcResult
+AcResult acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
 acKernelAutoOptimizeIntegration(const int3 start, const int3 end, VertexBufferArray vba);
-AcResult
+AcResult acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number,
-acKernelIntegrateSubstep(const cudaStream_t stream, const int step_number, const int3 start, const int3 end, VertexBufferArray vba);
+                                  const int3 start, const int3 end, VertexBufferArray vba);
--- a/src/core/kernels/reductions.cu
+++ b/src/core/kernels/reductions.cu
@@ -174,7 +174,7 @@ kernel_reduce_block(const __restrict__ AcReal* scratchpad, const int num_blocks,
 AcReal
 acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
-            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
+                   const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result)
 {
    const unsigned nx        = end.x - start.x;
    const unsigned ny        = end.y - start.y;
@@ -227,9 +227,9 @@ acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const i
 }
 AcReal
-acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
+acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
-           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
+                  const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
-           AcReal* reduce_result)
+                  const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result)
 {
    const unsigned nx        = end.x - start.x;
    const unsigned ny        = end.y - start.y;
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -27,11 +27,10 @@
 #pragma once
 #include <astaroth.h>
-AcReal
+AcReal acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
-acKernelReduceScal(const cudaStream_t stream, const ReductionType rtype, const int3& start,
+                          const int3& end, const AcReal* vtxbuf, AcReal* scratchpad,
-            const int3& end, const AcReal* vtxbuf, AcReal* scratchpad, AcReal* reduce_result);
+                          AcReal* reduce_result);
-AcReal
+AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start,
-acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const int3& start, const int3& end,
+                         const int3& end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
-           const AcReal* vtxbuf0, const AcReal* vtxbuf1, const AcReal* vtxbuf2, AcReal* scratchpad,
+                         const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
           AcReal* reduce_result);
--- a/src/core/node.cc
+++ b/src/core/node.cc
@@ -547,13 +547,14 @@ acNodeStoreVertexBufferWithOffset(const Node node, const Stream stream,
    for (int i = 0; i < node->num_devices; ++i) {
        // OLD: ambiguous behaviour, transferred also halos between devices and assumed
        // that halos are in sync
-        //const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
+        // const int3 d0 = (int3){0, 0, i * node->subgrid.n.z}; // DECOMPOSITION OFFSET HERE
-        //const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
+        // const int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, d0.z + node->subgrid.m.z};
        // New: Transfer ghost zones, but do not transfer overlapping halos.
        // DECOMPOSITION OFFSET HERE (d0 & d1)
        int3 d0 = (int3){0, 0, NGHOST + i * node->subgrid.n.z};
-        int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y, NGHOST + (i + 1) * node->subgrid.n.z};
+        int3 d1 = (int3){node->subgrid.m.x, node->subgrid.m.y,
                         NGHOST + (i + 1) * node->subgrid.n.z};
        if (i == 0)
            d0.z = 0;
        if (i == node->num_devices - 1)