Added the function acAutoOptimize to the interface and removed rk3_step_async from kernels.cuh (its body was moved into rkStep in device.cu)

2019-07-09 14:21:22 +03:00
parent 84d96de42b
commit a086821e7c
4 changed files with 38 additions and 29 deletions
--- a/src/core/astaroth.cu
+++ b/src/core/astaroth.cu
@@ -610,3 +610,9 @@ acLoadDeviceConstant(const AcRealParam param, const AcReal value)
    }
    return AC_SUCCESS;
 }
+
+AcResult
+acAutoOptimize(void)
+{
+    return autoOptimize(devices[0]);
+}
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -270,7 +270,23 @@ rkStep(const Device device, const StreamType stream_type, const int step_number,
       const int3& end, const AcReal dt)
 {
    cudaSetDevice(device->id);
-    rk3_step_async(device->streams[stream_type], step_number, start, end, dt, &device->vba);
+
+    const dim3 tpb(32, 1, 4);
+
+    const int3 n = end - start;
+    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+    if (step_number == 0)
+        solve<0><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
+    else if (step_number == 1)
+        solve<1><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
+    else
+        solve<2><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
+
+    ERRCHK_CUDA_KERNEL();
+
    return AC_SUCCESS;
 }

@@ -395,6 +411,14 @@ loadGlobalGrid(const Device device, const Grid grid)
    return AC_SUCCESS;
 }

+AcResult
+autoOptimize(const Device device)
+{
+    cudaSetDevice(device->id);
+
+    return AC_SUCCESS;
+}
+
 #if PACKED_DATA_TRANSFERS
 // Functions for calling packed data transfers
 #endif
--- a/src/core/device.cuh
+++ b/src/core/device.cuh
@@ -99,6 +99,9 @@ AcResult loadDeviceConstant(const Device device, const AcRealParam param, const
 /** */
 AcResult loadGlobalGrid(const Device device, const Grid grid);

+/** */
+AcResult autoOptimize(const Device device);
+
 // #define PACKED_DATA_TRANSFERS (1) %JP: placeholder for optimized ghost zone packing and transfers
 #if PACKED_DATA_TRANSFERS
 // Declarations used for packed data transfers
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -707,30 +707,6 @@ read_out(const int idx, AcReal* __restrict__ field[], const int3 handle)
 * =============================================================================
 */

-AcResult
-rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start,
-               const int3& end, const AcReal dt, VertexBufferArray* buffer)
-{
-    const dim3 tpb(32, 1, 4);
-
-    const int nx = end.x - start.x;
-    const int ny = end.y - start.y;
-    const int nz = end.z - start.z;
-
-    const dim3 bpg((unsigned int)ceil(nx / AcReal(tpb.x)), (unsigned int)ceil(ny / AcReal(tpb.y)),
-                   (unsigned int)ceil(nz / AcReal(tpb.z)));
-
-    if (step_number == 0)
-        solve<0><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
-    else if (step_number == 1)
-        solve<1><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
-    else
-        solve<2><<<bpg, tpb, 0, stream>>>(start, end, *buffer, dt);
-
-    ERRCHK_CUDA_KERNEL();
-    return AC_SUCCESS;
-}
-
 ////////////////REDUCE///////////////////////////
 #include "src/core/math_utils.h" // is_power_of_two