From 10a98b01a93c8e43475b57b6fe5f351e7ebc0ab8 Mon Sep 17 00:00:00 2001
From: jpekkila <johannes.pekkila@aalto.fi>
Date: Tue, 9 Jul 2019 14:46:24 +0300
Subject: [PATCH] Experimental change: now the integration function is
 automatically optimized during acInit

---
 include/astaroth.h          |  5 ++-
 src/core/device.cu          | 77 ++++++++++++++++++++++++++++++++++++-
 src/standalone/benchmark.cc |  3 ++
 3 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/include/astaroth.h b/include/astaroth.h
index 01d7558..9c91448 100644
--- a/include/astaroth.h
+++ b/include/astaroth.h
@@ -55,7 +55,6 @@ extern "C" {
 #define REGISTERS_PER_THREAD (255)
 #define MAX_REGISTERS_PER_BLOCK (65536)
 #define MAX_THREADS_PER_BLOCK (1024)
-#define MAX_TB_DIM (MAX_THREADS_PER_BLOCK)
 #define NUM_ITERATIONS (10)
 #define WARP_SIZE (32)
 /*
@@ -275,6 +274,10 @@ AcResult acSynchronize(void);
 /** Loads a parameter to the constant memory of all devices */
 AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value);
 
+/** Auto-optimizes the library. This function is free from side-effects: the input vertex buffer is
+ * guaranteed not be modified.*/
+AcResult acAutoOptimize(void);
+
 /* End extern "C" */
 #ifdef __cplusplus
 }
diff --git a/src/core/device.cu b/src/core/device.cu
index cb2c732..78b2154 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -42,6 +42,8 @@ __constant__ Grid globalGrid;
 #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
 #include "kernels/kernels.cuh"
 
+static dim3 rk3_tpb = (dim3){32, 1, 4};
+
 #if PACKED_DATA_TRANSFERS // Defined in device.cuh
 // #include "kernels/pack_unpack.cuh"
 #endif
@@ -183,6 +185,11 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle
 
     printf("Created device %d (%p)\n", device->id, device);
     *device_handle = device;
+
+    // Autoptimize
+    if (id == 0)
+        autoOptimize(device);
+
     return AC_SUCCESS;
 }
 
@@ -271,7 +278,8 @@ rkStep(const Device device, const StreamType stream_type, const int step_number,
 {
     cudaSetDevice(device->id);
 
-    const dim3 tpb(32, 1, 4);
+    // const dim3 tpb(32, 1, 4);
+    const dim3 tpb = rk3_tpb;
 
     const int3 n = end - start;
     const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
@@ -416,6 +424,73 @@ autoOptimize(const Device device)
 {
     cudaSetDevice(device->id);
 
+    // RK3
+    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
+    const int3 end   = start + (int3){device->local_config.int_params[AC_nx], //
+                                    device->local_config.int_params[AC_ny], //
+                                    device->local_config.int_params[AC_nz]};
+
+    dim3 best_dims(0, 0, 0);
+    float best_time = INFINITY;
+
+    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
+        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
+            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
+
+                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
+                    break;
+                if (x * y * z > MAX_THREADS_PER_BLOCK)
+                    break;
+
+                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
+                    break;
+
+                if (((x * y * z) % WARP_SIZE) != 0)
+                    continue;
+
+                const dim3 tpb(x, y, z);
+                const int3 n = end - start;
+                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                               (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+                cudaDeviceSynchronize();
+                if (cudaGetLastError() != cudaSuccess) // resets the error if any
+                    continue;
+
+                printf("(%d, %d, %d)\n", x, y, z);
+
+                cudaEvent_t tstart, tstop;
+                cudaEventCreate(&tstart);
+                cudaEventCreate(&tstop);
+
+                cudaEventRecord(tstart); // ---------------------------------------- Timing start
+
+                for (int i = 0; i < NUM_ITERATIONS; ++i)
+                    solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
+
+                cudaEventRecord(tstop); // ----------------------------------------- Timing end
+                cudaEventSynchronize(tstop);
+                float milliseconds = 0;
+                cudaEventElapsedTime(&milliseconds, tstart, tstop);
+
+                ERRCHK_CUDA_KERNEL_ALWAYS();
+                if (milliseconds < best_time) {
+                    best_time = milliseconds;
+                    best_dims = tpb;
+                }
+            }
+        }
+    }
+
+    printf("Best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z,
+           double(best_time) / NUM_ITERATIONS);
+    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
+    ERRCHK(fp);
+    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
+    fclose(fp);
+
+    rk3_tpb = best_dims;
     return AC_SUCCESS;
 }
 
diff --git a/src/standalone/benchmark.cc b/src/standalone/benchmark.cc
index 468ef29..03acd63 100644
--- a/src/standalone/benchmark.cc
+++ b/src/standalone/benchmark.cc
@@ -116,6 +116,9 @@ run_benchmark(void)
         std::vector<double> results;
         results.reserve(NUM_ITERS);
 
+        // Optimize
+        // acAutoOptimize();
+
         // Warmup
         for (int i = 0; i < 10; ++i) {
             acIntegrate(0);