From 10a98b01a93c8e43475b57b6fe5f351e7ebc0ab8 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Tue, 9 Jul 2019 14:46:24 +0300 Subject: [PATCH] Experimental change: now the integration function is automatically optimized during acInit --- include/astaroth.h | 5 ++- src/core/device.cu | 77 ++++++++++++++++++++++++++++++++++++- src/standalone/benchmark.cc | 3 ++ 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/include/astaroth.h b/include/astaroth.h index 01d7558..9c91448 100644 --- a/include/astaroth.h +++ b/include/astaroth.h @@ -55,7 +55,6 @@ extern "C" { #define REGISTERS_PER_THREAD (255) #define MAX_REGISTERS_PER_BLOCK (65536) #define MAX_THREADS_PER_BLOCK (1024) -#define MAX_TB_DIM (MAX_THREADS_PER_BLOCK) #define NUM_ITERATIONS (10) #define WARP_SIZE (32) /* @@ -275,6 +274,10 @@ AcResult acSynchronize(void); /** Loads a parameter to the constant memory of all devices */ AcResult acLoadDeviceConstant(const AcRealParam param, const AcReal value); +/** Auto-optimizes the library. This function is free from side-effects: the input vertex buffer is + * guaranteed not be modified.*/ +AcResult acAutoOptimize(void); + /* End extern "C" */ #ifdef __cplusplus } diff --git a/src/core/device.cu b/src/core/device.cu index cb2c732..78b2154 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -42,6 +42,8 @@ __constant__ Grid globalGrid; #define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) #include "kernels/kernels.cuh" +static dim3 rk3_tpb = (dim3){32, 1, 4}; + #if PACKED_DATA_TRANSFERS // Defined in device.cuh // #include "kernels/pack_unpack.cuh" #endif @@ -183,6 +185,11 @@ createDevice(const int id, const AcMeshInfo device_config, Device* device_handle printf("Created device %d (%p)\n", device->id, device); *device_handle = device; + + // Autoptimize + if (id == 0) + autoOptimize(device); + return AC_SUCCESS; } @@ -271,7 +278,8 @@ rkStep(const Device device, const StreamType stream_type, const int step_number, { cudaSetDevice(device->id); - const dim3 tpb(32, 1, 4); + // const dim3 tpb(32, 1, 4); + const dim3 tpb = rk3_tpb; const int3 n = end - start; const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // @@ -416,6 +424,73 @@ autoOptimize(const Device device) { cudaSetDevice(device->id); + // RK3 + const int3 start = (int3){NGHOST, NGHOST, NGHOST}; + const int3 end = start + (int3){device->local_config.int_params[AC_nx], // + device->local_config.int_params[AC_ny], // + device->local_config.int_params[AC_nz]}; + + dim3 best_dims(0, 0, 0); + float best_time = INFINITY; + + for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) { + for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) { + for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) { + + if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z) + break; + if (x * y * z > MAX_THREADS_PER_BLOCK) + break; + + if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK) + break; + + if (((x * y * z) % WARP_SIZE) != 0) + continue; + + const dim3 tpb(x, y, z); + const int3 n = end - start; + const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // + (unsigned int)ceil(n.y / AcReal(tpb.y)), // + (unsigned int)ceil(n.z / AcReal(tpb.z))); + + cudaDeviceSynchronize(); + if (cudaGetLastError() != cudaSuccess) // resets the error if any + continue; + + printf("(%d, %d, %d)\n", x, y, z); + + cudaEvent_t tstart, tstop; + cudaEventCreate(&tstart); + cudaEventCreate(&tstop); + + cudaEventRecord(tstart); // ---------------------------------------- Timing start + + for (int i = 0; i < NUM_ITERATIONS; ++i) + solve<2><<>>(start, end, device->vba, FLT_EPSILON); + + cudaEventRecord(tstop); // ----------------------------------------- Timing end + cudaEventSynchronize(tstop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, tstart, tstop); + + ERRCHK_CUDA_KERNEL_ALWAYS(); + if (milliseconds < best_time) { + best_time = milliseconds; + best_dims = tpb; + } + } + } + } + + printf("Best dims (%d, %d, %d) %f ms\n", best_dims.x, best_dims.y, best_dims.z, + double(best_time) / NUM_ITERATIONS); + FILE* fp = fopen("../config/rk3_tbdims.cuh", "w"); + ERRCHK(fp); + fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z); + fclose(fp); + + rk3_tpb = best_dims; return AC_SUCCESS; } diff --git a/src/standalone/benchmark.cc b/src/standalone/benchmark.cc index 468ef29..03acd63 100644 --- a/src/standalone/benchmark.cc +++ b/src/standalone/benchmark.cc @@ -116,6 +116,9 @@ run_benchmark(void) std::vector results; results.reserve(NUM_ITERS); + // Optimize + // acAutoOptimize(); + // Warmup for (int i = 0; i < 10; ++i) { acIntegrate(0);