Various intermediate changes

jpekkila
2019-07-31 17:48:48 +03:00
parent 6b55fce54a
commit 5be775dbff
8 changed files with 43 additions and 1185 deletions

View File

@@ -103,21 +103,18 @@ AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stre
AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device);
/** */
-AcResult acDeviceIntegrateSubstep(const Device device, const StreamType stream_type,
-                                  const int step_number, const int3 start, const int3 end,
-                                  const AcReal dt);
+AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
+                                  const int3 start, const int3 end, const AcReal dt);
/** */
-AcResult acDevicePeriodicBoundcondStep(const Device device, const StreamType stream_type,
-                                       const int3 start, const int3 end);
+AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, const int3 start,
+                                       const int3 end);
/** */
-AcResult acDeviceReduceScal(const Device device, const StreamType stream_type,
-                            const ReductionType rtype, const VertexBufferHandle vtxbuf_handle,
-                            AcReal* result);
+AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
+                            const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
-AcResult acDeviceReduceVec(const Device device, const StreamType stream_type,
-                           const ReductionType rtype, const VertexBufferHandle vec0,
-                           const VertexBufferHandle vec1, const VertexBufferHandle vec2,
-                           AcReal* result);
+AcResult acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype,
+                           const VertexBufferHandle vec0, const VertexBufferHandle vec1,
+                           const VertexBufferHandle vec2, AcReal* result);
#ifdef __cplusplus
} // extern "C"
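
For reference, a minimal sketch of a call site under the renamed device-layer API (STREAM_DEFAULT and RTYPE_MAX appear elsewhere in this codebase; treating STREAM_DEFAULT as a valid Stream value and VTXBUF_LNRHO as an existing handle are assumptions):

    /* Sketch only: assumes STREAM_DEFAULT is a valid Stream and VTXBUF_LNRHO a valid handle. */
    AcReal rho_max;
    acDeviceReduceScal(device, STREAM_DEFAULT, RTYPE_MAX, VTXBUF_LNRHO, &rho_max);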

View File

@@ -89,16 +89,15 @@ AcResult acGridTransferVertexBuffer(const Stream stream, const VertexBufferHandl
AcResult acGridTransferMesh(const Stream stream);
/** */
-AcResult acGridIntegrateSubstep(const StreamType stream_type, const int step_number,
-                                const int3 start, const int3 end, const AcReal dt);
+AcResult acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start,
+                                const int3 end, const AcReal dt);
/** */
-AcResult acGridPeriodicBoundcondStep(const StreamType stream_type, const int3 start,
-                                     const int3 end);
+AcResult acGridPeriodicBoundcondStep(const Stream stream, const int3 start, const int3 end);
/** */
-AcResult acGridReduceScal(const StreamType stream_type, const ReductionType rtype,
-                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acGridReduceScal(const Stream stream, const ReductionType rtype,
+                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
-AcResult acGridReduceVec(const StreamType stream_type, const ReductionType rtype,
-                         const VertexBufferHandle vec0, const VertexBufferHandle vec1,
-                         const VertexBufferHandle vec2, AcReal* result);
+AcResult acGridReduceVec(const Stream stream, const ReductionType rtype,
+                         const VertexBufferHandle vec0, const VertexBufferHandle vec1,
+                         const VertexBufferHandle vec2, AcReal* result);
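
A corresponding sketch for the grid layer, under the same assumption about STREAM_DEFAULT (the velocity vertex-buffer handles are hypothetical placeholders):

    /* Sketch only: RTYPE_RMS is used elsewhere in this codebase; the handles are placeholders. */
    AcReal urms;
    acGridReduceVec(STREAM_DEFAULT, RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ, &urms);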

View File

@@ -104,17 +104,16 @@ AcResult acNodeTransferVertexBuffer(const Node src_node, const Stream stream,
AcResult acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_node);
/** */
-AcResult acNodeIntegrateSubstep(const Node node, const StreamType stream_type,
-                                const int step_number, const int3 start, const int3 end,
-                                const AcReal dt);
+AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number,
+                                const int3 start, const int3 end, const AcReal dt);
/** */
-AcResult acNodePeriodicBoundcondStep(const Node node, const StreamType stream_type,
-                                     const int3 start, const int3 end);
+AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const int3 start,
+                                     const int3 end);
/** */
-AcResult acNodeReduceScal(const Node node, const StreamType stream_type, const ReductionType rtype,
-                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
+AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype,
+                          const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
-AcResult acNodeReduceVec(const Node node, const StreamType stream_type, const ReductionType rtype,
-                         const VertexBufferHandle vec0, const VertexBufferHandle vec1,
-                         const VertexBufferHandle vec2, AcReal* result);
+AcResult acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype,
+                         const VertexBufferHandle vec0, const VertexBufferHandle vec1,
+                         const VertexBufferHandle vec2, AcReal* result);

View File

@@ -3,7 +3,7 @@
########################################
## Find packages
-find_package(CUDA 9 REQUIRED)
+find_package(CUDA REQUIRED)
## Architecture and optimization flags
set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37

View File

@@ -16,660 +16,4 @@
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#include "astaroth_defines.h"
/**
* @file
* \brief Multi-GPU implementation.
*
%JP: The old way for computing boundary conditions conflicts with the
way we have to do things with multiple GPUs.
The older approach relied on unified memory, which represented the whole
memory area as one huge mesh instead of several smaller ones. However, unified memory
in its current state is meant more for quick prototyping when performance is not an issue.
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult
than managing the memory explicitly.
In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids,
respectively. To simplify things, we require that each GPU is assigned the same amount of
work; therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to
work with.
The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices.
A local index (i, j, k) in some subgrid can be mapped to the global grid with
global idx = (i, j, k + device_id * subgrid.n.z)
Terminology:
- Single-GPU function: a function defined on the single-GPU layer (device.cu)
Changes required to this commented code block:
- The thread block dimensions (tpb) are no longer passed to the kernel here but in
device.cu instead. Same holds for any complex index calculations. Instead, the local
coordinates should be passed as an int3 type without having to consider how the data is
actually laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque
handle of type "Device" which should be passed to single-GPU functions. In this file, all
devices are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we
can optimize Astaroth by executing memory transactions concurrently with
computation. Therefore a StreamType should be passed as a parameter to single-GPU functions.
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
as a parameter and commands executing in different streams can be processed
in parallel/concurrently.
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
With multiple GPUs, periodic boundary conditions applied on indices ranging from
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z -
STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without
having to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use
here, we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
### Throughout this file we use the following notation and names for various index offsets
Global coordinates: coordinates with respect to the global grid (static Grid grid)
Local coordinates: coordinates with respect to the local subgrid (static Grid subgrid)
s0, s1: source indices in global coordinates
d0, d1: destination indices in global coordinates
da = max(s0, d0);
db = min(s1, d1);
These are used in at least
acLoad()
acStore()
acSynchronizeHalos()
Here we decompose the host mesh and distribute it among the GPUs in
the node.
The host mesh is a huge contiguous block of data. Its dimensions are given by
the global variable named "grid". A "grid" is decomposed into "subgrids",
one for each GPU. Here we check which parts of the range s0...s1 map
to the memory space stored by some GPU, spanning d0...d1, and transfer
the data if needed.
The index mapping is inherently quite involved, but here's a picture which
hopefully helps make sense out of all this.
Grid
|----num_vertices---|
xxx|....................................................|xxx
^ ^ ^ ^
d0 d1 s0 (src) s1
Subgrid
xxx|.............|xxx
^ ^
d0 d1
^ ^
db da
*
*/
#include "astaroth.h"
#include "errchk.h"
#include "device.cuh"
#include "math_utils.h" // sum for reductions
// #include "standalone/config_loader.h" // update_config
#define AC_GEN_STR(X) #X
const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)};
const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)};
const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)};
const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) //
AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)};
const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)};
#undef AC_GEN_STR
static const int MAX_NUM_DEVICES = 32;
static int num_devices = 0;
static Device devices[MAX_NUM_DEVICES] = {};
static Grid grid; // A grid consists of num_devices subgrids
static Grid subgrid;
static int
gridIdx(const Grid grid, const int3 idx)
{
return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y;
}
static int3
gridIdx3d(const Grid grid, const int idx)
{
return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x,
idx / (grid.m.x * grid.m.y)};
}
static void
printInt3(const int3 vec)
{
printf("(%d, %d, %d)", vec.x, vec.y, vec.z);
}
static inline void
print(const AcMeshInfo config)
{
for (int i = 0; i < NUM_INT_PARAMS; ++i)
printf("[%s]: %d\n", intparam_names[i], config.int_params[i]);
for (int i = 0; i < NUM_REAL_PARAMS; ++i)
printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i]));
}
static void
update_builtin_params(AcMeshInfo* config)
{
config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER;
///////////// PAD TEST
// config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE;
///////////// PAD TEST
config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER;
config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER;
// Bounds for the computational domain, i.e. nx_min <= i < nx_max
config->int_params[AC_nx_min] = NGHOST;
config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx];
config->int_params[AC_ny_min] = NGHOST;
config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST;
config->int_params[AC_nz_min] = NGHOST;
config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST;
/* Additional helper params */
// Int helpers
config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my];
config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny];
config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz];
#if VERBOSE_PRINTING // Defined in astaroth.h
printf("###############################################################\n");
printf("Config dimensions recalculated:\n");
print(*config);
printf("###############################################################\n");
#endif
}
static Grid
createGrid(const AcMeshInfo config)
{
Grid grid;
grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]};
grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]};
return grid;
}
AcResult
acCheckDeviceAvailability(void)
{
int device_count; // Separate from num_devices to avoid side effects
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&device_count));
if (device_count > 0)
return AC_SUCCESS;
else
return AC_FAILURE;
}
AcResult
acSynchronizeStream(const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
synchronize(devices[i], stream);
}
return AC_SUCCESS;
}
static AcResult
synchronize_halos(const StreamType stream)
{
// Exchanges the halos of subgrids
// After this step, the data within the main grid ranging from
// (0, 0, NGHOST) -> (grid.m.x, grid.m.y, NGHOST + grid.n.z)
// has been synchronized and transferred to appropriate subgrids
// We loop only to num_devices - 1 since the front and back plates of the grid are not
// transferred: their contents depend on the boundary conditions.
// IMPORTANT NOTE: the boundary conditions must be applied before calling this function!
// I.e. the halos of subgrids must contain up-to-date data!
// #pragma omp parallel for
for (int i = 0; i < num_devices - 1; ++i) {
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acSynchronizeMesh(void)
{
acSynchronizeStream(STREAM_ALL);
synchronize_halos(STREAM_DEFAULT);
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acInit(const AcMeshInfo config)
{
// Get num_devices
ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&num_devices));
if (num_devices < 1) {
ERROR("No CUDA devices found!");
return AC_FAILURE;
}
if (num_devices > MAX_NUM_DEVICES) {
WARNING("More devices found than MAX_NUM_DEVICES. Using only MAX_NUM_DEVICES");
num_devices = MAX_NUM_DEVICES;
}
if (!AC_MULTIGPU_ENABLED) {
WARNING("MULTIGPU_ENABLED was false. Using only one device");
num_devices = 1; // Use only one device if multi-GPU is not enabled
}
// Check that AC_nz is divisible by num_devices. This makes decomposing the
// problem domain across multiple GPUs much easier since we do not have to worry
// about remainders.
ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0);
// Decompose the problem domain
// The main grid
grid = createGrid(config);
// Subgrids
AcMeshInfo subgrid_config = config;
subgrid_config.int_params[AC_nz] /= num_devices;
update_builtin_params(&subgrid_config);
subgrid = createGrid(subgrid_config);
// Periodic boundary conditions become weird if the system can "fold unto itself".
ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER);
ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER);
#if VERBOSE_PRINTING
// clang-format off
printf("Grid m "); printInt3(grid.m); printf("\n");
printf("Grid n "); printInt3(grid.n); printf("\n");
printf("Subrid m "); printInt3(subgrid.m); printf("\n");
printf("Subrid n "); printInt3(subgrid.n); printf("\n");
// clang-format on
#endif
// Initialize the devices
for (int i = 0; i < num_devices; ++i) {
createDevice(i, subgrid_config, &devices[i]);
loadGlobalGrid(devices[i], grid);
printDeviceInfo(devices[i]);
}
// Enable peer access
for (int i = 0; i < num_devices; ++i) {
const int front = (i + 1) % num_devices;
const int back = (i - 1 + num_devices) % num_devices;
int can_access_front, can_access_back;
cudaDeviceCanAccessPeer(&can_access_front, i, front);
cudaDeviceCanAccessPeer(&can_access_back, i, back);
#if VERBOSE_PRINTING
printf(
"Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n",
i, front, can_access_front, back, can_access_back);
#endif
cudaSetDevice(i);
if (can_access_front) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0));
}
if (can_access_back) {
ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0));
}
}
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acQuit(void)
{
acSynchronizeStream(STREAM_ALL);
for (int i = 0; i < num_devices; ++i) {
destroyDevice(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start,
const int3 end, const StreamType stream)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
// DECOMPOSITION OFFSET HERE
const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z};
const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z};
const int3 da = max(start, d0);
const int3 db = min(end, d1);
if (db.z >= da.z) {
const int3 da_local = da - (int3){0, 0, i * subgrid.n.z};
const int3 db_local = db - (int3){0, 0, i * subgrid.n.z};
rkStep(devices[i], stream, isubstep, da_local, db_local, dt);
}
}
return AC_SUCCESS;
}
AcResult
acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, const int3 end)
{
return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT);
}
AcResult
acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream)
{
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + grid.n;
return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream);
}
AcResult
acIntegrateStep(const int isubstep, const AcReal dt)
{
return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT);
}
static AcResult
local_boundcondstep(const StreamType stream)
{
if (num_devices == 1) {
boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m);
}
else {
// Local boundary conditions
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z};
boundcondStep(devices[i], stream, d0, d1);
}
}
return AC_SUCCESS;
}
static AcResult
global_boundcondstep(const StreamType stream)
{
if (num_devices > 1) {
// With periodic boundary conditions we exchange the front and back plates of the
// grid. The exchange is done between the first and last device (0 and num_devices - 1).
const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST;
// ...|ooooxxx|... -> xxx|ooooooo|...
{
const int3 src = (int3){0, 0, subgrid.n.z};
const int3 dst = (int3){0, 0, 0};
copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst,
num_vertices);
}
// ...|ooooooo|xxx <- ...|xxxoooo|...
{
const int3 src = (int3){0, 0, NGHOST};
const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z};
copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst,
num_vertices);
}
}
return AC_SUCCESS;
}
AcResult
acBoundcondStepAsync(const StreamType stream)
{
ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES);
local_boundcondstep(stream);
acSynchronizeStream(stream);
global_boundcondstep(stream);
synchronize_halos(stream);
acSynchronizeStream(stream);
return AC_SUCCESS;
}
AcResult
acBoundcondStep(void)
{
return acBoundcondStepAsync(STREAM_DEFAULT);
}
static AcResult
swap_buffers(void)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
swapBuffers(devices[i]);
}
return AC_SUCCESS;
}
AcResult
acIntegrate(const AcReal dt)
{
acSynchronizeStream(STREAM_ALL);
for (int isubstep = 0; isubstep < 3; ++isubstep) {
acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized.
swap_buffers();
acBoundcondStep();
}
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const ReductionType rtype, const AcReal* results, const int n)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
}
else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
}
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
res = sum(res, results[i]);
}
else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res);
}
return res;
}
AcReal
acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuffer_handle)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcReal
acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b,
const VertexBufferHandle c)
{
acSynchronizeStream(STREAM_ALL);
AcReal results[num_devices];
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcResult
acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 src, const int num_vertices,
const StreamType stream)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
const int3 s0 = src;
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);
const int3 da = max(s0, d0);
const int3 db = min(s1, d1);
/*
printf("Device %d\n", i);
printf("\ts0: "); printInt3(s0); printf("\n");
printf("\td0: "); printInt3(d0); printf("\n");
printf("\tda: "); printInt3(da); printf("\n");
printf("\tdb: "); printInt3(db); printf("\n");
printf("\td1: "); printInt3(d1); printf("\n");
printf("\ts1: "); printInt3(s1); printf("\n");
printf("\t-> %s to device %d\n", db.z >= da.z ? "Copy" : "Do not copy", i);
*/
if (db.z >= da.z) {
const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
// printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local);
// printf("\n");
copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells);
}
// printf("\n");
}
return AC_SUCCESS;
}
AcResult
acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices)
{
return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT);
}
AcResult
acLoad(const AcMesh host_mesh)
{
acLoadWithOffset(host_mesh, (int3){0, 0, 0}, acVertexBufferSize(host_mesh.info));
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acStoreWithOffsetAsync(const int3 src, const int num_vertices, AcMesh* host_mesh,
const StreamType stream)
{
// See the beginning of the file for an explanation of the index mapping
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE
const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z};
const int3 s0 = src;
const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices);
const int3 da = max(s0, d0);
const int3 db = min(s1, d1);
if (db.z >= da.z) {
const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da);
// DECOMPOSITION OFFSET HERE
const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices};
copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh);
}
}
return AC_SUCCESS;
}
AcResult
acStoreWithOffset(const int3 src, const int num_vertices, AcMesh* host_mesh)
{
return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT);
}
AcResult
acStore(AcMesh* host_mesh)
{
acStoreWithOffset((int3){0, 0, 0}, acVertexBufferSize(host_mesh->info), host_mesh);
acSynchronizeStream(STREAM_ALL);
return AC_SUCCESS;
}
AcResult
acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream)
{
// #pragma omp parallel for
for (int i = 0; i < num_devices; ++i) {
loadDeviceConstant(devices[i], stream, param, value);
}
return AC_SUCCESS;
}
AcResult
acLoadDeviceConstant(const AcRealParam param, const AcReal value)
{
return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT);
}
/*
* =============================================================================
* Revised interface
* =============================================================================
*/
=======
>>>>>>> Stashed changes
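
The comment block at the top of the removed file describes a z-only decomposition in which a local index (i, j, k) on device device_id corresponds to global index (i, j, k + device_id * subgrid.n.z). A minimal sketch of that mapping, assuming the Grid and int3 types used in the removed code:

    /* Sketch only: mirrors the index mapping described in the removed comment block. */
    static int3 localToGlobal(const int3 local, const int device_id, const Grid subgrid)
    {
        return (int3){local.x, local.y, local.z + device_id * subgrid.n.z};
    }

    static int3 globalToLocal(const int3 global, const int device_id, const Grid subgrid)
    {
        return (int3){global.x, global.y, global.z - device_id * subgrid.n.z};
    }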

View File

@@ -16,507 +16,7 @@
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#include "astaroth_device.h"
/**
* @file
* \brief Brief info.
*
* Detailed info.
*
*/
#include "device.cuh"
#include "errchk.h"
// Device info
#define REGISTERS_PER_THREAD (255)
#define MAX_REGISTERS_PER_BLOCK (65536)
#define MAX_THREADS_PER_BLOCK (1024)
#define WARP_SIZE (32)
typedef struct {
AcReal* in[NUM_VTXBUF_HANDLES];
AcReal* out[NUM_VTXBUF_HANDLES];
} VertexBufferArray;
__constant__ AcMeshInfo d_mesh_info;
__constant__ int3 d_multigpu_offset;
__constant__ Grid globalGrid;
#define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_INT3(X) (d_mesh_info.int3_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
#include "kernels/kernels.cuh"
static dim3 rk3_tpb = (dim3){32, 1, 4};
#if PACKED_DATA_TRANSFERS // Defined in device.cuh
// #include "kernels/pack_unpack.cuh"
#endif
struct device_s {
int id;
AcMeshInfo local_config;
// Concurrency
cudaStream_t streams[NUM_STREAM_TYPES];
// Memory
VertexBufferArray vba;
AcReal* reduce_scratchpad;
AcReal* reduce_result;
#if PACKED_DATA_TRANSFERS
// Declare memory for buffers needed for packed data transfers here
// AcReal* data_packing_buffer;
#endif
};
AcResult
printDeviceInfo(const Device device)
{
const int device_id = device->id;
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device_id);
printf("--------------------------------------------------\n");
printf("Device Number: %d\n", device_id);
const size_t bus_id_max_len = 128;
char bus_id[bus_id_max_len];
cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
printf(" PCI bus ID: %s\n", bus_id);
printf(" Device name: %s\n", props.name);
printf(" Compute capability: %d.%d\n", props.major, props.minor);
// Compute
printf(" Compute\n");
printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
printf(" Stream processors: %d\n", props.multiProcessorCount);
printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
printf(
" Compute mode: %d\n",
(int)props
.computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
// Memory
printf(" Global memory\n");
printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth);
printf(" Peak Memory Bandwidth (GiB/s): %f\n",
2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
printf(" ECC enabled: %d\n", props.ECCEnabled);
// Memory usage
size_t free_bytes, total_bytes;
cudaMemGetInfo(&free_bytes, &total_bytes);
const size_t used_bytes = total_bytes - free_bytes;
printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
printf(" Caches\n");
printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported);
printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported);
printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024));
// MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
// MV: while compiling on the TIARA gp cluster. Therefore commented out.
//!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024));
//!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
printf(" Other\n");
printf(" Warp size: %d\n", props.warpSize);
// printf(" Single to double perf. ratio: %dx\n",
// props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
// versions
printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported);
printf("--------------------------------------------------\n");
return AC_SUCCESS;
}
static __global__ void
dummy_kernel(void)
{
}
AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
{
cudaSetDevice(id);
cudaDeviceReset();
// Create Device
struct device_s* device = (struct device_s*)malloc(sizeof(*device));
ERRCHK_ALWAYS(device);
device->id = id;
device->local_config = device_config;
// Check that the code was compiled for the proper GPU architecture
printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
"device supports the CUDA architecture you are compiling for.\n"
"Running dummy kernel... ");
fflush(stdout);
dummy_kernel<<<1, 1>>>();
ERRCHK_CUDA_KERNEL_ALWAYS();
printf("Success!\n");
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
}
// Memory
const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
}
ERRCHK_CUDA_ALWAYS(
cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
#if PACKED_DATA_TRANSFERS
// Allocate data required for packed transfers here (cudaMalloc)
#endif
// Device constants
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
cudaMemcpyHostToDevice));
// Multi-GPU offset. This is used to compute globalVertexIdx.
// Might be better to calculate this in astaroth.cu instead of here, s.t.
// everything related to the decomposition is limited to the multi-GPU layer
const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]};
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));
printf("Created device %d (%p)\n", device->id, device);
*device_handle = device;
// Autoptimize
if (id == 0)
autoOptimize(device);
return AC_SUCCESS;
}
AcResult
destroyDevice(Device device)
{
cudaSetDevice(device->id);
printf("Destroying device %d (%p)\n", device->id, device);
// Memory
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
cudaFree(device->vba.in[i]);
cudaFree(device->vba.out[i]);
}
cudaFree(device->reduce_scratchpad);
cudaFree(device->reduce_result);
#if PACKED_DATA_TRANSFERS
// Free data required for packed transfers here (cudaFree)
#endif
// Concurrency
for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
cudaStreamDestroy(device->streams[i]);
}
// Destroy Device
free(device);
return AC_SUCCESS;
}
AcResult
boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]);
}
return AC_SUCCESS;
}
AcResult
reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_scal(device->streams[stream_type], rtype, start, end,
device->vba.in[vtxbuf_handle], device->reduce_scratchpad,
device->reduce_result);
return AC_SUCCESS;
}
AcResult
reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
const VertexBufferHandle vtxbuf2, AcReal* result)
{
cudaSetDevice(device->id);
const int3 start = (int3){device->local_config.int_params[AC_nx_min],
device->local_config.int_params[AC_ny_min],
device->local_config.int_params[AC_nz_min]};
const int3 end = (int3){device->local_config.int_params[AC_nx_max],
device->local_config.int_params[AC_ny_max],
device->local_config.int_params[AC_nz_max]};
*result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0],
device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
AcResult
rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start,
const int3& end, const AcReal dt)
{
cudaSetDevice(device->id);
// const dim3 tpb(32, 1, 4);
const dim3 tpb = rk3_tpb;
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
if (step_number == 0)
solve<0><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
else if (step_number == 1)
solve<1><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
else
solve<2><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt);
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
AcResult
synchronize(const Device device, const StreamType stream_type)
{
cudaSetDevice(device->id);
if (stream_type == STREAM_ALL) {
cudaDeviceSynchronize();
}
else {
cudaStreamSynchronize(device->streams[stream_type]);
}
return AC_SUCCESS;
}
static AcResult
loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
const size_t bytes, AcReal* dst)
{
cudaSetDevice(device->id);
ERRCHK_CUDA(
cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type]));
return AC_SUCCESS;
}
static AcResult
storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src,
const size_t bytes, AcReal* dst)
{
cudaSetDevice(device->id);
ERRCHK_CUDA(
cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh,
const int3& src, const int3& dst, const int num_vertices)
{
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx],
num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]);
}
return AC_SUCCESS;
}
AcResult
copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst,
const int num_vertices, AcMesh* host_mesh)
{
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
storeWithOffset(device, stream_type, &device->vba.in[i][src_idx],
num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]);
}
return AC_SUCCESS;
}
AcResult
copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src,
Device dst_device, const int3& dst, const int num_vertices)
{
cudaSetDevice(src_device->id);
const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id,
&src_device->vba.in[i][src_idx], src_device->id,
sizeof(src_device->vba.in[i][0]) * num_vertices,
src_device->streams[stream_type]));
}
return AC_SUCCESS;
}
AcResult
swapBuffers(const Device device)
{
cudaSetDevice(device->id);
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
AcReal* tmp = device->vba.in[i];
device->vba.in[i] = device->vba.out[i];
device->vba.out[i] = tmp;
}
return AC_SUCCESS;
}
AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcIntParam param,
const int value)
{
cudaSetDevice(device->id);
// CUDA 10 apparently creates only a single name for a device constant (d_mesh_info here)
// and something like d_mesh_info.real_params[] cannot be directly accessed.
// Therefore we have to obfuscate the code a bit and compute the offset address before
// invoking cudaMemcpyToSymbol.
const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice,
device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
loadDeviceConstant(const Device device, const StreamType stream_type, const AcRealParam param,
const AcReal value)
{
cudaSetDevice(device->id);
const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
cudaMemcpyHostToDevice,
device->streams[stream_type]));
return AC_SUCCESS;
}
AcResult
loadGlobalGrid(const Device device, const Grid grid)
{
cudaSetDevice(device->id);
ERRCHK_CUDA_ALWAYS(
cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice));
return AC_SUCCESS;
}
AcResult
autoOptimize(const Device device)
{
cudaSetDevice(device->id);
// RK3
const int3 start = (int3){NGHOST, NGHOST, NGHOST};
const int3 end = start + (int3){device->local_config.int_params[AC_nx], //
device->local_config.int_params[AC_ny], //
device->local_config.int_params[AC_nz]};
dim3 best_dims(0, 0, 0);
float best_time = INFINITY;
const int num_iterations = 10;
for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
break;
if (x * y * z > MAX_THREADS_PER_BLOCK)
break;
if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
break;
if (((x * y * z) % WARP_SIZE) != 0)
continue;
const dim3 tpb(x, y, z);
const int3 n = end - start;
const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
(unsigned int)ceil(n.z / AcReal(tpb.z)));
cudaDeviceSynchronize();
if (cudaGetLastError() != cudaSuccess) // resets the error if any
continue;
// printf("(%d, %d, %d)\n", x, y, z);
cudaEvent_t tstart, tstop;
cudaEventCreate(&tstart);
cudaEventCreate(&tstop);
cudaEventRecord(tstart); // ---------------------------------------- Timing start
for (int i = 0; i < num_iterations; ++i)
solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
cudaEventRecord(tstop); // ----------------------------------------- Timing end
cudaEventSynchronize(tstop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, tstart, tstop);
ERRCHK_CUDA_KERNEL_ALWAYS();
if (milliseconds < best_time) {
best_time = milliseconds;
best_dims = tpb;
}
}
}
}
#if VERBOSE_PRINTING
printf(
"Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
#endif
/*
FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
ERRCHK(fp);
fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
fclose(fp);
*/
rk3_tpb = best_dims;
return AC_SUCCESS;
}
#if PACKED_DATA_TRANSFERS
// Functions for calling packed data transfers
#endif
/*
* =============================================================================
* Revised interface
* =============================================================================
*/

View File

@@ -0,0 +1,19 @@
/*
Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae.
This file is part of Astaroth.
Astaroth is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Astaroth is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
#include "astaroth_grid.h"

View File

@@ -16,7 +16,7 @@
You should have received a copy of the GNU General Public License
along with Astaroth. If not, see <http://www.gnu.org/licenses/>.
*/
-// #include "astaroth_node.h"
+#include "astaroth_node.h"
struct node_s {
};