From 5be775dbff99e516c17efdb78a3f8b423f97d039 Mon Sep 17 00:00:00 2001 From: jpekkila Date: Wed, 31 Jul 2019 17:48:48 +0300 Subject: [PATCH] Various intermediate changes --- include/astaroth_device.h | 21 +- include/astaroth_grid.h | 11 +- include/astaroth_node.h | 13 +- src/core/CMakeLists.txt | 2 +- src/core/astaroth.cu | 658 +------------------------------------- src/core/device.cu | 502 +---------------------------- src/core/grid.cu | 19 ++ src/core/node.cu | 2 +- 8 files changed, 43 insertions(+), 1185 deletions(-) diff --git a/include/astaroth_device.h b/include/astaroth_device.h index d98ffcb..48c8db2 100644 --- a/include/astaroth_device.h +++ b/include/astaroth_device.h @@ -103,21 +103,18 @@ AcResult acDeviceTransferVertexBuffer(const Device src_device, const Stream stre AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device); /** */ -AcResult acDeviceIntegrateSubstep(const Device device, const StreamType stream_type, - const int step_number, const int3 start, const int3 end, - const AcReal dt); +AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt); /** */ -AcResult acDevicePeriodicBoundcondStep(const Device device, const StreamType stream_type, - const int3 start, const int3 end); +AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, const int3 start, + const int3 end); /** */ -AcResult acDeviceReduceScal(const Device device, const StreamType stream_type, - const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, - AcReal* result); +AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acDeviceReduceVec(const Device device, const StreamType stream_type, - const ReductionType rtype, const VertexBufferHandle vec0, - const VertexBufferHandle vec1, const VertexBufferHandle vec2, - AcReal* result); +AcResult acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype, + const VertexBufferHandle vec0, const VertexBufferHandle vec1, + const VertexBufferHandle vec2, AcReal* result); #ifdef __cplusplus } // extern "C" diff --git a/include/astaroth_grid.h b/include/astaroth_grid.h index 4200343..7dfe323 100644 --- a/include/astaroth_grid.h +++ b/include/astaroth_grid.h @@ -89,16 +89,15 @@ AcResult acGridTransferVertexBuffer(const Stream stream, const VertexBufferHandl AcResult acGridTransferMesh(const Stream stream); /** */ -AcResult acGridIntegrateSubstep(const StreamType stream_type, const int step_number, - const int3 start, const int3 end, const AcReal dt); +AcResult acGridIntegrateSubstep(const Stream stream, const int step_number, const int3 start, + const int3 end, const AcReal dt); /** */ -AcResult acGridPeriodicBoundcondStep(const StreamType stream_type, const int3 start, - const int3 end); +AcResult acGridPeriodicBoundcondStep(const Stream stream, const int3 start, const int3 end); /** */ -AcResult acGridReduceScal(const StreamType stream_type, const ReductionType rtype, +AcResult acGridReduceScal(const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acGridReduceVec(const StreamType stream_type, const ReductionType rtype, +AcResult acGridReduceVec(const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0, const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* 
result); diff --git a/include/astaroth_node.h b/include/astaroth_node.h index 83e0a45..216db25 100644 --- a/include/astaroth_node.h +++ b/include/astaroth_node.h @@ -104,17 +104,16 @@ AcResult acNodeTransferVertexBuffer(const Node src_node, const Stream stream, AcResult acNodeTransferMesh(const Node src_node, const Stream stream, Node* dst_node); /** */ -AcResult acNodeIntegrateSubstep(const Node node, const StreamType stream_type, - const int step_number, const int3 start, const int3 end, - const AcReal dt); +AcResult acNodeIntegrateSubstep(const Node node, const Stream stream, const int step_number, + const int3 start, const int3 end, const AcReal dt); /** */ -AcResult acNodePeriodicBoundcondStep(const Node node, const StreamType stream_type, - const int3 start, const int3 end); +AcResult acNodePeriodicBoundcondStep(const Node node, const Stream stream, const int3 start, + const int3 end); /** */ -AcResult acNodeReduceScal(const Node node, const StreamType stream_type, const ReductionType rtype, +AcResult acNodeReduceScal(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vtxbuf_handle, AcReal* result); /** */ -AcResult acNodeReduceVec(const Node node, const StreamType stream_type, const ReductionType rtype, +AcResult acNodeReduceVec(const Node node, const Stream stream, const ReductionType rtype, const VertexBufferHandle vec0, const VertexBufferHandle vec1, const VertexBufferHandle vec2, AcReal* result); diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 5cbc271..6054444 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -3,7 +3,7 @@ ######################################## ## Find packages -find_package(CUDA 9 REQUIRED) +find_package(CUDA REQUIRED) ## Architecture and optimization flags set(CUDA_ARCH_FLAGS -gencode arch=compute_37,code=sm_37 diff --git a/src/core/astaroth.cu b/src/core/astaroth.cu index 1cb9f1e..1ef56d4 100644 --- a/src/core/astaroth.cu +++ b/src/core/astaroth.cu @@ -16,660 +16,4 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see <http://www.gnu.org/licenses/>. */ - -/** - * @file - * \brief Multi-GPU implementation. - * - %JP: The old way for computing boundary conditions conflicts with the - way we have to do things with multiple GPUs. - - The older approach relied on unified memory, which represented the whole - memory area as one huge mesh instead of several smaller ones. However, unified memory - in its current state is meant more for quick prototyping when performance is not an issue. - Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult - than when managing the memory explicitly. - - In this new approach, I have simplified the multi- and single-GPU layers significantly. - Quick rundown: - New struct: Grid. There are two global variables, "grid" and "subgrid", which - contain the extents of the whole simulation domain and the decomposed grids, - respectively. To simplify things, we require that each GPU is assigned the same amount of - work; therefore each GPU in the node is assigned a "subgrid.m"-sized block of data to - work with. - - The whole simulation domain is decomposed with respect to the z dimension. - For example, if the grid contains (nx, ny, nz) vertices, then the subgrids - contain (nx, ny, nz / num_devices) vertices. - - A local index (i, j, k) in some subgrid can be mapped to the global grid with - global idx = (i, j, k + device_id * subgrid.n.z)
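For illustration, a minimal sketch of this mapping in C (the helper name localToGlobal is hypothetical and not part of the patch; Grid and int3 are the types used in this file):

    // Map a local subgrid index on device `device_id` to the global grid.
    // Assumes AC_nz % num_devices == 0, which acInit() enforces below.
    static int3
    localToGlobal(const int3 local, const int device_id, const Grid subgrid)
    {
        return (int3){local.x, local.y, local.z + device_id * subgrid.n.z};
    }

The inverse mapping simply subtracts device_id * subgrid.n.z from the z component. The overlap test used by acLoadWithOffsetAsync() and acStoreWithOffsetAsync() below follows the same arithmetic: a global range [s0, s1) is clamped against the block [d0, d1) owned by one device via da = max(s0, d0), db = min(s1, d1), and data is copied only when db.z >= da.z.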
- - Terminology: - - Single-GPU function: a function defined on the single-GPU layer (device.cu) - - Changes required to this commented code block: - - The thread block dimensions (tpb) are no longer passed to the kernel here but in - device.cu instead. Same holds for any complex index calculations. Instead, the local - coordinates should be passed as an int3 type without having to consider how the data is - actually laid out in device memory - - The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque - handle of type "Device" which should be passed to single-GPU functions. In this file, all - devices are stored in a global array "devices[num_devices]". - - Every single-GPU function is executed asynchronously by default such that we - can optimize Astaroth by executing memory transactions concurrently with - computation. Therefore a StreamType should be passed as a parameter to single-GPU functions. - Refresher: CUDA function calls are non-blocking when a stream is explicitly passed - as a parameter and commands executing in different streams can be processed - in parallel/concurrently. - - - Note on periodic boundaries (might be helpful when implementing other boundary conditions): - - With multiple GPUs, periodic boundary conditions applied on indices ranging from - - (0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - - STENCIL_ORDER/2) - - on a single device are "local", in the sense that they can be computed without - having to exchange data with neighboring GPUs. Special care is needed only for transferring - the data to the front and back plates outside this range. In the solution we use - here, we solve the local boundaries first, and then just exchange the front and back plates - in a "ring", like so - device_id - (n) <-> 0 <-> 1 <-> ... <-> n <-> (0) - -### Throughout this file we use the following notation and names for various index offsets - - Global coordinates: coordinates with respect to the global grid (static Grid grid) - Local coordinates: coordinates with respect to the local subgrid (static Subgrid subgrid) - - s0, s1: source indices in global coordinates - d0, d1: destination indices in global coordinates - da = max(s0, d0); - db = min(s1, d1); - - These are used in at least - acLoad() - acStore() - acSynchronizeHalos() - - Here we decompose the host mesh and distribute it among the GPUs in - the node. - - The host mesh is a huge contiguous block of data. Its dimensions are given by - the global variable named "grid". A "grid" is decomposed into "subgrids", - one for each GPU. Here we check which parts of the range s0...s1 map - to the memory space stored by some GPU, ranging d0...d1, and transfer - the data if needed. - - The index mapping is inherently quite involved, but here's a picture which - hopefully helps make sense out of all this.
- - - Grid - |----num_vertices---| - xxx|....................................................|xxx - ^ ^ ^ ^ - d0 d1 s0 (src) s1 - - Subgrid - - xxx|.............|xxx - ^ ^ - d0 d1 - - ^ ^ - db da - * - */ -#include "astaroth.h" -#include "errchk.h" - -#include "device.cuh" -#include "math_utils.h" // sum for reductions -// #include "standalone/config_loader.h" // update_config - -#define AC_GEN_STR(X) #X -const char* intparam_names[] = {AC_FOR_BUILTIN_INT_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_INT_PARAM_TYPES(AC_GEN_STR)}; -const char* int3param_names[] = {AC_FOR_BUILTIN_INT3_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_INT3_PARAM_TYPES(AC_GEN_STR)}; -const char* realparam_names[] = {AC_FOR_BUILTIN_REAL_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_REAL_PARAM_TYPES(AC_GEN_STR)}; -const char* real3param_names[] = {AC_FOR_BUILTIN_REAL3_PARAM_TYPES(AC_GEN_STR) // - AC_FOR_USER_REAL3_PARAM_TYPES(AC_GEN_STR)}; -const char* vtxbuf_names[] = {AC_FOR_VTXBUF_HANDLES(AC_GEN_STR)}; -#undef AC_GEN_STR - -static const int MAX_NUM_DEVICES = 32; -static int num_devices = 0; -static Device devices[MAX_NUM_DEVICES] = {}; - -static Grid grid; // A grid consists of num_devices subgrids -static Grid subgrid; - -static int -gridIdx(const Grid grid, const int3 idx) -{ - return idx.x + idx.y * grid.m.x + idx.z * grid.m.x * grid.m.y; -} - -static int3 -gridIdx3d(const Grid grid, const int idx) -{ - return (int3){idx % grid.m.x, (idx % (grid.m.x * grid.m.y)) / grid.m.x, - idx / (grid.m.x * grid.m.y)}; -} - -static void -printInt3(const int3 vec) -{ - printf("(%d, %d, %d)", vec.x, vec.y, vec.z); -} - -static inline void -print(const AcMeshInfo config) -{ - for (int i = 0; i < NUM_INT_PARAMS; ++i) - printf("[%s]: %d\n", intparam_names[i], config.int_params[i]); - for (int i = 0; i < NUM_REAL_PARAMS; ++i) - printf("[%s]: %g\n", realparam_names[i], double(config.real_params[i])); -} - -static void -update_builtin_params(AcMeshInfo* config) -{ - config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER; - ///////////// PAD TEST - // config->int_params[AC_mx] = config->int_params[AC_nx] + STENCIL_ORDER + PAD_SIZE; - ///////////// PAD TEST - config->int_params[AC_my] = config->int_params[AC_ny] + STENCIL_ORDER; - config->int_params[AC_mz] = config->int_params[AC_nz] + STENCIL_ORDER; - - // Bounds for the computational domain, i.e. 
nx_min <= i < nx_max - config->int_params[AC_nx_min] = NGHOST; - config->int_params[AC_nx_max] = config->int_params[AC_nx_min] + config->int_params[AC_nx]; - config->int_params[AC_ny_min] = NGHOST; - config->int_params[AC_ny_max] = config->int_params[AC_ny] + NGHOST; - config->int_params[AC_nz_min] = NGHOST; - config->int_params[AC_nz_max] = config->int_params[AC_nz] + NGHOST; - - /* Additional helper params */ - // Int helpers - config->int_params[AC_mxy] = config->int_params[AC_mx] * config->int_params[AC_my]; - config->int_params[AC_nxy] = config->int_params[AC_nx] * config->int_params[AC_ny]; - config->int_params[AC_nxyz] = config->int_params[AC_nxy] * config->int_params[AC_nz]; - -#if VERBOSE_PRINTING // Defined in astaroth.h - printf("###############################################################\n"); - printf("Config dimensions recalculated:\n"); - print(*config); - printf("###############################################################\n"); -#endif -} - -static Grid -createGrid(const AcMeshInfo config) -{ - Grid grid; - - grid.m = (int3){config.int_params[AC_mx], config.int_params[AC_my], config.int_params[AC_mz]}; - grid.n = (int3){config.int_params[AC_nx], config.int_params[AC_ny], config.int_params[AC_nz]}; - - return grid; - } - -AcResult -acCheckDeviceAvailability(void) -{ - int device_count; // Separate from num_devices to avoid side effects - ERRCHK_CUDA_ALWAYS(cudaGetDeviceCount(&device_count)); - if (device_count > 0) - return AC_SUCCESS; - else - return AC_FAILURE; -} - -AcResult -acSynchronizeStream(const StreamType stream) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - synchronize(devices[i], stream); - } - - return AC_SUCCESS; -} - -static AcResult -synchronize_halos(const StreamType stream) -{ - // Exchanges the halos of subgrids - // After this step, the data within the main grid ranging from - // (0, 0, NGHOST) -> grid.m.x, grid.m.y, NGHOST + grid.n.z - // has been synchronized and transferred to appropriate subgrids - - // We loop only to num_devices - 1 since the front and back plates of the grid are not - // transferred because their contents depend on the boundary conditions. - - // IMPORTANT NOTE: the boundary conditions must be applied before calling this function! - // I.e. the halos of subgrids must contain up-to-date data! - // #pragma omp parallel for - for (int i = 0; i < num_devices - 1; ++i) { - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[i], stream, src, devices[(i + 1) % num_devices], dst, - num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[(i + 1) % num_devices], stream, src, devices[i], dst, - num_vertices); - } - } - return AC_SUCCESS; -} - -AcResult -acSynchronizeMesh(void) -{ - acSynchronizeStream(STREAM_ALL); - synchronize_halos(STREAM_DEFAULT); - acSynchronizeStream(STREAM_ALL); - - return AC_SUCCESS; -}
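To make the plate exchange above concrete, a worked example with hypothetical dimensions (not taken from the patch): nx = ny = nz = 128, num_devices = 4, STENCIL_ORDER = 6 and hence NGHOST = 3:

    // grid.n    = (128, 128, 128)   grid.m    = (134, 134, 134)
    // subgrid.n = (128, 128,  32)   subgrid.m = (134, 134,  38)
    // Device i computes the global z-planes [3 + 32*i, 3 + 32*(i + 1)).
    // Each plate moved by synchronize_halos() holds
    //   subgrid.m.x * subgrid.m.y * NGHOST = 134 * 134 * 3 = 53868 vertices
    // per vertex buffer.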
Using only MAX_NUM_DEVICES"); - num_devices = MAX_NUM_DEVICES; - } - if (!AC_MULTIGPU_ENABLED) { - WARNING("MULTIGPU_ENABLED was false. Using only one device"); - num_devices = 1; // Use only one device if multi-GPU is not enabled - } - // Check that num_devices is divisible with AC_nz. This makes decomposing the - // problem domain to multiple GPUs much easier since we do not have to worry - // about remainders - ERRCHK_ALWAYS(config.int_params[AC_nz] % num_devices == 0); - - // Decompose the problem domain - // The main grid - grid = createGrid(config); - - // Subgrids - AcMeshInfo subgrid_config = config; - subgrid_config.int_params[AC_nz] /= num_devices; - update_builtin_params(&subgrid_config); - subgrid = createGrid(subgrid_config); - - // Periodic boundary conditions become weird if the system can "fold unto itself". - ERRCHK_ALWAYS(subgrid.n.x >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.y >= STENCIL_ORDER); - ERRCHK_ALWAYS(subgrid.n.z >= STENCIL_ORDER); - -#if VERBOSE_PRINTING - // clang-format off - printf("Grid m "); printInt3(grid.m); printf("\n"); - printf("Grid n "); printInt3(grid.n); printf("\n"); - printf("Subrid m "); printInt3(subgrid.m); printf("\n"); - printf("Subrid n "); printInt3(subgrid.n); printf("\n"); - // clang-format on -#endif - - // Initialize the devices - for (int i = 0; i < num_devices; ++i) { - createDevice(i, subgrid_config, &devices[i]); - loadGlobalGrid(devices[i], grid); - printDeviceInfo(devices[i]); - } - - // Enable peer access - for (int i = 0; i < num_devices; ++i) { - const int front = (i + 1) % num_devices; - const int back = (i - 1 + num_devices) % num_devices; - - int can_access_front, can_access_back; - cudaDeviceCanAccessPeer(&can_access_front, i, front); - cudaDeviceCanAccessPeer(&can_access_back, i, back); -#if VERBOSE_PRINTING - printf( - "Trying to enable peer access from %d to %d (can access: %d) and %d (can access: %d)\n", - i, front, can_access_front, back, can_access_back); -#endif - - cudaSetDevice(i); - if (can_access_front) { - ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(front, 0)); - } - if (can_access_back) { - ERRCHK_CUDA_ALWAYS(cudaDeviceEnablePeerAccess(back, 0)); - } - } - - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acQuit(void) -{ - acSynchronizeStream(STREAM_ALL); - - for (int i = 0; i < num_devices; ++i) { - destroyDevice(devices[i]); - } - return AC_SUCCESS; -} - -AcResult -acIntegrateStepWithOffsetAsync(const int isubstep, const AcReal dt, const int3 start, - const int3 end, const StreamType stream) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - // DECOMPOSITION OFFSET HERE - const int3 d0 = (int3){NGHOST, NGHOST, NGHOST + i * subgrid.n.z}; - const int3 d1 = d0 + (int3){subgrid.n.x, subgrid.n.y, subgrid.n.z}; - - const int3 da = max(start, d0); - const int3 db = min(end, d1); - - if (db.z >= da.z) { - const int3 da_local = da - (int3){0, 0, i * subgrid.n.z}; - const int3 db_local = db - (int3){0, 0, i * subgrid.n.z}; - rkStep(devices[i], stream, isubstep, da_local, db_local, dt); - } - } - return AC_SUCCESS; -} - -AcResult -acIntegrateStepWithOffset(const int isubstep, const AcReal dt, const int3 start, const int3 end) -{ - return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, STREAM_DEFAULT); -} - -AcResult -acIntegrateStepAsync(const int isubstep, const AcReal dt, const StreamType stream) -{ - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = start + 
grid.n; - return acIntegrateStepWithOffsetAsync(isubstep, dt, start, end, stream); -} - -AcResult -acIntegrateStep(const int isubstep, const AcReal dt) -{ - return acIntegrateStepAsync(isubstep, dt, STREAM_DEFAULT); -} - -static AcResult -local_boundcondstep(const StreamType stream) -{ - if (num_devices == 1) { - boundcondStep(devices[0], stream, (int3){0, 0, 0}, subgrid.m); - } - else { - // Local boundary conditions - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, NGHOST}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.n.z}; - boundcondStep(devices[i], stream, d0, d1); - } - } - return AC_SUCCESS; -} - -static AcResult -global_boundcondstep(const StreamType stream) -{ - if (num_devices > 1) { - // With periodic boundary conditions we exchange the front and back plates of the - // grid. The exchange is done between the first and last device (0 and num_devices - 1). - const int num_vertices = subgrid.m.x * subgrid.m.y * NGHOST; - // ...|ooooxxx|... -> xxx|ooooooo|... - { - const int3 src = (int3){0, 0, subgrid.n.z}; - const int3 dst = (int3){0, 0, 0}; - copyMeshDeviceToDevice(devices[num_devices - 1], stream, src, devices[0], dst, - num_vertices); - } - // ...|ooooooo|xxx <- ...|xxxoooo|... - { - const int3 src = (int3){0, 0, NGHOST}; - const int3 dst = (int3){0, 0, NGHOST + subgrid.n.z}; - copyMeshDeviceToDevice(devices[0], stream, src, devices[num_devices - 1], dst, - num_vertices); - } - } - return AC_SUCCESS; -} - -AcResult -acBoundcondStepAsync(const StreamType stream) -{ - ERRCHK_ALWAYS(stream < NUM_STREAM_TYPES); - - local_boundcondstep(stream); - acSynchronizeStream(stream); - global_boundcondstep(stream); - synchronize_halos(stream); - acSynchronizeStream(stream); - return AC_SUCCESS; -} - -AcResult -acBoundcondStep(void) -{ - return acBoundcondStepAsync(STREAM_DEFAULT); -} - -static AcResult -swap_buffers(void) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - swapBuffers(devices[i]); - } - return AC_SUCCESS; -} - -AcResult -acIntegrate(const AcReal dt) -{ - acSynchronizeStream(STREAM_ALL); - for (int isubstep = 0; isubstep < 3; ++isubstep) { - acIntegrateStep(isubstep, dt); // Note: boundaries must be initialized. - swap_buffers(); - acBoundcondStep(); - } - return AC_SUCCESS; -} - -static AcReal -simple_final_reduce_scal(const ReductionType rtype, const AcReal* results, const int n) -{ - AcReal res = results[0]; - for (int i = 1; i < n; ++i) { - if (rtype == RTYPE_MAX) { - res = max(res, results[i]); - } - else if (rtype == RTYPE_MIN) { - res = min(res, results[i]); - } - else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) { - res = sum(res, results[i]); - } - else { - ERROR("Invalid rtype"); - } - } - - if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) { - const AcReal inv_n = AcReal(1.) 
/ (grid.n.x * grid.n.y * grid.n.z); - res = sqrt(inv_n * res); - } - return res; -} - -AcReal -acReduceScal(const ReductionType rtype, const VertexBufferHandle vtxbuffer_handle) -{ - acSynchronizeStream(STREAM_ALL); - - AcReal results[num_devices]; - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceScal(devices[i], STREAM_DEFAULT, rtype, vtxbuffer_handle, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); -} - -AcReal -acReduceVec(const ReductionType rtype, const VertexBufferHandle a, const VertexBufferHandle b, - const VertexBufferHandle c) -{ - acSynchronizeStream(STREAM_ALL); - - AcReal results[num_devices]; - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - reduceVec(devices[i], STREAM_DEFAULT, rtype, a, b, c, &results[i]); - } - - return simple_final_reduce_scal(rtype, results, num_devices); -} - -AcResult -acLoadWithOffsetAsync(const AcMesh host_mesh, const int3 src, const int num_vertices, - const StreamType stream) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; - - const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); - - const int3 da = max(s0, d0); - const int3 db = min(s1, d1); - /* - printf("Device %d\n", i); - printf("\ts0: "); printInt3(s0); printf("\n"); - printf("\td0: "); printInt3(d0); printf("\n"); - printf("\tda: "); printInt3(da); printf("\n"); - printf("\tdb: "); printInt3(db); printf("\n"); - printf("\td1: "); printInt3(d1); printf("\n"); - printf("\ts1: "); printInt3(s1); printf("\n"); - printf("\t-> %s to device %d\n", db.z >= da.z ? 
"Copy" : "Do not copy", i); - */ - if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); - // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - // printf("\t\tcopy %d cells to local index ", copy_cells); printInt3(da_local); - // printf("\n"); - copyMeshToDevice(devices[i], stream, host_mesh, da, da_local, copy_cells); - } - // printf("\n"); - } - return AC_SUCCESS; -} - -AcResult -acLoadWithOffset(const AcMesh host_mesh, const int3 src, const int num_vertices) -{ - return acLoadWithOffsetAsync(host_mesh, src, num_vertices, STREAM_DEFAULT); -} - -AcResult -acLoad(const AcMesh host_mesh) -{ - acLoadWithOffset(host_mesh, (int3){0, 0, 0}, acVertexBufferSize(host_mesh.info)); - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acStoreWithOffsetAsync(const int3 src, const int num_vertices, AcMesh* host_mesh, - const StreamType stream) -{ - // See the beginning of the file for an explanation of the index mapping - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - const int3 d0 = (int3){0, 0, i * subgrid.n.z}; // DECOMPOSITION OFFSET HERE - const int3 d1 = (int3){subgrid.m.x, subgrid.m.y, d0.z + subgrid.m.z}; - - const int3 s0 = src; - const int3 s1 = gridIdx3d(grid, gridIdx(grid, s0) + num_vertices); - - const int3 da = max(s0, d0); - const int3 db = min(s1, d1); - if (db.z >= da.z) { - const int copy_cells = gridIdx(subgrid, db) - gridIdx(subgrid, da); - // DECOMPOSITION OFFSET HERE - const int3 da_local = (int3){da.x, da.y, da.z - i * grid.n.z / num_devices}; - copyMeshToHost(devices[i], stream, da_local, da, copy_cells, host_mesh); - } - } - return AC_SUCCESS; -} - -AcResult -acStoreWithOffset(const int3 src, const int num_vertices, AcMesh* host_mesh) -{ - return acStoreWithOffsetAsync(src, num_vertices, host_mesh, STREAM_DEFAULT); -} - -AcResult -acStore(AcMesh* host_mesh) -{ - acStoreWithOffset((int3){0, 0, 0}, acVertexBufferSize(host_mesh->info), host_mesh); - acSynchronizeStream(STREAM_ALL); - return AC_SUCCESS; -} - -AcResult -acLoadDeviceConstantAsync(const AcRealParam param, const AcReal value, const StreamType stream) -{ - // #pragma omp parallel for - for (int i = 0; i < num_devices; ++i) { - loadDeviceConstant(devices[i], stream, param, value); - } - return AC_SUCCESS; -} - -AcResult -acLoadDeviceConstant(const AcRealParam param, const AcReal value) -{ - return acLoadDeviceConstantAsync(param, value, STREAM_DEFAULT); -} - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ -======= ->>>>>>> Stashed changes +#include "astaroth_defines.h" diff --git a/src/core/device.cu b/src/core/device.cu index c046d18..5d7dd69 100644 --- a/src/core/device.cu +++ b/src/core/device.cu @@ -16,507 +16,7 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see . */ - -/** - * @file - * \brief Brief info. - * - * Detailed info. 
- * - */ -#include "device.cuh" - -#include "errchk.h" - -// Device info -#define REGISTERS_PER_THREAD (255) -#define MAX_REGISTERS_PER_BLOCK (65536) -#define MAX_THREADS_PER_BLOCK (1024) -#define WARP_SIZE (32) - -typedef struct { - AcReal* in[NUM_VTXBUF_HANDLES]; - AcReal* out[NUM_VTXBUF_HANDLES]; -} VertexBufferArray; - -__constant__ AcMeshInfo d_mesh_info; -__constant__ int3 d_multigpu_offset; -__constant__ Grid globalGrid; -#define DCONST_INT(X) (d_mesh_info.int_params[X]) -#define DCONST_INT3(X) (d_mesh_info.int3_params[X]) -#define DCONST_REAL(X) (d_mesh_info.real_params[X]) -#define DCONST_REAL3(X) (d_mesh_info.real3_params[X]) -#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy)) -#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy)) -#include "kernels/kernels.cuh" - -static dim3 rk3_tpb = (dim3){32, 1, 4}; - -#if PACKED_DATA_TRANSFERS // Defined in device.cuh -// #include "kernels/pack_unpack.cuh" -#endif +#include "astaroth_device.h" struct device_s { - int id; - AcMeshInfo local_config; - - // Concurrency - cudaStream_t streams[NUM_STREAM_TYPES]; - - // Memory - VertexBufferArray vba; - AcReal* reduce_scratchpad; - AcReal* reduce_result; - -#if PACKED_DATA_TRANSFERS -// Declare memory for buffers needed for packed data transfers here -// AcReal* data_packing_buffer; -#endif }; - -AcResult -printDeviceInfo(const Device device) -{ - const int device_id = device->id; - - cudaDeviceProp props; - cudaGetDeviceProperties(&props, device_id); - printf("--------------------------------------------------\n"); - printf("Device Number: %d\n", device_id); - const size_t bus_id_max_len = 128; - char bus_id[bus_id_max_len]; - cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id); - printf(" PCI bus ID: %s\n", bus_id); - printf(" Device name: %s\n", props.name); - printf(" Compute capability: %d.%d\n", props.major, props.minor); - - // Compute - printf(" Compute\n"); - printf(" Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz - printf(" Stream processors: %d\n", props.multiProcessorCount); - printf(" SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio); - printf( - " Compute mode: %d\n", - (int)props - .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0 - // Memory - printf(" Global memory\n"); - printf(" Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000)); - printf(" Memory Bus Width (bits): %d\n", props.memoryBusWidth); - printf(" Peak Memory Bandwidth (GiB/s): %f\n", - 2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. 
* 1024.)); - printf(" ECC enabled: %d\n", props.ECCEnabled); - - // Memory usage - size_t free_bytes, total_bytes; - cudaMemGetInfo(&free_bytes, &total_bytes); - const size_t used_bytes = total_bytes - free_bytes; - printf(" Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024)); - printf(" Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024)); - printf(" Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024)); - printf(" Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024)); - printf(" Caches\n"); - printf(" Local L1 cache supported: %d\n", props.localL1CacheSupported); - printf(" Global L1 cache supported: %d\n", props.globalL1CacheSupported); - printf(" L2 size: %d KiB\n", props.l2CacheSize / (1024)); - // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error - // MV: while compiling in TIARA gp cluster. Therefore commented out. - //!! printf(" Total const mem: %ld KiB\n", props.totalConstMem / (1024)); - //!! printf(" Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024)); - printf(" Other\n"); - printf(" Warp size: %d\n", props.warpSize); - // printf(" Single to double perf. ratio: %dx\n", - // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA - // versions - printf(" Stream priorities supported: %d\n", props.streamPrioritiesSupported); - printf("--------------------------------------------------\n"); - - return AC_SUCCESS; -} - -static __global__ void -dummy_kernel(void) -{ -} - -AcResult -createDevice(const int id, const AcMeshInfo device_config, Device* device_handle) -{ - cudaSetDevice(id); - cudaDeviceReset(); - - // Create Device - struct device_s* device = (struct device_s*)malloc(sizeof(*device)); - ERRCHK_ALWAYS(device); - - device->id = id; - device->local_config = device_config; - - // Check that the code was compiled for the proper GPU architecture - printf("Trying to run a dummy kernel. If this fails, make sure that your\n" - "device supports the CUDA architecture you are compiling for.\n" - "Running dummy kernel... "); - fflush(stdout); - dummy_kernel<<<1, 1>>>(); - ERRCHK_CUDA_KERNEL_ALWAYS(); - printf("Success!\n"); - - // Concurrency - for (int i = 0; i < NUM_STREAM_TYPES; ++i) { - cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0); - } - - // Memory - const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes)); - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes)); - } - ERRCHK_CUDA_ALWAYS( - cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config))); - ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal))); - -#if PACKED_DATA_TRANSFERS -// Allocate data required for packed transfers here (cudaMalloc) -#endif - - // Device constants - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0, - cudaMemcpyHostToDevice)); - - // Multi-GPU offset. This is used to compute globalVertexIdx. - // Might be better to calculate this in astaroth.cu instead of here, s.t.
- everything related to the decomposition is limited to the multi-GPU layer - const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]}; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset, - sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice)); - - printf("Created device %d (%p)\n", device->id, device); - *device_handle = device; - - // Auto-optimize - if (id == 0) - autoOptimize(device); - - return AC_SUCCESS; -} - -AcResult -destroyDevice(Device device) -{ - cudaSetDevice(device->id); - printf("Destroying device %d (%p)\n", device->id, device); - - // Memory - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - cudaFree(device->vba.in[i]); - cudaFree(device->vba.out[i]); - } - cudaFree(device->reduce_scratchpad); - cudaFree(device->reduce_result); - -#if PACKED_DATA_TRANSFERS -// Free data required for packed transfers here (cudaFree) -#endif - - // Concurrency - for (int i = 0; i < NUM_STREAM_TYPES; ++i) { - cudaStreamDestroy(device->streams[i]); - } - - // Destroy Device - free(device); - return AC_SUCCESS; -} - -AcResult -boundcondStep(const Device device, const StreamType stream_type, const int3& start, const int3& end) -{ - cudaSetDevice(device->id); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - periodic_boundconds(device->streams[stream_type], start, end, device->vba.in[i]); - } - return AC_SUCCESS; -} - -AcResult -reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf_handle, AcReal* result) -{ - cudaSetDevice(device->id); - - const int3 start = (int3){device->local_config.int_params[AC_nx_min], - device->local_config.int_params[AC_ny_min], - device->local_config.int_params[AC_nz_min]}; - - const int3 end = (int3){device->local_config.int_params[AC_nx_max], - device->local_config.int_params[AC_ny_max], - device->local_config.int_params[AC_nz_max]}; - - *result = reduce_scal(device->streams[stream_type], rtype, start, end, - device->vba.in[vtxbuf_handle], device->reduce_scratchpad, - device->reduce_result); - return AC_SUCCESS; -} - -AcResult -reduceVec(const Device device, const StreamType stream_type, const ReductionType rtype, - const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1, - const VertexBufferHandle vtxbuf2, AcReal* result) -{ - cudaSetDevice(device->id); - - const int3 start = (int3){device->local_config.int_params[AC_nx_min], - device->local_config.int_params[AC_ny_min], - device->local_config.int_params[AC_nz_min]}; - - const int3 end = (int3){device->local_config.int_params[AC_nx_max], - device->local_config.int_params[AC_ny_max], - device->local_config.int_params[AC_nz_max]}; - - *result = reduce_vec(device->streams[stream_type], rtype, start, end, device->vba.in[vtxbuf0], - device->vba.in[vtxbuf1], device->vba.in[vtxbuf2], - device->reduce_scratchpad, device->reduce_result); - return AC_SUCCESS; -} - -AcResult -rkStep(const Device device, const StreamType stream_type, const int step_number, const int3& start, - const int3& end, const AcReal dt) -{ - cudaSetDevice(device->id); - - // const dim3 tpb(32, 1, 4); - const dim3 tpb = rk3_tpb; - - const int3 n = end - start; - const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // - (unsigned int)ceil(n.y / AcReal(tpb.y)), // - (unsigned int)ceil(n.z / AcReal(tpb.z))); - - if (step_number == 0) - solve<0><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt); - else if (step_number == 1) - solve<1><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt); - else -
solve<2><<<bpg, tpb, 0, device->streams[stream_type]>>>(start, end, device->vba, dt); - - ERRCHK_CUDA_KERNEL(); - - return AC_SUCCESS; -} - -AcResult -synchronize(const Device device, const StreamType stream_type) -{ - cudaSetDevice(device->id); - if (stream_type == STREAM_ALL) { - cudaDeviceSynchronize(); - } - else { - cudaStreamSynchronize(device->streams[stream_type]); - } - return AC_SUCCESS; -} - -static AcResult -loadWithOffset(const Device device, const StreamType stream_type, const AcReal* src, - const size_t bytes, AcReal* dst) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA( - cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream_type])); - return AC_SUCCESS; -} - -static AcResult -storeWithOffset(const Device device, const StreamType stream_type, const AcReal* src, - const size_t bytes, AcReal* dst) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA( - cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -copyMeshToDevice(const Device device, const StreamType stream_type, const AcMesh& host_mesh, - const int3& src, const int3& dst, const int num_vertices) -{ - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - loadWithOffset(device, stream_type, &host_mesh.vertex_buffer[i][src_idx], - num_vertices * sizeof(AcReal), &device->vba.in[i][dst_idx]); - } - return AC_SUCCESS; -} - -AcResult -copyMeshToHost(const Device device, const StreamType stream_type, const int3& src, const int3& dst, - const int num_vertices, AcMesh* host_mesh) -{ - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - storeWithOffset(device, stream_type, &device->vba.in[i][src_idx], - num_vertices * sizeof(AcReal), &host_mesh->vertex_buffer[i][dst_idx]); - } - return AC_SUCCESS; -} - -AcResult -copyMeshDeviceToDevice(const Device src_device, const StreamType stream_type, const int3& src, - Device dst_device, const int3& dst, const int num_vertices) -{ - cudaSetDevice(src_device->id); - const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config); - const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config); - - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - ERRCHK_CUDA(cudaMemcpyPeerAsync(&dst_device->vba.in[i][dst_idx], dst_device->id, - &src_device->vba.in[i][src_idx], src_device->id, - sizeof(src_device->vba.in[i][0]) * num_vertices, - src_device->streams[stream_type])); - } - return AC_SUCCESS; -} - -AcResult -swapBuffers(const Device device) -{ - cudaSetDevice(device->id); - for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) { - AcReal* tmp = device->vba.in[i]; - device->vba.in[i] = device->vba.out[i]; - device->vba.out[i] = tmp; - } - return AC_SUCCESS; -} - -AcResult -loadDeviceConstant(const Device device, const StreamType stream_type, const AcIntParam param, - const int value) -{ - cudaSetDevice(device->id); - // CUDA 10 apparently creates only a single name for a device constant (d_mesh_info here) - // and something like d_mesh_info.real_params[] cannot be directly accessed. - // Therefore we have to obfuscate the code a bit and compute the offset address before - // invoking cudaMemcpyToSymbol.
- const size_t offset = (size_t)&d_mesh_info.int_params[param] - (size_t)&d_mesh_info; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, - device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -loadDeviceConstant(const Device device, const StreamType stream_type, const AcRealParam param, - const AcReal value) -{ - cudaSetDevice(device->id); - const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info; - ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset, - cudaMemcpyHostToDevice, - device->streams[stream_type])); - return AC_SUCCESS; -} - -AcResult -loadGlobalGrid(const Device device, const Grid grid) -{ - cudaSetDevice(device->id); - ERRCHK_CUDA_ALWAYS( - cudaMemcpyToSymbol(globalGrid, &grid, sizeof(grid), 0, cudaMemcpyHostToDevice)); - return AC_SUCCESS; -} - -AcResult -autoOptimize(const Device device) -{ - cudaSetDevice(device->id); - - // RK3 - const int3 start = (int3){NGHOST, NGHOST, NGHOST}; - const int3 end = start + (int3){device->local_config.int_params[AC_nx], // - device->local_config.int_params[AC_ny], // - device->local_config.int_params[AC_nz]}; - - dim3 best_dims(0, 0, 0); - float best_time = INFINITY; - const int num_iterations = 10; - - for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) { - for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) { - for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) { - - if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z) - break; - if (x * y * z > MAX_THREADS_PER_BLOCK) - break; - - if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK) - break; - - if (((x * y * z) % WARP_SIZE) != 0) - continue; - - const dim3 tpb(x, y, z); - const int3 n = end - start; - const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), // - (unsigned int)ceil(n.y / AcReal(tpb.y)), // - (unsigned int)ceil(n.z / AcReal(tpb.z))); - - cudaDeviceSynchronize(); - if (cudaGetLastError() != cudaSuccess) // resets the error if any - continue; - - // printf("(%d, %d, %d)\n", x, y, z); - - cudaEvent_t tstart, tstop; - cudaEventCreate(&tstart); - cudaEventCreate(&tstop); - - cudaEventRecord(tstart); // ---------------------------------------- Timing start - - for (int i = 0; i < num_iterations; ++i) - solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON); - - cudaEventRecord(tstop); // ----------------------------------------- Timing end - cudaEventSynchronize(tstop); - float milliseconds = 0; - cudaEventElapsedTime(&milliseconds, tstart, tstop); - - ERRCHK_CUDA_KERNEL_ALWAYS(); - if (milliseconds < best_time) { - best_time = milliseconds; - best_dims = tpb; - } - } - } - } -#if VERBOSE_PRINTING - printf( - "Auto-optimization done.
The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n", - best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations); -#endif - /* - FILE* fp = fopen("../config/rk3_tbdims.cuh", "w"); - ERRCHK(fp); - fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z); - fclose(fp); - */ - - rk3_tpb = best_dims; - return AC_SUCCESS; -} - -#if PACKED_DATA_TRANSFERS -// Functions for calling packed data transfers -#endif - -/* - * ============================================================================= - * Revised interface - * ============================================================================= - */ diff --git a/src/core/grid.cu b/src/core/grid.cu index e69de29..83c7453 100644 --- a/src/core/grid.cu +++ b/src/core/grid.cu @@ -0,0 +1,19 @@ +/* + Copyright (C) 2014-2019, Johannes Pekkilae, Miikka Vaeisalae. + + This file is part of Astaroth. + + Astaroth is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Astaroth is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Astaroth. If not, see <http://www.gnu.org/licenses/>. +*/ +#include "astaroth_grid.h" diff --git a/src/core/node.cu b/src/core/node.cu index c3bfc05..1c3d5cf 100644 --- a/src/core/node.cu +++ b/src/core/node.cu @@ -16,7 +16,7 @@ You should have received a copy of the GNU General Public License along with Astaroth. If not, see <http://www.gnu.org/licenses/>. */ -// #include "astaroth_node.h" +#include "astaroth_node.h" struct node_s { };
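A usage sketch of the revised Stream-based interface declared in astaroth_device.h above (device setup is elided; STREAM_DEFAULT, the computational-domain extents nn/mm and the vertex buffer handle are assumed here and are not shown by this patch):

    // Apply periodic boundary conditions over the full local mesh, advance one
    // RK3 substep over the computational domain and reduce the maximum of one
    // vertex buffer, passing the same Stream to keep the calls ordered.
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
    const int3 end   = (int3){NGHOST + nn.x, NGHOST + nn.y, NGHOST + nn.z};
    acDevicePeriodicBoundcondStep(device, STREAM_DEFAULT, (int3){0, 0, 0}, mm);
    acDeviceIntegrateSubstep(device, STREAM_DEFAULT, 0, start, end, dt);
    AcReal result;
    acDeviceReduceScal(device, STREAM_DEFAULT, RTYPE_MAX, vtxbuf_handle, &result);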