diff --git a/include/astaroth_defines.h b/include/astaroth_defines.h
index 7aa9751..1441967 100644
--- a/include/astaroth_defines.h
+++ b/include/astaroth_defines.h
@@ -105,6 +105,7 @@ typedef enum {
 } ReductionType;
 
 typedef enum { STREAM_DEFAULT, NUM_STREAM_TYPES } Stream;
+#define STREAM_ALL (-1)
 
 #define AC_GEN_ID(X) X
 typedef enum {
diff --git a/include/astaroth_device.h b/include/astaroth_device.h
index 48c8db2..929b591 100644
--- a/include/astaroth_device.h
+++ b/include/astaroth_device.h
@@ -88,7 +88,7 @@ AcResult acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* hos
 AcResult acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
                                                 const VertexBufferHandle vtxbuf_handle,
                                                 const int3 src, const int3 dst,
-                                                const int num_vertices, Device* dst_device);
+                                                const int num_vertices, Device dst_device);
 
 /** */
 AcResult acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream,
@@ -106,15 +106,21 @@ AcResult acDeviceTransferMesh(const Device src_device, const Stream stream, Devi
 AcResult acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
                                   const int3 start, const int3 end, const AcReal dt);
 /** */
-AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream, const int3 start,
+AcResult acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
+                                       const VertexBufferHandle vtxbuf_handle, const int3 start,
                                        const int3 end);
+
+/** */
+AcResult acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
+                                    const int3 end);
+
 /** */
 AcResult acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
                             const VertexBufferHandle vtxbuf_handle, AcReal* result);
 /** */
-AcResult acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype,
-                           const VertexBufferHandle vec0, const VertexBufferHandle vec1,
-                           const VertexBufferHandle vec2, AcReal* result);
+AcResult acDeviceReduceVec(const Device device, const Stream stream_type, const ReductionType rtype,
+                           const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                           const VertexBufferHandle vtxbuf2, AcReal* result);
 
 #ifdef __cplusplus
 } // extern "C"
diff --git a/src/core/device.cu b/src/core/device.cu
index 5d7dd69..dbee2cc 100644
--- a/src/core/device.cu
+++ b/src/core/device.cu
@@ -16,7 +16,602 @@
     You should have received a copy of the GNU General Public License
     along with Astaroth.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include "astaroth_device.h"
+
+/**
+ * @file
+ * \brief Brief info.
+ *
+ * Detailed info.
+ *
+ */
+#include "astaroth_device.cuh"
+
+#include "errchk.h"
+
+// Device info
+#define REGISTERS_PER_THREAD (255)
+#define MAX_REGISTERS_PER_BLOCK (65536)
+#define MAX_THREADS_PER_BLOCK (1024)
+#define WARP_SIZE (32)
+
+typedef struct {
+    AcReal* in[NUM_VTXBUF_HANDLES];
+    AcReal* out[NUM_VTXBUF_HANDLES];
+} VertexBufferArray;
+
+__constant__ AcMeshInfo d_mesh_info;
+__constant__ int3 d_multigpu_offset;
+__constant__ Grid globalGrid;
+#define DCONST_INT(X) (d_mesh_info.int_params[X])
+#define DCONST_INT3(X) (d_mesh_info.int3_params[X])
+#define DCONST_REAL(X) (d_mesh_info.real_params[X])
+#define DCONST_REAL3(X) (d_mesh_info.real3_params[X])
+#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
+#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
+#include "kernels/kernels.cuh"
+
+static dim3 rk3_tpb = (dim3){32, 1, 4};
+
+#if PACKED_DATA_TRANSFERS // Defined in device.cuh
+// #include "kernels/pack_unpack.cuh"
+#endif
 
 struct device_s {
+    int id;
+    AcMeshInfo local_config;
+
+    // Concurrency
+    cudaStream_t streams[NUM_STREAM_TYPES];
+
+    // Memory
+    VertexBufferArray vba;
+    AcReal* reduce_scratchpad;
+    AcReal* reduce_result;
+
+#if PACKED_DATA_TRANSFERS
+// Declare memory for buffers needed for packed data transfers here
+// AcReal* data_packing_buffer;
+#endif
 };
+
+static __global__ void
+dummy_kernel(void)
+{
+}
+
+AcResult
+acDeviceCreate(const int id, const AcMeshInfo device_config, Device* device_handle)
+{
+    cudaSetDevice(id);
+    cudaDeviceReset();
+
+    // Create Device
+    struct device_s* device = (struct device_s*)malloc(sizeof(*device));
+    ERRCHK_ALWAYS(device);
+
+    device->id           = id;
+    device->local_config = device_config;
+
+    // Check that the code was compiled for the proper GPU architecture
+    printf("Trying to run a dummy kernel. If this fails, make sure that your\n"
+           "device supports the CUDA architecture you are compiling for.\n"
+           "Running dummy kernel... ");
+    fflush(stdout);
+    dummy_kernel<<<1, 1>>>();
+    ERRCHK_CUDA_KERNEL_ALWAYS();
+    printf("Success!\n");
+
+    // Concurrency
+    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
+        cudaStreamCreateWithPriority(&device->streams[i], cudaStreamNonBlocking, 0);
+    }
+
+    // Memory
+    const size_t vba_size_bytes = acVertexBufferSizeBytes(device_config);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.in[i], vba_size_bytes));
+        ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->vba.out[i], vba_size_bytes));
+    }
+    ERRCHK_CUDA_ALWAYS(
+        cudaMalloc(&device->reduce_scratchpad, acVertexBufferCompdomainSizeBytes(device_config)));
+    ERRCHK_CUDA_ALWAYS(cudaMalloc(&device->reduce_result, sizeof(AcReal)));
+
+#if PACKED_DATA_TRANSFERS
+// Allocate data required for packed transfers here (cudaMalloc)
+#endif
+
+    // Device constants
+    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_mesh_info, &device_config, sizeof(device_config), 0,
+                                          cudaMemcpyHostToDevice));
+
+    // Multi-GPU offset. This is used to compute globalVertexIdx.
+    // Might be better to calculate this in astaroth.cu instead of here, s.t.
+    // everything related to the decomposition is limited to the multi-GPU layer
+    const int3 multigpu_offset = (int3){0, 0, device->id * device->local_config.int_params[AC_nz]};
+    ERRCHK_CUDA_ALWAYS(cudaMemcpyToSymbol(d_multigpu_offset, &multigpu_offset,
+                                          sizeof(multigpu_offset), 0, cudaMemcpyHostToDevice));
+
+    printf("Created device %d (%p)\n", device->id, device);
+    *device_handle = device;
+
+    // Autoptimize
+    if (id == 0) {
+        autoOptimize(device);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceDestroy(Device device)
+{
+    cudaSetDevice(device->id);
+    printf("Destroying device %d (%p)\n", device->id, device);
+
+    // Memory
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        cudaFree(device->vba.in[i]);
+        cudaFree(device->vba.out[i]);
+    }
+    cudaFree(device->reduce_scratchpad);
+    cudaFree(device->reduce_result);
+
+#if PACKED_DATA_TRANSFERS
+// Free data required for packed tranfers here (cudaFree)
+#endif
+
+    // Concurrency
+    for (int i = 0; i < NUM_STREAM_TYPES; ++i) {
+        cudaStreamDestroy(device->streams[i]);
+    }
+
+    // Destroy Device
+    free(device);
+    return AC_SUCCESS;
+}
+
+AcResult
+acDevicePrintInfo(const Device device)
+{
+    const int device_id = device->id;
+
+    cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, device_id);
+    printf("--------------------------------------------------\n");
+    printf("Device Number: %d\n", device_id);
+    const size_t bus_id_max_len = 128;
+    char bus_id[bus_id_max_len];
+    cudaDeviceGetPCIBusId(bus_id, bus_id_max_len, device_id);
+    printf("  PCI bus ID: %s\n", bus_id);
+    printf("    Device name: %s\n", props.name);
+    printf("    Compute capability: %d.%d\n", props.major, props.minor);
+
+    // Compute
+    printf("  Compute\n");
+    printf("    Clock rate (GHz): %g\n", props.clockRate / 1e6); // KHz -> GHz
+    printf("    Stream processors: %d\n", props.multiProcessorCount);
+    printf("    SP to DP flops performance ratio: %d:1\n", props.singleToDoublePrecisionPerfRatio);
+    printf(
+        "    Compute mode: %d\n",
+        (int)props
+            .computeMode); // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g7eb25f5413a962faad0956d92bae10d0
+    // Memory
+    printf("  Global memory\n");
+    printf("    Memory Clock Rate (MHz): %d\n", props.memoryClockRate / (1000));
+    printf("    Memory Bus Width (bits): %d\n", props.memoryBusWidth);
+    printf("    Peak Memory Bandwidth (GiB/s): %f\n",
+           2 * (props.memoryClockRate * 1e3) * props.memoryBusWidth / (8. * 1024. * 1024. * 1024.));
+    printf("    ECC enabled: %d\n", props.ECCEnabled);
+
+    // Memory usage
+    size_t free_bytes, total_bytes;
+    cudaMemGetInfo(&free_bytes, &total_bytes);
+    const size_t used_bytes = total_bytes - free_bytes;
+    printf("    Total global mem: %.2f GiB\n", props.totalGlobalMem / (1024.0 * 1024 * 1024));
+    printf("    Gmem used (GiB): %.2f\n", used_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory free (GiB): %.2f\n", free_bytes / (1024.0 * 1024 * 1024));
+    printf("    Gmem memory total (GiB): %.2f\n", total_bytes / (1024.0 * 1024 * 1024));
+    printf("  Caches\n");
+    printf("    Local L1 cache supported: %d\n", props.localL1CacheSupported);
+    printf("    Global L1 cache supported: %d\n", props.globalL1CacheSupported);
+    printf("    L2 size: %d KiB\n", props.l2CacheSize / (1024));
+    // MV: props.totalConstMem and props.sharedMemPerBlock cause assembler error
+    // MV: while compiling in TIARA gp cluster. Therefore commeted out.
+    //!!    printf("    Total const mem: %ld KiB\n", props.totalConstMem / (1024));
+    //!!    printf("    Shared mem per block: %ld KiB\n", props.sharedMemPerBlock / (1024));
+    printf("  Other\n");
+    printf("    Warp size: %d\n", props.warpSize);
+    // printf("    Single to double perf. ratio: %dx\n",
+    // props.singleToDoublePrecisionPerfRatio); //Not supported with older CUDA
+    // versions
+    printf("    Stream priorities supported: %d\n", props.streamPrioritiesSupported);
+    printf("--------------------------------------------------\n");
+
+    return AC_SUCCESS;
+}
+
+AcResult
+autoOptimize(const Device device)
+{
+    cudaSetDevice(device->id);
+
+    // RK3
+    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
+    const int3 end   = start + (int3){device->local_config.int_params[AC_nx], //
+                                    device->local_config.int_params[AC_ny], //
+                                    device->local_config.int_params[AC_nz]};
+
+    dim3 best_dims(0, 0, 0);
+    float best_time          = INFINITY;
+    const int num_iterations = 10;
+
+    for (int z = 1; z <= MAX_THREADS_PER_BLOCK; ++z) {
+        for (int y = 1; y <= MAX_THREADS_PER_BLOCK; ++y) {
+            for (int x = WARP_SIZE; x <= MAX_THREADS_PER_BLOCK; x += WARP_SIZE) {
+
+                if (x > end.x - start.x || y > end.y - start.y || z > end.z - start.z)
+                    break;
+                if (x * y * z > MAX_THREADS_PER_BLOCK)
+                    break;
+
+                if (x * y * z * REGISTERS_PER_THREAD > MAX_REGISTERS_PER_BLOCK)
+                    break;
+
+                if (((x * y * z) % WARP_SIZE) != 0)
+                    continue;
+
+                const dim3 tpb(x, y, z);
+                const int3 n = end - start;
+                const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                               (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                               (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+                cudaDeviceSynchronize();
+                if (cudaGetLastError() != cudaSuccess) // resets the error if any
+                    continue;
+
+                // printf("(%d, %d, %d)\n", x, y, z);
+
+                cudaEvent_t tstart, tstop;
+                cudaEventCreate(&tstart);
+                cudaEventCreate(&tstop);
+
+                cudaEventRecord(tstart); // ---------------------------------------- Timing start
+
+                for (int i = 0; i < num_iterations; ++i)
+                    solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
+
+                cudaEventRecord(tstop); // ----------------------------------------- Timing end
+                cudaEventSynchronize(tstop);
+                float milliseconds = 0;
+                cudaEventElapsedTime(&milliseconds, tstart, tstop);
+
+                ERRCHK_CUDA_KERNEL_ALWAYS();
+                if (milliseconds < best_time) {
+                    best_time = milliseconds;
+                    best_dims = tpb;
+                }
+            }
+        }
+    }
+#if VERBOSE_PRINTING
+    printf(
+        "Auto-optimization done. The best threadblock dimensions for rkStep: (%d, %d, %d) %f ms\n",
+        best_dims.x, best_dims.y, best_dims.z, double(best_time) / num_iterations);
+#endif
+    /*
+    FILE* fp = fopen("../config/rk3_tbdims.cuh", "w");
+    ERRCHK(fp);
+    fprintf(fp, "%d, %d, %d\n", best_dims.x, best_dims.y, best_dims.z);
+    fclose(fp);
+    */
+
+    rk3_tpb = best_dims;
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceSynchronizeStream(const Device device, const Stream stream)
+{
+    cudaSetDevice(device->id);
+    if (stream == STREAM_ALL) {
+        cudaDeviceSynchronize();
+    }
+    else {
+        cudaStreamSynchronize(device->streams[stream]);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceSwapBuffers(const Device device)
+{
+    cudaSetDevice(device->id);
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        AcReal* tmp        = device->vba.in[i];
+        device->vba.in[i]  = device->vba.out[i];
+        device->vba.out[i] = tmp;
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadConstant(const Device device, const Stream stream, const AcRealParam param,
+                     const AcReal value)
+{
+    cudaSetDevice(device->id);
+    const size_t offset = (size_t)&d_mesh_info.real_params[param] - (size_t)&d_mesh_info;
+    ERRCHK_CUDA(cudaMemcpyToSymbolAsync(d_mesh_info, &value, sizeof(value), offset,
+                                        cudaMemcpyHostToDevice, device->streams[stream]));
+    return AC_SUCCESS;
+}
+
+static AcResult
+load_with_offset(const Device device, const Stream stream, const AcReal* src, const size_t bytes,
+                 AcReal* dst)
+{
+    cudaSetDevice(device->id);
+    ERRCHK_CUDA(                                                                          //
+        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, device->streams[stream]) //
+    );
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadVertexBufferWithOffset(const Device device, const Stream stream, const AcMeshhost_mesh,
+                                   const VertexBufferHandle vtxbuf_handle, const int3src,
+                                   const int3dst, const int num_vertices)
+{
+    cudaSetDevice(device->id);
+    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, host_mesh.info);
+    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, device->local_config);
+    load_with_offset(device, stream, &host_mesh.vertex_buffer[vtxbuf_handle][src_idx],
+                     num_vertices * sizeof(AcReal), &device->vba.in[vtxbuf_handle][dst_idx]);
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadMeshWithOffset(const Device device, const Stream stream, const AcMeshhost_mesh,
+                           const int3src, const int3dst, const int num_vertices)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceLoadVertexBufferWithOffset(device, stream, host_mesh, (VertexBufferHandle)i, src,
+                                           dst, num_vertices);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadVertexBuffer(const Device device, const Stream stream, const AcMesh host_mesh,
+                         const VertexBufferHandle vtxbuf_handle)
+{
+    int3 src                  = (int3){0, 0, 0};
+    int3 dst                  = src;
+    const size_t num_vertices = acVertexBufferSize(device->local_config);
+    acLoadVertexBufferWithOffset(device, stream, host_mesh, vtxbuf_handle, src, dst, num_vertices);
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceLoadMesh(const Device device, const Stream stream, const AcMesh host_mesh)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceLoadVertexBuffer(device, stream, host_mesh, (VertexBufferHandle)i);
+    }
+
+    return AC_SUCCESS;
+}
+
+static AcResult
+store_with_offset(const Device device, const Stream stream, const AcReal* src, const size_t bytes,
+                  AcReal* dst)
+{
+    cudaSetDevice(device->id);
+    ERRCHK_CUDA(                                                                          //
+        cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, device->streams[stream]) //
+    );
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceStoreVertexBufferWithOffset(const Device device, const Stream stream,
+                                    const VertexBufferHandle vtxbuf_handle, const int3 src,
+                                    const int3 dst, const int num_vertices, AcMesh* host_mesh)
+{
+    cudaSetDevice(device->id);
+    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, device->local_config);
+    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, host_mesh->info);
+    store_with_offset(device, stream, &device->vba.in[vtxbuf_handle][src_idx],
+                      num_vertices * sizeof(AcReal),
+                      &host_mesh->vertex_buffer[vtxbuf_handle][dst_idx]);
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceStoreMeshWithOffset(const Device device, const Stream stream, const int3 src,
+                            const int3 dst, const int num_vertices, AcMesh* host_mesh)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceStoreVertexBufferWithOffset(device, stream, (VertexBufferHandle)i, src, dst,
+                                            num_vertices, host_mesh);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceStoreVertexBuffer(const Device device, const Stream stream,
+                          const VertexBufferHandle vtxbuf_handle, AcMesh* host_mesh)
+{
+    int3 src                  = (int3){0, 0, 0};
+    int3 dst                  = src;
+    const size_t num_vertices = acVertexBufferSize(device->local_config);
+
+    acDeviceStoreVertexBufferWithOffset(device, stream, vtxbuf_handle, src, dst, num_vertices,
+                                        host_mesh);
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceStoreMesh(const Device device, const Stream stream, AcMesh* host_mesh)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceStoreVertexBuffer(device, stream, (VertexBufferHandle)i, host_mesh);
+    }
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceTransferVertexBufferWithOffset(const Device src_device, const Stream stream,
+                                       const VertexBufferHandle vtxbuf_handle, const int3src,
+                                       const int3dst, const int num_vertices, Device dst_device)
+{
+    cudaSetDevice(src_device->id);
+    const size_t src_idx = acVertexBufferIdx(src.x, src.y, src.z, src_device->local_config);
+    const size_t dst_idx = acVertexBufferIdx(dst.x, dst.y, dst.z, dst_device->local_config);
+
+    ERRCHK_CUDA( //
+        cudaMemcpyPeerAsync(&dst_device->vba.in[vtxbuf_handle][dst_idx], dst_device->id,
+                            &src_device->vba.in[vtxbuf_handle][src_idx], src_device->id,
+                            sizeof(src_device->vba.in[vtxbuf_handle][0]) * num_vertices,
+                            src_device->streams[stream]) //
+    );
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceTransferMeshWithOffset(const Device src_device, const Stream stream, const int3src,
+                               const int3dst, const int num_vertices, Device dst_device)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
+                                               num_vertices, dst_device);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceTransferVertexBuffer(const Device src_device, const Stream stream,
+                             const VertexBufferHandle vtxbuf_handle, Device dst_device)
+{
+    int3 src                  = (int3){0, 0, 0};
+    int3 dst                  = src;
+    const size_t num_vertices = acVertexBufferSize(device->local_config);
+
+    acDeviceTransferVertexBufferWithOffset(src_device, stream, vtxbuf_handle, src, dst,
+                                           num_vertices, dst_device);
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceTransferMesh(const Device src_device, const Stream stream, Device* dst_device)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDeviceTransferVertexBuffer(src_device, stream, vtxbuf_handle, (VertexBufferHandle)i, src,
+                                     dst, num_vertices, dst_device);
+    }
+}
+
+AcResult
+acDeviceIntegrateSubstep(const Device device, const Stream stream, const int step_number,
+                         const int3 start, const int3 end, const AcReal dt)
+{
+    cudaSetDevice(device->id);
+    const cudaStream_t stream = device->streams[stream];
+
+    const dim3 tpb = rk3_tpb;
+
+    const int3 n = end - start;
+    const dim3 bpg((unsigned int)ceil(n.x / AcReal(tpb.x)), //
+                   (unsigned int)ceil(n.y / AcReal(tpb.y)), //
+                   (unsigned int)ceil(n.z / AcReal(tpb.z)));
+
+    if (step_number == 0)
+        solve<0><<<bpg, tpb, 0, stream>>>(start, end, device->vba, dt);
+    else if (step_number == 1)
+        solve<1><<<bpg, tpb, 0, stream>>>(start, end, device->vba, dt);
+    else
+        solve<2><<<bpg, tpb, 0, stream>>>(start, end, device->vba, dt);
+
+    ERRCHK_CUDA_KERNEL();
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDevicePeriodicBoundcondStep(const Device device, const Stream stream,
+                              const VertexBufferHandle vtxbuf_handle, const int3 start,
+                              const int3 end)
+{
+    cudaSetDevice(device->id);
+    const cudaStream_t stream = device->streams[stream];
+
+    const dim3 tpb(8, 2, 8);
+    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
+                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
+                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
+
+    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, device->vba.in[vtxbuf_handle]);
+    ERRCHK_CUDA_KERNEL();
+
+    return AC_SUCCESS;
+}
+
+AcResult
+acDevicePeriodicBoundconds(const Device device, const Stream stream, const int3 start,
+                           const int3 end)
+{
+    for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i) {
+        acDevicePeriodicBoundcondStep(device, stream, (VertexBufferHandle)i, start, end);
+    }
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceReduceScal(const Device device, const Stream stream, const ReductionType rtype,
+                   const VertexBufferHandle vtxbuf_handle, AcReal* result)
+{
+    cudaSetDevice(device->id);
+
+    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
+                              device->local_config.int_params[AC_ny_min],
+                              device->local_config.int_params[AC_nz_min]};
+
+    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
+                            device->local_config.int_params[AC_ny_max],
+                            device->local_config.int_params[AC_nz_max]};
+
+    *result = reduce_scal(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf_handle],
+                          device->reduce_scratchpad, device->reduce_result);
+    return AC_SUCCESS;
+}
+
+AcResult
+acDeviceReduceVec(const Device device, const Stream stream, const ReductionType rtype,
+                  const VertexBufferHandle vtxbuf0, const VertexBufferHandle vtxbuf1,
+                  const VertexBufferHandle vtxbuf2, AcReal* result)
+{
+    cudaSetDevice(device->id);
+
+    const int3 start = (int3){device->local_config.int_params[AC_nx_min],
+                              device->local_config.int_params[AC_ny_min],
+                              device->local_config.int_params[AC_nz_min]};
+
+    const int3 end = (int3){device->local_config.int_params[AC_nx_max],
+                            device->local_config.int_params[AC_ny_max],
+                            device->local_config.int_params[AC_nz_max]};
+
+    *result = reduce_vec(device->streams[stream], rtype, start, end, device->vba.in[vtxbuf0],
+                         device->vba.in[vtxbuf1], device->vba.in[vtxbuf2],
+                         device->reduce_scratchpad, device->reduce_result);
+    return AC_SUCCESS;
+}
+
+#if PACKED_DATA_TRANSFERS
+// Functions for calling packed data transfers
+#endif
diff --git a/src/core/kernels/kernels.cuh b/src/core/kernels/kernels.cuh
index e8e4042..7a4e54b 100644
--- a/src/core/kernels/kernels.cuh
+++ b/src/core/kernels/kernels.cuh
@@ -74,18 +74,6 @@ kernel_periodic_boundconds(const int3 start, const int3 end, AcReal* vtxbuf)
     vtxbuf[dst_idx]   = vtxbuf[src_idx];
 }
 
-void
-periodic_boundconds(const cudaStream_t stream, const int3& start, const int3& end, AcReal* vtxbuf)
-{
-    const dim3 tpb(8, 2, 8);
-    const dim3 bpg((unsigned int)ceil((end.x - start.x) / (float)tpb.x),
-                   (unsigned int)ceil((end.y - start.y) / (float)tpb.y),
-                   (unsigned int)ceil((end.z - start.z) / (float)tpb.z));
-
-    kernel_periodic_boundconds<<<bpg, tpb, 0, stream>>>(start, end, vtxbuf);
-    ERRCHK_CUDA_KERNEL();
-}
-
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 #include <assert.h>