Added multi-GPU reductions. Tested and working with 1-2 GPUs and power-of-two grid dimensions. Requires more testing in special cases (exotic grid dimensions and a large number of GPUs).

This commit is contained in:
jpekkila
2019-06-17 14:45:41 +03:00
parent 0ce689dbe4
commit 59086b3e79
5 changed files with 385 additions and 31 deletions

View File

@@ -158,16 +158,16 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
/*
Here we decompose the host mesh and distribute it among the GPUs in
the node.
The host mesh is a huge contiguous block of data. Its dimensions are given by
the global variable named "grid". A "grid" is decomposed into "subgrids",
one for each GPU. Here we check which parts of the range s0...s1 map
to the memory space stored by some GPU, spanning d0...d1, and transfer
the data if needed.
The index mapping is inherently quite involved, but here is a picture which
hopefully helps make sense of it all.
Grid
|----num_vertices---|
@@ -181,8 +181,8 @@ acLoadWithOffset(const AcMesh& host_mesh, const int3& src, const int num_vertice
^ ^
d0 d1
^ ^
db da
*/
for (int i = 0; i < num_devices; ++i) {
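As a rough illustration of the overlap check described in the comment above, the per-device copy boils down to clamping the requested host range against the device's slice. The helper and variable names below are illustrative assumptions, not the committed implementation:
// Hedged sketch: clamp the host source range [s0, s1) against one device's
// range [d0, d1) (flat vertex indices) and copy the intersection, if any.
static void copy_overlap_to_device(const AcReal* host_base, AcReal* device_base,
                                   const int s0, const int s1,
                                   const int d0, const int d1)
{
    const int copy_start = s0 > d0 ? s0 : d0; // first vertex this device owns in the range
    const int copy_end   = s1 < d1 ? s1 : d1; // one past the last such vertex
    if (copy_start < copy_end) {
        const size_t bytes = (size_t)(copy_end - copy_start) * sizeof(AcReal);
        cudaMemcpy(device_base + (copy_start - d0), // offset within the subgrid
                   host_base + (copy_start - s0),   // offset within the requested range
                   bytes, cudaMemcpyHostToDevice);
    }
}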
@@ -294,7 +294,7 @@ acBoundcondStep(void)
/*
// ===MIIKKANOTE START==========================================
%JP: The old way for computing boundary conditions conflicts with the
way we have to do things with multiple GPUs.
The older approach relied on unified memory, which represented the whole
@@ -303,7 +303,7 @@ in its current state is more meant for quick prototyping when performance is not
Getting the CUDA driver to migrate data intelligently across GPUs is much more difficult than
when managing the memory explicitly.
In this new approach, I have simplified the multi- and single-GPU layers significantly.
Quick rundown:
New struct: Grid. There are two global variables, "grid" and "subgrid", which
contain the extents of the whole simulation domain and the decomposed grids, respectively.
@@ -314,7 +314,7 @@ Quick rundown:
The whole simulation domain is decomposed with respect to the z dimension.
For example, if the grid contains (nx, ny, nz) vertices, then the subgrids
contain (nx, ny, nz / num_devices) vertices.
A local index (i, j, k) in some subgrid can be mapped to the global grid with
global idx = (i, j, k + device_id * subgrid.n.z)
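A minimal sketch of that mapping, assuming only the z-decomposition described above (the helper name and the explicit subgrid extent parameter are illustrative):
// Hedged sketch: map a local subgrid index to the corresponding global grid index.
static int3 local_to_global(const int3 local, const int device_id, const int3 subgrid_n)
{
    return (int3){local.x, local.y, local.z + device_id * subgrid_n.z};
}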
@@ -328,35 +328,35 @@ Changes required to this commented code block:
laid out in device memory
- The unified memory buffer no longer exists (d_buffer). Instead, we have an opaque handle
of type "Device" which should be passed to single-GPU functions. In this file, all devices
are stored in a global array "devices[num_devices]".
- Every single-GPU function is executed asynchronously by default such that we
can optimize Astaroth by executing memory transactions concurrently with computation.
Therefore a StreamType should be passed as a parameter to single-GPU functions.
Refresher: CUDA function calls are non-blocking when a stream is explicitly passed
as a parameter and commands executing in different streams can be processed
in parallel/concurrently.
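To make the refresher concrete, here is a small stand-alone illustration (not part of the commit; the kernel, buffers, and stream names are made up):
// Hedged illustration: work issued into distinct streams may overlap,
// and synchronizing one stream does not block the other.
__global__ void dummy_kernel(AcReal* data) { data[threadIdx.x] += AcReal(1.0); }

static void example_concurrency(AcReal* d_a, AcReal* d_b, const AcReal* h_b, const size_t bytes)
{
    cudaStream_t compute_stream, transfer_stream;
    cudaStreamCreate(&compute_stream);
    cudaStreamCreate(&transfer_stream);
    dummy_kernel<<<1, 32, 0, compute_stream>>>(d_a);                           // returns immediately (d_a holds >= 32 elements)
    cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, transfer_stream); // may overlap with the kernel
    cudaStreamSynchronize(compute_stream);  // waits for the kernel only
    cudaStreamSynchronize(transfer_stream); // waits for the copy
    cudaStreamDestroy(compute_stream);
    cudaStreamDestroy(transfer_stream);
}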
Note on periodic boundaries (might be helpful when implementing other boundary conditions):
With multiple GPUs, periodic boundary conditions applied on indices ranging from
(0, 0, STENCIL_ORDER/2) to (subgrid.m.x, subgrid.m.y, subgrid.m.z - STENCIL_ORDER/2)
on a single device are "local", in the sense that they can be computed without having
to exchange data with neighboring GPUs. Special care is needed only for transferring
the data to the front and back plates outside this range. In the solution we use here,
we solve the local boundaries first, and then just exchange the front and back plates
in a "ring", like so
in a "ring", like so
device_id
(n) <-> 0 <-> 1 <-> ... <-> n <-> (0)
// ======MIIKKANOTE END==========================================
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< MIIKKANOTE: This code block was essentially
moved into device.cu, function boundCondStep()
In astaroth.cu, we use acBoundcondStep()
just to distribute the work and manage
communication between GPUs.
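As a rough sketch of that ring exchange (the plate buffers, their sizes, and the assumption that peer access is enabled are illustrative, not the committed code):
// Hedged sketch: pass each device's back plate to the front plate of the next
// device in the ring; plates_src[i]/plates_dst[i] are assumed to be staging
// buffers resident on device i.
static void exchange_plates_ring(AcReal* plates_src[], AcReal* plates_dst[],
                                 const int num_devices, const size_t plate_bytes)
{
    for (int i = 0; i < num_devices; ++i) {
        const int next = (i + 1) % num_devices; // wraps around: n <-> (0)
        cudaMemcpyPeerAsync(plates_dst[next], next, // dst buffer, dst device
                            plates_src[i], i,       // src buffer, src device
                            plate_bytes, 0);        // default stream for brevity
    }
}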
@@ -364,8 +364,8 @@ Note on periodic boundaries (might be helpful when implementing other boundary c
exit(0);
#else
const int depth = (int)ceil(mesh_info.int_params[AC_mz]/(float)num_devices);
const int3 start = (int3){0, 0, device_id * depth};
@@ -378,8 +378,8 @@ Note on periodic boundaries (might be helpful when implementing other boundary c
// TODO uses the default stream currently
if (mesh_info.int_params[AC_bc_type] == 666) { // TODO MAKE A BETTER SWITCH
wedge_boundconds(0, tpb, start, end, d_buffer);
} else {
for (int i = 0; i < NUM_VTXBUF_HANDLES; ++i)
periodic_boundconds(0, tpb, start, end, d_buffer.in[i]);
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
*/
@@ -424,20 +424,52 @@ acIntegrate(const AcReal& dt)
return AC_SUCCESS;
}
static AcReal
simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, const int& n)
{
AcReal res = results[0];
for (int i = 1; i < n; ++i) {
if (rtype == RTYPE_MAX) {
res = max(res, results[i]);
} else if (rtype == RTYPE_MIN) {
res = min(res, results[i]);
} else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
res = sum(res, results[i]);
} else {
ERROR("Invalid rtype");
}
}
if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
res = sqrt(inv_n * res);
}
return res;
}
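For example, with two devices and RTYPE_RMS, each entry of results holds the sum of squares over one subgrid, so the combine step is a single division and square root. A brief usage illustration with made-up values:
// Hedged usage illustration; grid.n is assumed to be initialized.
const AcReal partials[] = {AcReal(12.5), AcReal(7.5)}; // per-device sums of squares
const AcReal rms = simple_final_reduce_scal(RTYPE_RMS, partials, 2);
// Equivalent to sqrt((12.5 + 7.5) / (grid.n.x * grid.n.y * grid.n.z))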
AcReal
acReduceScal(const ReductionType& rtype,
const VertexBufferHandle& vtxbuffer_handle)
{
// TODO
return 0;
AcReal results[num_devices];
for (int i = 0; i < num_devices; ++i) {
reduceScal(devices[i], STREAM_PRIMARY, rtype, vtxbuffer_handle, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcReal
acReduceVec(const ReductionType& rtype, const VertexBufferHandle& a,
const VertexBufferHandle& b, const VertexBufferHandle& c)
{
// TODO
return 0;
AcReal results[num_devices];
for (int i = 0; i < num_devices; ++i) {
reduceVec(devices[i], STREAM_PRIMARY, rtype, a, b, c, &results[i]);
}
return simple_final_reduce_scal(rtype, results, num_devices);
}
AcResult
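A possible call site for the new node-level reductions (the vertex buffer handles below are assumptions used only for illustration):
// Hedged usage sketch: reduce across all devices from user code.
const AcReal max_val = acReduceScal(RTYPE_MAX, VTXBUF_LNRHO);                      // hypothetical handle
const AcReal u_rms   = acReduceVec(RTYPE_RMS, VTXBUF_UUX, VTXBUF_UUY, VTXBUF_UUZ); // hypothetical handles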

View File

@@ -37,6 +37,7 @@ __constant__ AcMeshInfo d_mesh_info;
#define DCONST_INT(X) (d_mesh_info.int_params[X])
#define DCONST_REAL(X) (d_mesh_info.real_params[X])
#define DEVICE_VTXBUF_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_mx) + (k)*DCONST_INT(AC_mxy))
#define DEVICE_1D_COMPDOMAIN_IDX(i, j, k) ((i) + (j)*DCONST_INT(AC_nx) + (k)*DCONST_INT(AC_nxy))
#include "kernels/kernels.cuh"
struct device_s {
@@ -118,7 +119,7 @@ AcResult
createDevice(const int id, const AcMeshInfo device_config, Device* device_handle)
{
cudaSetDevice(id);
cudaDeviceReset();
// Create Device
struct device_s* device = (struct device_s*) malloc(sizeof(*device));
@@ -194,16 +195,40 @@ boundcondStep(const Device device, const StreamType stream_type, const int3& sta
}
AcResult
reduceScal(const Device device)
reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result)
{
cudaSetDevice(device->id);
*result = reduce_scal(device->streams[stream_type], rtype,
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
device->vba.in[vtxbuf_handle],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}
AcResult
reduceVec(const Device device)
reduceVec(const Device device, const StreamType stream_type,
const ReductionType rtype,
const VertexBufferHandle vec0,
const VertexBufferHandle vec1,
const VertexBufferHandle vec2,
AcReal* result)
{
cudaSetDevice(device->id);
*result = reduce_vec(device->streams[stream_type], rtype,
device->local_config.int_params[AC_nx],
device->local_config.int_params[AC_ny],
device->local_config.int_params[AC_nz],
device->vba.in[vec0],
device->vba.in[vec1],
device->vba.in[vec2],
device->reduce_scratchpad, device->reduce_result);
return AC_SUCCESS;
}

View File

@@ -51,10 +51,16 @@ AcResult boundcondStep(const Device device, const StreamType stream_type,
const int3& start, const int3& end);
/** */
AcResult reduceScal(const Device device);
AcResult reduceScal(const Device device, const StreamType stream_type, const ReductionType rtype,
const VertexBufferHandle vtxbuf_handle, AcReal* result);
/** */
AcResult reduceVec(const Device device);
AcResult reduceVec(const Device device, const StreamType stream_type,
const ReductionType rtype,
const VertexBufferHandle vec0,
const VertexBufferHandle vec1,
const VertexBufferHandle vec2,
AcReal* result);
/** */
AcResult rkStep(const Device device, const StreamType stream_type, const int step_number,

View File

@@ -754,7 +754,7 @@ randf(void)
}
AcResult
rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& start, const int3& end,
const AcReal dt, VertexBufferArray* buffer)
{
const dim3 tpb(32, 1, 4);
@@ -792,3 +792,294 @@ rk3_step_async(const cudaStream_t stream, const int& step_number, const int3& st
ERRCHK_CUDA_KERNEL();
return AC_SUCCESS;
}
////////////////REDUCE///////////////////////////
#include "src/core/math_utils.h" // is_power_of_two
// Function pointer definitions
typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
typedef AcReal (*ReduceInitialScalFunc)(const AcReal&);
typedef AcReal (*ReduceInitialVecFunc)(const AcReal&, const AcReal&,
const AcReal&);
// clang-format off
/* Comparison funcs */
__device__ inline AcReal
_device_max(const AcReal& a, const AcReal& b) { return a > b ? a : b; }
__device__ inline AcReal
_device_min(const AcReal& a, const AcReal& b) { return a < b ? a : b; }
__device__ inline AcReal
_device_sum(const AcReal& a, const AcReal& b) { return a + b; }
/* Function used to determine the values used during reduction */
__device__ inline AcReal
_device_length_scal(const AcReal& a) { return AcReal(a); }
__device__ inline AcReal
_device_squared_scal(const AcReal& a) { return (AcReal)(a*a); }
__device__ inline AcReal
_device_exp_squared_scal(const AcReal& a) { return exp(a)*exp(a); }
__device__ inline AcReal
_device_length_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return sqrt(a*a + b*b + c*c); }
__device__ inline AcReal
_device_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_squared_scal(a) + _device_squared_scal(b) + _device_squared_scal(c); }
__device__ inline AcReal
_device_exp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return _device_exp_squared_scal(a) + _device_exp_squared_scal(b) + _device_exp_squared_scal(c); }
// clang-format on
__device__ inline bool
oob(const int& i, const int& j, const int& k)
{
if (i >= d_mesh_info.int_params[AC_nx] ||
j >= d_mesh_info.int_params[AC_ny] ||
k >= d_mesh_info.int_params[AC_nz])
return true;
else
return false;
}
template <ReduceInitialScalFunc reduce_initial>
__global__ void
_kernel_reduce_scal(const __restrict__ AcReal* src, AcReal* dst)
{
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
const int k = threadIdx.z + blockIdx.z * blockDim.z;
if (oob(i, j, k))
return;
const int src_idx = DEVICE_VTXBUF_IDX(
i + d_mesh_info.int_params[AC_nx_min],
j + d_mesh_info.int_params[AC_ny_min],
k + d_mesh_info.int_params[AC_nz_min]);
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
dst[dst_idx] = reduce_initial(src[src_idx]);
}
template <ReduceInitialVecFunc reduce_initial>
__global__ void
_kernel_reduce_vec(const __restrict__ AcReal* src_a,
const __restrict__ AcReal* src_b,
const __restrict__ AcReal* src_c, AcReal* dst)
{
const int i = threadIdx.x + blockIdx.x * blockDim.x;
const int j = threadIdx.y + blockIdx.y * blockDim.y;
const int k = threadIdx.z + blockIdx.z * blockDim.z;
if (oob(i, j, k))
return;
const int src_idx = DEVICE_VTXBUF_IDX(
i + d_mesh_info.int_params[AC_nx_min],
j + d_mesh_info.int_params[AC_ny_min],
k + d_mesh_info.int_params[AC_nz_min]);
const int dst_idx = DEVICE_1D_COMPDOMAIN_IDX(i, j, k);
dst[dst_idx] = reduce_initial(src_a[src_idx], src_b[src_idx],
src_c[src_idx]);
}
///////////////////////////////////////////////////////////////////////////////
#define BLOCK_SIZE (1024)
#define ELEMS_PER_THREAD (32)
template <ReduceFunc reduce>
__global__ void
_kernel_reduce(AcReal* src, AcReal* result)
{
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
const int scratchpad_size = DCONST_INT(AC_nxyz);
if (idx >= scratchpad_size)
return;
__shared__ AcReal smem[BLOCK_SIZE];
AcReal tmp = src[idx];
for (int i = 1; i < ELEMS_PER_THREAD; ++i) {
const int src_idx = idx + i * BLOCK_SIZE;
if (src_idx >= scratchpad_size) {
// This check is for safety: if accessing uninitialized values
// beyond the mesh boundaries, we will immediately start seeing NANs
if (threadIdx.x < BLOCK_SIZE)
smem[threadIdx.x] = NAN;
else
break;
}
tmp = reduce(tmp, src[src_idx]);
}
smem[threadIdx.x] = tmp;
__syncthreads();
int offset = BLOCK_SIZE / 2;
while (offset > 0) {
if (threadIdx.x < offset) {
tmp = reduce(tmp, smem[threadIdx.x + offset]);
smem[threadIdx.x] = tmp;
}
offset /= 2;
__syncthreads();
}
if (threadIdx.x == 0)
src[idx] = tmp;
}
template <ReduceFunc reduce>
__global__ void
_kernel_reduce_block(const __restrict__ AcReal* src, AcReal* result)
{
const int scratchpad_size = DCONST_INT(AC_nxyz);
const int idx = threadIdx.x + blockIdx.x * BLOCK_SIZE * ELEMS_PER_THREAD;
AcReal tmp = src[idx];
const int block_offset = BLOCK_SIZE * ELEMS_PER_THREAD;
for (int i = 1; idx + i * block_offset < scratchpad_size; ++i)
tmp = reduce(tmp, src[idx + i * block_offset]);
*result = tmp;
}
//////////////////////////////////////////////////////////////////////////////
AcReal
reduce_scal(const cudaStream_t stream,
const ReductionType& rtype, const int& nx, const int& ny,
const int& nz, const AcReal* vertex_buffer,
AcReal* reduce_scratchpad, AcReal* reduce_result)
{
const dim3 tpb(32, 4, 1);
const dim3 bpg(int(ceil(AcReal(nx) / tpb.x)), int(ceil(AcReal(ny) / tpb.y)),
int(ceil(AcReal(nz) / tpb.z)));
const int scratchpad_size = nx * ny * nz;
const int bpg2 = (unsigned int)ceil(AcReal(scratchpad_size) /
AcReal(ELEMS_PER_THREAD * BLOCK_SIZE));
switch (rtype) {
case RTYPE_MAX:
_kernel_reduce_scal<_device_length_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_max>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_max>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_MIN:
_kernel_reduce_scal<_device_length_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_min>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_min>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS:
_kernel_reduce_scal<_device_squared_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS_EXP:
_kernel_reduce_scal<_device_exp_squared_scal>
<<<bpg, tpb, 0, stream>>>(vertex_buffer, reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
default:
ERROR("Unrecognized RTYPE");
}
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
}
AcReal
reduce_vec(const cudaStream_t stream,
const ReductionType& rtype, const int& nx, const int& ny,
const int& nz, const AcReal* vertex_buffer_a,
const AcReal* vertex_buffer_b, const AcReal* vertex_buffer_c,
AcReal* reduce_scratchpad, AcReal* reduce_result)
{
const dim3 tpb(32, 4, 1);
const dim3 bpg(int(ceil(float(nx) / tpb.x)),
int(ceil(float(ny) / tpb.y)),
int(ceil(float(nz) / tpb.z)));
const int scratchpad_size = nx * ny * nz;
const int bpg2 = (unsigned int)ceil(float(scratchpad_size) /
float(ELEMS_PER_THREAD * BLOCK_SIZE));
// "Features" of this quick & efficient reduction:
// Block size must be smaller than the computational domain size
// (otherwise we would have do some additional bounds checking in the
// second half of _kernel_reduce, which gets quite confusing)
// Also the BLOCK_SIZE must be a multiple of two s.t. we can easily split
// the work without worrying too much about the array bounds.
ERRCHK_ALWAYS(BLOCK_SIZE <= scratchpad_size);
ERRCHK_ALWAYS(!(BLOCK_SIZE % 2));
// NOTE! Also does not work properly with non-power of two mesh dimension
// Issue is with "smem[BLOCK_SIZE];". If you init smem to NANs, you can
// see that uninitialized smem values are used in the comparison
ERRCHK_ALWAYS(is_power_of_two(nx));
ERRCHK_ALWAYS(is_power_of_two(ny));
ERRCHK_ALWAYS(is_power_of_two(nz));
switch (rtype) {
case RTYPE_MAX:
_kernel_reduce_vec<_device_length_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_max>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_max>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_MIN:
_kernel_reduce_vec<_device_length_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_min>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_min>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS:
_kernel_reduce_vec<_device_squared_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
case RTYPE_RMS_EXP:
_kernel_reduce_vec<_device_exp_squared_vec>
<<<bpg, tpb, 0, stream>>>(vertex_buffer_a, vertex_buffer_b, vertex_buffer_c,
reduce_scratchpad);
_kernel_reduce<_device_sum>
<<<bpg2, BLOCK_SIZE, 0, stream>>>(reduce_scratchpad, reduce_result);
_kernel_reduce_block<_device_sum>
<<<1, 1, 0, stream>>>(reduce_scratchpad, reduce_result);
break;
default:
ERROR("Unrecognized RTYPE");
}
AcReal result;
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
return result;
}

View File

@@ -478,7 +478,7 @@ run_autotest(void)
//const vec3i test_dims[] = {{32, 32, 32}};
int num_failures = 0;
/*for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
config.int_params[AC_nx] = test_dims[i].x;
config.int_params[AC_ny] = test_dims[i].y;
config.int_params[AC_nz] = test_dims[i].z;
@@ -489,7 +489,7 @@ run_autotest(void)
num_failures += check_reductions(config);
fflush(stdout);
}*/ // TODO uncomment
}
for (size_t i = 0; i < ARRAY_SIZE(test_dims); ++i) {
config.int_params[AC_nx] = test_dims[i].x;