diff --git a/src/core/kernels/kernels.h b/src/core/kernels/kernels.h
index 577567b..fc9c745 100644
--- a/src/core/kernels/kernels.h
+++ b/src/core/kernels/kernels.h
@@ -72,6 +72,12 @@ AcReal acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, c
                          const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
                          const AcReal* vtxbuf2, AcReal* scratchpad, AcReal* reduce_result);
 
+/** Reduces a vector field combined with a scalar field (see the RTYPE_ALFVEN_* reduction types) */
+AcReal acKernelReduceVecScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
+                             const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
+                             const AcReal* vtxbuf2, const AcReal* vtxbuf3, AcReal* scratchpad,
+                             AcReal* reduce_result);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/src/core/kernels/reductions.cuh b/src/core/kernels/reductions.cuh
index 1f40df3..ba9e823 100644
--- a/src/core/kernels/reductions.cuh
+++ b/src/core/kernels/reductions.cuh
@@ -10,6 +10,7 @@ Reduction steps:
 // Function pointer definitions
 typedef AcReal (*FilterFunc)(const AcReal&);
 typedef AcReal (*FilterFuncVec)(const AcReal&, const AcReal&, const AcReal&);
+typedef AcReal (*FilterFuncVecScal)(const AcReal&, const AcReal&, const AcReal&, const AcReal&);
 typedef AcReal (*ReduceFunc)(const AcReal&, const AcReal&);
 
 // clang-format off
@@ -41,6 +42,15 @@ dsquared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dsquare
 
 static __device__ inline AcReal
 dexp_squared_vec(const AcReal& a, const AcReal& b, const AcReal& c) { return dexp_squared(a) + dexp_squared(b) + dexp_squared(c); }
+
+// Alfven speed |B| / sqrt(4 pi rho) from the magnetic field (a, b, c) and
+// the logarithmic density d = ln(rho)
+static __device__ inline AcReal
+dlength_alf(const AcReal& a, const AcReal& b, const AcReal& c, const AcReal& d) { return sqrt(a*a + b*b + c*c) / sqrt(AcReal(4.0)*M_PI*exp(d)); }
+
+// Squared Alfven speed, used for the RMS reduction
+static __device__ inline AcReal
+dsquared_alf(const AcReal& a, const AcReal& b, const AcReal& c, const AcReal& d) { return (dsquared(a) + dsquared(b) + dsquared(c)) / (AcReal(4.0)*M_PI*exp(d)); }
 // clang-format on
 
 #include <assert.h>
@@ -97,6 +107,34 @@ kernel_filter_vec(const __restrict__ AcReal* src0, const __restrict__ AcReal* sr
                       src2[IDX(src_idx)]);
 }
 
+template <FilterFuncVecScal filter>
+static __global__ void
+kernel_filter_vec_scal(const __restrict__ AcReal* src0, const __restrict__ AcReal* src1,
+                       const __restrict__ AcReal* src2, const __restrict__ AcReal* src3,
+                       const int3 start, const int3 end, AcReal* dst)
+{
+    const int3 src_idx = (int3){start.x + threadIdx.x + blockIdx.x * blockDim.x,
+                                start.y + threadIdx.y + blockIdx.y * blockDim.y,
+                                start.z + threadIdx.z + blockIdx.z * blockDim.z};
+
+    const int nx = end.x - start.x;
+    const int ny = end.y - start.y;
+    const int nz = end.z - start.z;
+    (void)nz; // Suppress the unused-variable warning when asserts are compiled out
+
+    const int3 dst_idx = (int3){threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z};
+
+    assert(src_idx.x < DCONST(AC_nx_max) && src_idx.y < DCONST(AC_ny_max) &&
+           src_idx.z < DCONST(AC_nz_max));
+    assert(dst_idx.x < nx && dst_idx.y < ny && dst_idx.z < nz);
+    assert(dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny < nx * ny * nz);
+
+    dst[dst_idx.x + dst_idx.y * nx + dst_idx.z * nx * ny] = filter(
+        src0[IDX(src_idx)], src1[IDX(src_idx)], src2[IDX(src_idx)], src3[IDX(src_idx)]);
+}
+
 template <ReduceFunc reduce>
 static __global__ void
 kernel_reduce(AcReal* scratchpad, const int num_elems)
@@ -253,3 +291,54 @@ acKernelReduceVec(const cudaStream_t stream, const ReductionType rtype, const in
     cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
     return result;
 }
+
+AcReal
+acKernelReduceVecScal(const cudaStream_t stream, const ReductionType rtype, const int3 start,
+                      const int3 end, const AcReal* vtxbuf0, const AcReal* vtxbuf1,
+                      const AcReal* vtxbuf2, const AcReal* vtxbuf3, AcReal* scratchpad,
+                      AcReal* reduce_result)
+{
+    const unsigned nx        = end.x - start.x;
+    const unsigned ny        = end.y - start.y;
+    const unsigned nz        = end.z - start.z;
+    const unsigned num_elems = nx * ny * nz;
+
+    const dim3 tpb_filter(32, 4, 1);
+    const dim3 bpg_filter((unsigned int)ceil(nx / AcReal(tpb_filter.x)),
+                          (unsigned int)ceil(ny / AcReal(tpb_filter.y)),
+                          (unsigned int)ceil(nz / AcReal(tpb_filter.z)));
+
+    const int tpb_reduce = 128;
+    const int bpg_reduce = num_elems / tpb_reduce;
+
+    ERRCHK(nx >= tpb_filter.x);
+    ERRCHK(ny >= tpb_filter.y);
+    ERRCHK(nz >= tpb_filter.z);
+    ERRCHK(tpb_reduce <= num_elems);
+    ERRCHK(nx * ny * nz % 2 == 0);
+
+    // NOTE: Currently used only to calculate Alfven speeds for the diagnostics.
+
+    // clang-format off
+    if (rtype == RTYPE_ALFVEN_MAX) {
+        kernel_filter_vec_scal<dlength_alf><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, vtxbuf3, start, end, scratchpad);
+        kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_ALFVEN_MIN) {
+        kernel_filter_vec_scal<dlength_alf><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, vtxbuf3, start, end, scratchpad);
+        kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else if (rtype == RTYPE_ALFVEN_RMS) {
+        kernel_filter_vec_scal<dsquared_alf><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, vtxbuf3, start, end, scratchpad);
+        kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
+        kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
+    } else {
+        ERROR("Unrecognized rtype");
+    }
+    // clang-format on
+
+    cudaStreamSynchronize(stream);
+    AcReal result;
+    cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+    return result;
+}
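
Note on the new device filters: assuming the fourth argument d holds the
logarithmic density (so that exp(d) = rho) and (a, b, c) are the magnetic
field components in Gaussian units, dlength_alf and dsquared_alf evaluate
the Alfvén speed and its square:

    v_A = \frac{\sqrt{a^2 + b^2 + c^2}}{\sqrt{4 \pi e^{d}}},
    \qquad
    v_A^2 = \frac{a^2 + b^2 + c^2}{4 \pi e^{d}}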
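
A minimal host-side sketch of the new entry point, reducing the maximum
Alfvén speed over a subdomain. All buffer and extent names below (bbx, bby,
bbz, lnrho, nx, ny, nz, NGHOST) are illustrative placeholders, not part of
this patch; the pointers must reference device memory, with scratchpad sized
for at least (end - start) elements and reduce_result holding a single
AcReal:

    // Reduce over the computational subdomain, excluding the ghost zones.
    const int3 start = (int3){NGHOST, NGHOST, NGHOST};
    const int3 end   = (int3){NGHOST + nx, NGHOST + ny, NGHOST + nz};

    // vtxbuf0..2 = magnetic field components, vtxbuf3 = log density.
    const AcReal vA_max = acKernelReduceVecScal(stream, RTYPE_ALFVEN_MAX,
                                                start, end, bbx, bby, bbz,
                                                lnrho, scratchpad,
                                                reduce_result);

As in acKernelReduceVec, bpg_reduce = num_elems / tpb_reduce uses integer
division, so the subdomain size is effectively assumed to be a multiple of
tpb_reduce (128). For RTYPE_ALFVEN_RMS the returned value is the sum of
v_A^2 over the subdomain; the final divide-by-N and square root are
presumably applied by the caller, as with the existing RTYPE_RMS reduction.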