Cleaned reductions a bit

This commit is contained in:
jpekkila
2019-06-17 17:52:14 +03:00
parent 70c047ec8c
commit 361725adc6

View File

@@ -926,14 +926,13 @@ _kernel_reduce(AcReal* src, AcReal* result)
 while (offset > 0) {
 if (threadIdx.x < offset) {
-tmp = reduce(tmp, smem[threadIdx.x + offset]);
-smem[threadIdx.x] = tmp;
+smem[threadIdx.x] = reduce(smem[threadIdx.x], smem[threadIdx.x + offset]);
 }
 offset /= 2;
 __syncthreads();
 }
 if (threadIdx.x == 0)
-src[idx] = tmp;
+src[idx] = smem[threadIdx.x];
 }
 template <ReduceFunc reduce>
@@ -1080,6 +1079,7 @@ reduce_vec(const cudaStream_t stream,
 }
 AcReal result;
-cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
+cudaMemcpyAsync(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost, stream);
+cudaStreamSynchronize(stream);
 return result;
 }