Added sum reduction. NOTE: Scalar sum does not pass the automated test but vector sum does. I couldn't see anything wrong with the code itself and I strongly suspect that the failures are caused by loss of precision due to summing a huge amount of numbers of different magnitudes. However I'm not yet completely sure. Something like the Kahan summation algorithm might be useful if the errors are really caused by fp arithmetic.
This commit is contained in:
@@ -474,7 +474,7 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
|
||||
else if (rtype == RTYPE_MIN) {
|
||||
res = min(res, results[i]);
|
||||
}
|
||||
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP) {
|
||||
else if (rtype == RTYPE_RMS || rtype == RTYPE_RMS_EXP || rtype == RTYPE_SUM) {
|
||||
res = sum(res, results[i]);
|
||||
}
|
||||
else {
|
||||
@@ -486,7 +486,6 @@ simple_final_reduce_scal(const ReductionType& rtype, const AcReal* results, cons
|
||||
const AcReal inv_n = AcReal(1.) / (grid.n.x * grid.n.y * grid.n.z);
|
||||
res = sqrt(inv_n * res);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
@@ -893,6 +893,10 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& st
|
||||
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_SUM) {
|
||||
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
@@ -944,6 +948,10 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& sta
|
||||
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else if (rtype == RTYPE_SUM) {
|
||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||
} else {
|
||||
ERROR("Unrecognized rtype");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user