Disabled automated formatting in some parts where overfull lines are easier to read
This commit is contained in:
@@ -946,41 +946,27 @@ reduce_scal(const cudaStream_t stream, const ReductionType rtype, const int3& st
|
|||||||
ERRCHK(tpb_reduce <= num_elems);
|
ERRCHK(tpb_reduce <= num_elems);
|
||||||
ERRCHK(nx * ny * nz % 2 == 0);
|
ERRCHK(nx * ny * nz % 2 == 0);
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
if (rtype == RTYPE_MAX) {
|
if (rtype == RTYPE_MAX) {
|
||||||
kernel_filter<dvalue>
|
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
scratchpad, num_elems);
|
} else if (rtype == RTYPE_MIN) {
|
||||||
kernel_reduce_block<dmax>
|
kernel_filter<dvalue><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
}
|
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
else if (rtype == RTYPE_MIN) {
|
} else if (rtype == RTYPE_RMS) {
|
||||||
kernel_filter<dvalue>
|
kernel_filter<dsquared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
scratchpad, num_elems);
|
} else if (rtype == RTYPE_RMS_EXP) {
|
||||||
kernel_reduce_block<dmin>
|
kernel_filter<dexp_squared><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
||||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
}
|
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
else if (rtype == RTYPE_RMS) {
|
} else {
|
||||||
kernel_filter<dsquared>
|
|
||||||
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
|
||||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
|
||||||
scratchpad, num_elems);
|
|
||||||
kernel_reduce_block<dsum>
|
|
||||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
|
||||||
}
|
|
||||||
else if (rtype == RTYPE_RMS_EXP) {
|
|
||||||
kernel_filter<dexp_squared>
|
|
||||||
<<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf, start, end, scratchpad);
|
|
||||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
|
||||||
scratchpad, num_elems);
|
|
||||||
kernel_reduce_block<dsum>
|
|
||||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
ERROR("Unrecognized rtype");
|
ERROR("Unrecognized rtype");
|
||||||
}
|
}
|
||||||
|
// clang-format on
|
||||||
AcReal result;
|
AcReal result;
|
||||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||||
return result;
|
return result;
|
||||||
@@ -1010,41 +996,27 @@ reduce_vec(const cudaStream_t stream, const ReductionType rtype, const int3& sta
|
|||||||
ERRCHK(tpb_reduce <= num_elems);
|
ERRCHK(tpb_reduce <= num_elems);
|
||||||
ERRCHK(nx * ny * nz % 2 == 0);
|
ERRCHK(nx * ny * nz % 2 == 0);
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
if (rtype == RTYPE_MAX) {
|
if (rtype == RTYPE_MAX) {
|
||||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
kernel_reduce<dmax><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
kernel_reduce_block<dmax><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
scratchpad, num_elems);
|
} else if (rtype == RTYPE_MIN) {
|
||||||
kernel_reduce_block<dmax>
|
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
}
|
kernel_reduce_block<dmin><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
else if (rtype == RTYPE_MIN) {
|
} else if (rtype == RTYPE_RMS) {
|
||||||
kernel_filter_vec<dlength_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
kernel_reduce<dmin><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
scratchpad, num_elems);
|
} else if (rtype == RTYPE_RMS_EXP) {
|
||||||
kernel_reduce_block<dmin>
|
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
||||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(scratchpad, num_elems);
|
||||||
}
|
kernel_reduce_block<dsum><<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
||||||
else if (rtype == RTYPE_RMS) {
|
} else {
|
||||||
kernel_filter_vec<dsquared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
|
||||||
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
|
||||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
|
||||||
scratchpad, num_elems);
|
|
||||||
kernel_reduce_block<dsum>
|
|
||||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
|
||||||
}
|
|
||||||
else if (rtype == RTYPE_RMS_EXP) {
|
|
||||||
kernel_filter_vec<dexp_squared_vec><<<bpg_filter, tpb_filter, 0, stream>>>(
|
|
||||||
vtxbuf0, vtxbuf1, vtxbuf2, start, end, scratchpad);
|
|
||||||
kernel_reduce<dsum><<<bpg_reduce, tpb_reduce, sizeof(AcReal) * tpb_reduce, stream>>>(
|
|
||||||
scratchpad, num_elems);
|
|
||||||
kernel_reduce_block<dsum>
|
|
||||||
<<<1, 1, 0, stream>>>(scratchpad, bpg_reduce, tpb_reduce, reduce_result);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
ERROR("Unrecognized rtype");
|
ERROR("Unrecognized rtype");
|
||||||
}
|
}
|
||||||
|
// clang-format on
|
||||||
AcReal result;
|
AcReal result;
|
||||||
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
cudaMemcpy(&result, reduce_result, sizeof(AcReal), cudaMemcpyDeviceToHost);
|
||||||
return result;
|
return result;
|
||||||
|
Reference in New Issue
Block a user