The previous attempt (dsl_feature_completeness_2019-08-23) to enable arbitrary kernel functions was a failure: we get a significant performance loss (25-100%) if step_number is not passed as a template parameter to the integration kernel. Apparently the CUDA compiler cannot perform some optimizations if there is an if/else construct in a performance-critical part that cannot be evaluated at compile time. This branch keeps step_number as a template parameter but takes the rest of the user parameters as uniforms (dt is no longer passed as a function parameter but as a uniform with the DSL instead).
This commit is contained in:
@@ -303,8 +303,9 @@ acDeviceAutoOptimize(const Device device)
|
||||
|
||||
cudaEventRecord(tstart); // ---------------------------------------- Timing start
|
||||
|
||||
acDeviceLoadScalarConstant(device, STREAM_DEFAULT, AC_dt, FLT_EPSILON);
|
||||
for (int i = 0; i < num_iterations; ++i)
|
||||
solve<2><<<bpg, tpb>>>(start, end, device->vba, FLT_EPSILON);
|
||||
solve<2><<<bpg, tpb>>>(start, end, device->vba);
|
||||
|
||||
cudaEventRecord(tstop); // ----------------------------------------- Timing end
|
||||
cudaEventSynchronize(tstop);
|
||||
@@ -600,12 +601,13 @@ acDeviceIntegrateSubstep(const Device device, const Stream stream, const int ste
|
||||
(unsigned int)ceil(n.y / AcReal(tpb.y)), //
|
||||
(unsigned int)ceil(n.z / AcReal(tpb.z)));
|
||||
|
||||
acDeviceLoadScalarConstant(device, stream, AC_dt, dt);
|
||||
if (step_number == 0)
|
||||
solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
|
||||
solve<0><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
|
||||
else if (step_number == 1)
|
||||
solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
|
||||
solve<1><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
|
||||
else
|
||||
solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba, dt);
|
||||
solve<2><<<bpg, tpb, 0, device->streams[stream]>>>(start, end, device->vba);
|
||||
|
||||
ERRCHK_CUDA_KERNEL();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user