git.dujemihanovic.xyz Git - linux.git/commitdiff
drm/xe/guc: Handle timing out of signaled jobs gracefully
author Matthew Brost <matthew.brost@intel.com>
Fri, 23 Feb 2024 20:46:59 +0000 (12:46 -0800)
committer Matthew Brost <matthew.brost@intel.com>
Mon, 26 Feb 2024 21:07:18 +0000 (13:07 -0800)
Timing out of signaled jobs can happen during regular operations (e.g.
an exec queue closed immediately after its last fence signaled). The TDR
can fire before the worker that frees jobs has run. Rather than running
through the TDR when a signaled job is found, simply free it without any
debug messages.

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reported-by: José Roberto de Souza <jose.souza@intel.com>
Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1271
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Tested-by: José Roberto de Souza <jose.souza@intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240223204659.40750-1-matthew.brost@intel.com
drivers/gpu/drm/xe/xe_guc_submit.c

index ff77bc8da1b27052debc63e06945dc55bd23f100..29748e40555fc1d3359ff5fc9ba9b39b4a6d3e8b 100644 (file)
@@ -929,20 +929,26 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
        int err = -ETIME;
        int i = 0;
 
-       if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) {
-               drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx",
-                          xe_sched_job_seqno(job), q->guc->id, q->flags);
-               xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
-                          "Kernel-submitted job timed out\n");
-               xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
-                          "VM job timed out on non-killed execqueue\n");
-
-               simple_error_capture(q);
-               xe_devcoredump(job);
-       } else {
-               drm_dbg(&xe->drm, "Timedout signaled job: seqno=%u, guc_id=%d, flags=0x%lx",
-                        xe_sched_job_seqno(job), q->guc->id, q->flags);
+       /*
+        * TDR has fired before free job worker. Common if exec queue
+        * immediately closed after last fence signaled.
+        */
+       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) {
+               guc_exec_queue_free_job(drm_job);
+
+               return DRM_GPU_SCHED_STAT_NOMINAL;
        }
+
+       drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx",
+                  xe_sched_job_seqno(job), q->guc->id, q->flags);
+       xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
+                  "Kernel-submitted job timed out\n");
+       xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
+                  "VM job timed out on non-killed execqueue\n");
+
+       simple_error_capture(q);
+       xe_devcoredump(job);
+
        trace_xe_sched_job_timedout(job);
 
        /* Kill the run_job entry point */