From 25f2ff53838ccbd5ce558b5d23fac8a5d7f86655 Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst
Date: Thu, 5 Sep 2024 17:00:50 +0200
Subject: drm/xe: Remove runtime argument from display s/r functions

The previous change ensures that pm_suspend is only called when
suspending or resuming. This ensures no further bugs like those in the
previous commit.

Signed-off-by: Maarten Lankhorst
Reviewed-by: Lucas De Marchi
Reviewed-by: Vinod Govindapillai
Link: https://patchwork.freedesktop.org/patch/msgid/20240905150052.174895-3-maarten.lankhorst@linux.intel.com
(cherry picked from commit f90491d4b64e302e940133103d3d9908e70e454f)
Signed-off-by: Lucas De Marchi
---
 drivers/gpu/drm/xe/display/xe_display.c | 53 ++++++++++++++++++++-------------
 drivers/gpu/drm/xe/display/xe_display.h |  8 ++---
 drivers/gpu/drm/xe/xe_pm.c              |  6 ++--
 3 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c
index 75736faf2a80..1c25c4f6a53b 100644
--- a/drivers/gpu/drm/xe/display/xe_display.c
+++ b/drivers/gpu/drm/xe/display/xe_display.c
@@ -309,18 +309,7 @@ static void xe_display_flush_cleanup_work(struct xe_device *xe)
 }
 
 /* TODO: System and runtime suspend/resume sequences will be sanitized as a follow-up. */
-void xe_display_pm_runtime_suspend(struct xe_device *xe)
-{
-	if (!xe->info.probe_display)
-		return;
-
-	if (xe->d3cold.allowed)
-		xe_display_pm_suspend(xe, true);
-
-	intel_hpd_poll_enable(xe);
-}
-
-void xe_display_pm_suspend(struct xe_device *xe, bool runtime)
+static void __xe_display_pm_suspend(struct xe_device *xe, bool runtime)
 {
 	struct intel_display *display = &xe->display;
 	bool s2idle = suspend_to_idle();
@@ -355,26 +344,31 @@ void xe_display_pm_suspend(struct xe_device *xe, bool runtime)
 	intel_dmc_suspend(xe);
 }
 
-void xe_display_pm_suspend_late(struct xe_device *xe)
+void xe_display_pm_suspend(struct xe_device *xe)
+{
+	__xe_display_pm_suspend(xe, false);
+}
+
+void xe_display_pm_runtime_suspend(struct xe_device *xe)
 {
-	bool s2idle = suspend_to_idle();
 	if (!xe->info.probe_display)
 		return;
 
-	intel_power_domains_suspend(xe, s2idle);
+	if (xe->d3cold.allowed)
+		__xe_display_pm_suspend(xe, true);
 
-	intel_display_power_suspend_late(xe);
+	intel_hpd_poll_enable(xe);
 }
 
-void xe_display_pm_runtime_resume(struct xe_device *xe)
+void xe_display_pm_suspend_late(struct xe_device *xe)
 {
+	bool s2idle = suspend_to_idle();
 	if (!xe->info.probe_display)
 		return;
 
-	intel_hpd_poll_disable(xe);
+	intel_power_domains_suspend(xe, s2idle);
 
-	if (xe->d3cold.allowed)
-		xe_display_pm_resume(xe, true);
+	intel_display_power_suspend_late(xe);
 }
 
 void xe_display_pm_resume_early(struct xe_device *xe)
@@ -387,7 +381,7 @@ void xe_display_pm_resume_early(struct xe_device *xe)
 	intel_power_domains_resume(xe);
 }
 
-void xe_display_pm_resume(struct xe_device *xe, bool runtime)
+static void __xe_display_pm_resume(struct xe_device *xe, bool runtime)
 {
 	struct intel_display *display = &xe->display;
 
@@ -421,6 +415,23 @@ void xe_display_pm_resume(struct xe_device *xe, bool runtime)
 	intel_power_domains_enable(xe);
 }
 
+void xe_display_pm_resume(struct xe_device *xe)
+{
+	__xe_display_pm_resume(xe, false);
+}
+
+void xe_display_pm_runtime_resume(struct xe_device *xe)
+{
+	if (!xe->info.probe_display)
+		return;
+
+	intel_hpd_poll_disable(xe);
+
+	if (xe->d3cold.allowed)
+		__xe_display_pm_resume(xe, true);
+}
+
+
 static void display_device_remove(struct drm_device *dev, void *arg)
 {
 	struct xe_device *xe = arg;
diff --git a/drivers/gpu/drm/xe/display/xe_display.h b/drivers/gpu/drm/xe/display/xe_display.h
index 53d727fd792b..bed55fd26f30 100644
--- a/drivers/gpu/drm/xe/display/xe_display.h
+++ b/drivers/gpu/drm/xe/display/xe_display.h
@@ -34,10 +34,10 @@ void xe_display_irq_enable(struct xe_device *xe, u32 gu_misc_iir);
 void xe_display_irq_reset(struct xe_device *xe);
 void xe_display_irq_postinstall(struct xe_device *xe, struct xe_gt *gt);
 
-void xe_display_pm_suspend(struct xe_device *xe, bool runtime);
+void xe_display_pm_suspend(struct xe_device *xe);
 void xe_display_pm_suspend_late(struct xe_device *xe);
 void xe_display_pm_resume_early(struct xe_device *xe);
-void xe_display_pm_resume(struct xe_device *xe, bool runtime);
+void xe_display_pm_resume(struct xe_device *xe);
 void xe_display_pm_runtime_suspend(struct xe_device *xe);
 void xe_display_pm_runtime_resume(struct xe_device *xe);
 
@@ -65,10 +65,10 @@ static inline void xe_display_irq_enable(struct xe_device *xe, u32 gu_misc_iir)
 static inline void xe_display_irq_reset(struct xe_device *xe) {}
 static inline void xe_display_irq_postinstall(struct xe_device *xe, struct xe_gt *gt) {}
 
-static inline void xe_display_pm_suspend(struct xe_device *xe, bool runtime) {}
+static inline void xe_display_pm_suspend(struct xe_device *xe) {}
 static inline void xe_display_pm_suspend_late(struct xe_device *xe) {}
 static inline void xe_display_pm_resume_early(struct xe_device *xe) {}
-static inline void xe_display_pm_resume(struct xe_device *xe, bool runtime) {}
+static inline void xe_display_pm_resume(struct xe_device *xe) {}
 static inline void xe_display_pm_runtime_suspend(struct xe_device *xe) {}
 static inline void xe_display_pm_runtime_resume(struct xe_device *xe) {}
 
diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
index 7cf2160fe040..33eb039053e4 100644
--- a/drivers/gpu/drm/xe/xe_pm.c
+++ b/drivers/gpu/drm/xe/xe_pm.c
@@ -123,7 +123,7 @@ int xe_pm_suspend(struct xe_device *xe)
 	for_each_gt(gt, xe, id)
 		xe_gt_suspend_prepare(gt);
 
-	xe_display_pm_suspend(xe, false);
+	xe_display_pm_suspend(xe);
 
 	/* FIXME: Super racey... */
 	err = xe_bo_evict_all(xe);
@@ -133,7 +133,7 @@ int xe_pm_suspend(struct xe_device *xe)
 	for_each_gt(gt, xe, id) {
 		err = xe_gt_suspend(gt);
 		if (err) {
-			xe_display_pm_resume(xe, false);
+			xe_display_pm_resume(xe);
 			goto err;
 		}
 	}
@@ -187,7 +187,7 @@ int xe_pm_resume(struct xe_device *xe)
 	for_each_gt(gt, xe, id)
 		xe_gt_resume(gt);
 
-	xe_display_pm_resume(xe, false);
+	xe_display_pm_resume(xe);
 
 	err = xe_bo_restore_user(xe);
 	if (err)
-- 
cgit
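A note on the pattern in the patch above, not part of the original commit message: the bool runtime argument is pushed down into one static helper and the exported entry points fix the flag themselves, so system suspend/resume callers can no longer pass the wrong value. The stand-alone sketch below only models that shape; the types, names and bodies are invented for illustration and are not xe driver code.

/*
 * Stand-alone sketch of the wrapper pattern used in the patch above: the
 * bool flag stays private to one internal helper and the exported entry
 * points fix it by name. Invented names and bodies, not xe driver code.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_dev {
	bool d3cold_allowed;	/* models xe->d3cold.allowed */
};

/* Single internal worker keeps the whole suspend sequence in one place. */
static void __toy_pm_suspend(struct toy_dev *dev, bool runtime)
{
	printf("suspend sequence: runtime=%d d3cold_allowed=%d\n",
	       runtime, dev->d3cold_allowed);
}

/* System suspend: the flag is fixed here, so callers cannot get it wrong. */
void toy_pm_suspend(struct toy_dev *dev)
{
	__toy_pm_suspend(dev, false);
}

/* Runtime suspend: the full sequence only runs when d3cold is allowed. */
void toy_pm_runtime_suspend(struct toy_dev *dev)
{
	if (dev->d3cold_allowed)
		__toy_pm_suspend(dev, true);
}

int main(void)
{
	struct toy_dev dev = { .d3cold_allowed = true };

	toy_pm_suspend(&dev);		/* always the full sequence */
	toy_pm_runtime_suspend(&dev);	/* full sequence only for d3cold */
	return 0;
}

The design choice is the usual one for such refactors: callers see two intent-named entry points, while the shared sequence lives in a single place that still knows whether it is a runtime or a system transition.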
From dcb6c1d071712186c213c26b245779f7859b9cec Mon Sep 17 00:00:00 2001
From: Imre Deak
Date: Wed, 9 Oct 2024 22:43:57 +0300
Subject: drm/xe/display: Separate the d3cold and non-d3cold runtime PM handling

For clarity separate the d3cold and non-d3cold runtime PM handling. The
only change in behavior is disabling polling later during runtime
resume. This shouldn't make a difference, since the poll disabling is
handled from a work, which could run at any point wrt. the runtime
resume handler. The work will also require a runtime PM reference,
syncing it with the resume handler.

Cc: Rodrigo Vivi
Reviewed-by: Jonathan Cavitt
Signed-off-by: Imre Deak
Link: https://patchwork.freedesktop.org/patch/msgid/20241009194358.1321200-4-imre.deak@intel.com
(cherry picked from commit a4de6beb83fc5adee788518350247c629568901e)
Signed-off-by: Lucas De Marchi
---
 drivers/gpu/drm/xe/display/xe_display.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c
index 1c25c4f6a53b..fa44fe802858 100644
--- a/drivers/gpu/drm/xe/display/xe_display.c
+++ b/drivers/gpu/drm/xe/display/xe_display.c
@@ -342,6 +342,9 @@ static void __xe_display_pm_suspend(struct xe_device *xe, bool runtime)
 	intel_opregion_suspend(display, s2idle ? PCI_D1 : PCI_D3cold);
 
 	intel_dmc_suspend(xe);
+
+	if (runtime && has_display(xe))
+		intel_hpd_poll_enable(xe);
 }
 
 void xe_display_pm_suspend(struct xe_device *xe)
@@ -354,8 +357,10 @@ void xe_display_pm_runtime_suspend(struct xe_device *xe)
 	if (!xe->info.probe_display)
 		return;
 
-	if (xe->d3cold.allowed)
+	if (xe->d3cold.allowed) {
 		__xe_display_pm_suspend(xe, true);
+		return;
+	}
 
 	intel_hpd_poll_enable(xe);
 }
@@ -405,9 +410,11 @@ static void __xe_display_pm_resume(struct xe_device *xe, bool runtime)
 		intel_display_driver_resume(xe);
 		drm_kms_helper_poll_enable(&xe->drm);
 		intel_display_driver_enable_user_access(xe);
-		intel_hpd_poll_disable(xe);
 	}
 
+	if (has_display(xe))
+		intel_hpd_poll_disable(xe);
+
 	intel_opregion_resume(display);
 
 	intel_fbdev_set_suspend(&xe->drm, FBINFO_STATE_RUNNING, false);
@@ -425,10 +432,12 @@ void xe_display_pm_runtime_resume(struct xe_device *xe)
 	if (!xe->info.probe_display)
 		return;
 
-	intel_hpd_poll_disable(xe);
-
-	if (xe->d3cold.allowed)
+	if (xe->d3cold.allowed) {
 		__xe_display_pm_resume(xe, true);
+		return;
+	}
+
+	intel_hpd_poll_disable(xe);
 }
 
 
-- 
cgit

From 6a9d2e2988fa3ef9b03ddd9ba9aaa54dc23635e6 Mon Sep 17 00:00:00 2001
From: Imre Deak
Date: Wed, 9 Oct 2024 22:43:58 +0300
Subject: drm/xe/display: Add missing HPD interrupt enabling during non-d3cold RPM resume

Atm the display HPD interrupts that got disabled during runtime suspend,
are re-enabled only if d3cold is enabled. Fix things by also re-enabling
the interrupts if d3cold is disabled.

Cc: Rodrigo Vivi
Reviewed-by: Jonathan Cavitt
Signed-off-by: Imre Deak
Link: https://patchwork.freedesktop.org/patch/msgid/20241009194358.1321200-5-imre.deak@intel.com
(cherry picked from commit bbc4a30de095f0349d3c278500345a1b620d495e)
Signed-off-by: Lucas De Marchi
---
 drivers/gpu/drm/xe/display/xe_display.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/xe/display/xe_display.c b/drivers/gpu/drm/xe/display/xe_display.c
index fa44fe802858..c6e0c8d77a70 100644
--- a/drivers/gpu/drm/xe/display/xe_display.c
+++ b/drivers/gpu/drm/xe/display/xe_display.c
@@ -437,6 +437,7 @@ void xe_display_pm_runtime_resume(struct xe_device *xe)
 		return;
 	}
 
+	intel_hpd_init(xe);
 	intel_hpd_poll_disable(xe);
 }
 
-- 
cgit
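Taken together, the two patches above leave xe_display_pm_runtime_resume() with two clearly separated paths: the d3cold case performs a full display resume, while the non-d3cold case only re-enables the HPD interrupts and stops HPD polling. The stand-alone sketch below is a toy model of that control flow; all names and bodies are invented for illustration and are not xe driver code.

/*
 * Toy model of the runtime-resume control flow after the two patches
 * above. Invented names and bodies, not xe driver code.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_xe {
	bool probe_display;	/* models xe->info.probe_display */
	bool d3cold_allowed;	/* models xe->d3cold.allowed */
};

static void toy_full_display_resume(struct toy_xe *xe)
{
	(void)xe;
	printf("d3cold path: full display resume\n");
}

/* d3cold gets the full resume; non-d3cold re-arms HPD and stops polling. */
static void toy_runtime_resume(struct toy_xe *xe)
{
	if (!xe->probe_display)
		return;

	if (xe->d3cold_allowed) {
		toy_full_display_resume(xe);
		return;
	}

	printf("non-d3cold path: re-enable HPD interrupts, stop HPD polling\n");
}

int main(void)
{
	struct toy_xe d3cold_dev = { .probe_display = true, .d3cold_allowed = true };
	struct toy_xe d3hot_dev = { .probe_display = true, .d3cold_allowed = false };

	toy_runtime_resume(&d3cold_dev);
	toy_runtime_resume(&d3hot_dev);
	return 0;
}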
From 993ca0eccec65a2cacc3cefb15d35ffadc6f00fb Mon Sep 17 00:00:00 2001
From: Matthew Brost
Date: Wed, 23 Oct 2024 15:12:00 -0700
Subject: drm/xe: Add mmio read before GGTT invalidate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On LNL without a mmio read before a GGTT invalidate the GuC can
incorrectly read the GGTT scratch page upon next access leading to jobs
not getting scheduled. A mmio read before a GGTT invalidate seems to fix
this.

Since a GGTT invalidate is not a hot code path, blindly do a mmio read
before each GGTT invalidate.

Cc: John Harrison
Cc: Daniele Ceraolo Spurio
Cc: Thomas Hellström
Cc: Lucas De Marchi
Cc: stable@vger.kernel.org
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Reported-by: Paulo Zanoni
Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3164
Signed-off-by: Matthew Brost
Reviewed-by: Lucas De Marchi
Link: https://patchwork.freedesktop.org/patch/msgid/20241023221200.1797832-1-matthew.brost@intel.com
Signed-off-by: Lucas De Marchi
(cherry picked from commit 5a710196883e0ac019ac6df2a6d79c16ad3c32fa)
[ Fix conflict with mmio vs gt argument ]
Signed-off-by: Lucas De Marchi
---
 drivers/gpu/drm/xe/xe_ggtt.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
index 2895f154654c..ff19eca5d358 100644
--- a/drivers/gpu/drm/xe/xe_ggtt.c
+++ b/drivers/gpu/drm/xe/xe_ggtt.c
@@ -397,6 +397,16 @@ static void ggtt_invalidate_gt_tlb(struct xe_gt *gt)
 
 static void xe_ggtt_invalidate(struct xe_ggtt *ggtt)
 {
+	struct xe_device *xe = tile_to_xe(ggtt->tile);
+
+	/*
+	 * XXX: Barrier for GGTT pages. Unsure exactly why this required but
+	 * without this LNL is having issues with the GuC reading scratch page
+	 * vs. correct GGTT page. Not particularly a hot code path so blindly
+	 * do a mmio read here which results in GuC reading correct GGTT page.
+	 */
+	xe_mmio_read32(xe_root_mmio_gt(xe), VF_CAP_REG);
+
 	/* Each GT in a tile has its own TLB to cache GGTT lookups */
 	ggtt_invalidate_gt_tlb(ggtt->tile->primary_gt);
 	ggtt_invalidate_gt_tlb(ggtt->tile->media_gt);
-- 
cgit
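The workaround above is an instance of a common MMIO idiom: a read from any harmless register is used as a flushing point before kicking the hardware, because a real MMIO read cannot complete until earlier posted writes have reached the device. The fragment below only illustrates the shape of that idiom against a fake register file; it is not the xe code, the register names are invented, and nothing is actually flushed by a plain array access.

/*
 * Stand-alone illustration of the "dummy mmio read before kicking the
 * hardware" idiom used in the patch above. The register file here is a
 * plain array, so nothing is really flushed; in a driver the read would
 * go through readl()/ioread32() on an ioremap()ed BAR.
 */
#include <stdint.h>
#include <stdio.h>

static volatile uint32_t fake_regs[16];	/* stand-in for an MMIO BAR */

#define SCRATCH_REG	0	/* any harmless register (VF_CAP_REG in the patch) */
#define INVALIDATE_REG	1	/* stand-in for the GGTT invalidate trigger */

static uint32_t mmio_read32(unsigned int reg)
{
	return fake_regs[reg];
}

static void mmio_write32(unsigned int reg, uint32_t val)
{
	fake_regs[reg] = val;
}

static void ggtt_invalidate(void)
{
	/* Value is discarded; the read only marks where the flush point sits. */
	(void)mmio_read32(SCRATCH_REG);

	mmio_write32(INVALIDATE_REG, 1);
}

int main(void)
{
	ggtt_invalidate();
	printf("invalidate issued after the flushing read\n");
	return 0;
}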
From fe05cee4d9533892210e1ee90147175d87e7c053 Mon Sep 17 00:00:00 2001
From: Matthew Brost
Date: Fri, 25 Oct 2024 14:43:29 -0700
Subject: drm/xe: Don't short circuit TDR on jobs not started

Short circuiting TDR on jobs not started is an optimization which is not
required. On LNL we are facing an issue where jobs do not get scheduled
by the GuC if it misses a GGTT page update. When this occurs let the TDR
fire, toggle the scheduling which may get the job unstuck, and print a
warning message. If the TDR fires twice on job that hasn't started,
timeout the job.

v2:
 - Add warning message (Paulo)
 - Add fixes tag (Paulo)
 - Timeout job which hasn't started after TDR firing twice
v3:
 - Include local change
v4:
 - Short circuit check_timeout on job not started
 - use warn level rather than notice (Paulo)

Fixes: 7ddb9403dd74 ("drm/xe: Sample ctx timestamp to determine if jobs have timed out")
Cc: stable@vger.kernel.org
Cc: Paulo Zanoni
Signed-off-by: Matthew Brost
Reviewed-by: Lucas De Marchi
Link: https://patchwork.freedesktop.org/patch/msgid/20241025214330.2010521-2-matthew.brost@intel.com
Signed-off-by: Lucas De Marchi
(cherry picked from commit 35d25a4a0012e690ef0cc4c5440231176db595cc)
Signed-off-by: Lucas De Marchi
---
 drivers/gpu/drm/xe/xe_guc_submit.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index d333be9c4227..f903b0772722 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -916,12 +916,22 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
 static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
 {
 	struct xe_gt *gt = guc_to_gt(exec_queue_to_guc(q));
-	u32 ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
-	u32 ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
+	u32 ctx_timestamp, ctx_job_timestamp;
 	u32 timeout_ms = q->sched_props.job_timeout_ms;
 	u32 diff;
 	u64 running_time_ms;
 
+	if (!xe_sched_job_started(job)) {
+		xe_gt_warn(gt, "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, not started",
+			   xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
+			   q->guc->id);
+
+		return xe_sched_invalidate_job(job, 2);
+	}
+
+	ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
+	ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
+
 	/*
 	 * Counter wraps at ~223s at the usual 19.2MHz, be paranoid catch
 	 * possible overflows with a high timeout.
@@ -1049,10 +1059,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 		exec_queue_killed_or_banned_or_wedged(q) ||
 		exec_queue_destroyed(q);
 
-	/* Job hasn't started, can't be timed out */
-	if (!skip_timeout_check && !xe_sched_job_started(job))
-		goto rearm;
-
 	/*
 	 * XXX: Sampling timeout doesn't work in wedged mode as we have to
 	 * modify scheduling state to read timestamp. We could read the
-- 
cgit
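xe_sched_invalidate_job() is not part of the diff above, so the "fires twice" behaviour described in the commit message is not visible in it. As a rough, simplified model of the assumed semantics (invented names, not the xe scheduler code): the TDR counts how many times it has hit a job that never started and only declares a timeout once that count reaches the threshold, giving the scheduling toggle one chance to unstick the job.

/*
 * Simplified, assumed model of the "time out only after the TDR has
 * fired N times on a not-started job" behaviour. Invented names and
 * bodies, not xe driver code.
 */
#include <stdbool.h>
#include <stdio.h>

/* Simplified model of a pending job as seen by the timeout handler (TDR). */
struct toy_job {
	bool started;		/* has the hardware begun executing it? */
	int tdr_hits;		/* how many times the TDR has fired on it */
};

/* Guessed model of xe_sched_invalidate_job(job, threshold). */
static bool toy_invalidate_job(struct toy_job *job, int threshold)
{
	return ++job->tdr_hits >= threshold;
}

/*
 * Shape of the check added to check_timeout(): a job that never started is
 * given one extra TDR cycle (which toggles scheduling and may unstick it)
 * before it is declared timed out.
 */
static bool toy_check_timeout(struct toy_job *job)
{
	if (!job->started) {
		printf("warn: TDR fired on a job that has not started\n");
		return toy_invalidate_job(job, 2);
	}

	/* started jobs would be checked against their context timestamps here */
	return false;
}

int main(void)
{
	struct toy_job stuck = { .started = false };

	printf("first TDR: timed out = %d\n", toy_check_timeout(&stuck));	/* 0: retry */
	printf("second TDR: timed out = %d\n", toy_check_timeout(&stuck));	/* 1: give up */
	return 0;
}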