From 4286cc2c953983d44d248c9de1c81d3a9643345c Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 13 Sep 2024 17:05:52 +0100 Subject: drm/sched: Add locking to drm_sched_entity_modify_sched MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without the locking amdgpu currently can race between amdgpu_ctx_set_entity_priority() (via drm_sched_entity_modify_sched()) and drm_sched_job_arm(), leading to the latter accesing potentially inconsitent entity->sched_list and entity->num_sched_list pair. v2: * Improve commit message. (Philipp) Signed-off-by: Tvrtko Ursulin Fixes: b37aced31eb0 ("drm/scheduler: implement a function to modify sched list") Cc: Christian König Cc: Alex Deucher Cc: Luben Tuikov Cc: Matthew Brost Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Cc: Philipp Stanner Cc: # v5.7+ Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20240913160559.49054-2-tursulin@igalia.com Signed-off-by: Christian König --- drivers/gpu/drm/scheduler/sched_entity.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 567e5ace6d0c..0e002c17fcb6 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -133,8 +133,10 @@ void drm_sched_entity_modify_sched(struct drm_sched_entity *entity, { WARN_ON(!num_sched_list || !sched_list); + spin_lock(&entity->rq_lock); entity->sched_list = sched_list; entity->num_sched_list = num_sched_list; + spin_unlock(&entity->rq_lock); } EXPORT_SYMBOL(drm_sched_entity_modify_sched); -- cgit From cbc8764e29c2318229261a679b2aafd0f9072885 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 24 Sep 2024 11:19:08 +0100 Subject: drm/sched: Always wake up correct scheduler in drm_sched_entity_push_job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since drm_sched_entity_modify_sched() can modify the entities run queue, lets make sure to only dereference the pointer once so both adding and waking up are guaranteed to be consistent. Alternative of moving the spin_unlock to after the wake up would for now be more problematic since the same lock is taken inside drm_sched_rq_update_fifo(). v2: * Improve commit message. (Philipp) * Cache the scheduler pointer directly. (Christian) Signed-off-by: Tvrtko Ursulin Fixes: b37aced31eb0 ("drm/scheduler: implement a function to modify sched list") Cc: Christian König Cc: Alex Deucher Cc: Luben Tuikov Cc: Matthew Brost Cc: David Airlie Cc: Daniel Vetter Cc: Philipp Stanner Cc: dri-devel@lists.freedesktop.org Cc: # v5.7+ Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20240924101914.2713-3-tursulin@igalia.com Signed-off-by: Christian König --- drivers/gpu/drm/scheduler/sched_entity.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 0e002c17fcb6..a75eede8bf8d 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -599,6 +599,9 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) /* first job wakes up scheduler */ if (first) { + struct drm_gpu_scheduler *sched; + struct drm_sched_rq *rq; + /* Add the entity to the run queue */ spin_lock(&entity->rq_lock); if (entity->stopped) { @@ -608,13 +611,16 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) return; } - drm_sched_rq_add_entity(entity->rq, entity); + rq = entity->rq; + sched = rq->sched; + + drm_sched_rq_add_entity(rq, entity); spin_unlock(&entity->rq_lock); if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) drm_sched_rq_update_fifo(entity, submit_ts); - drm_sched_wakeup(entity->rq->sched); + drm_sched_wakeup(sched); } } EXPORT_SYMBOL(drm_sched_entity_push_job); -- cgit From 087913e0ba2b3b9d7ccbafb2acf5dab9e35ae1d5 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 24 Sep 2024 11:19:09 +0100 Subject: drm/sched: Always increment correct scheduler score MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Entities run queue can change during drm_sched_entity_push_job() so make sure to update the score consistently. Signed-off-by: Tvrtko Ursulin Fixes: d41a39dda140 ("drm/scheduler: improve job distribution with multiple queues") Cc: Nirmoy Das Cc: Christian König Cc: Luben Tuikov Cc: Matthew Brost Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Cc: # v5.9+ Reviewed-by: Christian König Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240924101914.2713-4-tursulin@igalia.com Signed-off-by: Christian König --- drivers/gpu/drm/scheduler/sched_entity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index a75eede8bf8d..b2cf3e0c1838 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -586,7 +586,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) ktime_t submit_ts; trace_drm_sched_job(sched_job, entity); - atomic_inc(entity->rq->sched->score); WRITE_ONCE(entity->last_user, current->group_leader); /* @@ -614,6 +613,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) rq = entity->rq; sched = rq->sched; + atomic_inc(sched->score); drm_sched_rq_add_entity(rq, entity); spin_unlock(&entity->rq_lock); -- cgit From abf201f6ce14c4ceeccde5471bdf59614b83a3d8 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 30 Sep 2024 15:07:49 +0200 Subject: drm/sched: revert "Always increment correct scheduler score" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 087913e0ba2b3b9d7ccbafb2acf5dab9e35ae1d5. It turned out that the original code was correct since the rq can only change when there is no armed job for an entity. This change here broke the logic since we only incremented the counter for the first job, so revert it. Signed-off-by: Christian König Acked-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20240930131451.536150-1-christian.koenig@amd.com --- drivers/gpu/drm/scheduler/sched_entity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index b2cf3e0c1838..a75eede8bf8d 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -586,6 +586,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) ktime_t submit_ts; trace_drm_sched_job(sched_job, entity); + atomic_inc(entity->rq->sched->score); WRITE_ONCE(entity->last_user, current->group_leader); /* @@ -613,7 +614,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) rq = entity->rq; sched = rq->sched; - atomic_inc(sched->score); drm_sched_rq_add_entity(rq, entity); spin_unlock(&entity->rq_lock); -- cgit From 2b55639a4e25ff02ee496368b03456bd28ebdc0b Mon Sep 17 00:00:00 2001 From: Liviu Dudau Date: Fri, 20 Sep 2024 11:28:02 +0100 Subject: drm/panthor: Add FOP_UNSIGNED_OFFSET to fop_flags Since commit 641bb4394f40 ("fs: move FMODE_UNSIGNED_OFFSET to fop_flags") the FMODE_UNSIGNED_OFFSET flag has been moved to fop_flags and renamed, but the patch failed to make the changes for the panthor driver. When user space opens the render node the WARN() added by the patch gets triggered. Fixes: 641bb4394f40 ("fs: move FMODE_UNSIGNED_OFFSET to fop_flags") Cc: Christian Brauner Signed-off-by: Liviu Dudau Reviewed-by: Boris Brezillon Reviewed-by: Steven Price Reviewed-by: Christian Brauner Tested-by: Heiko Stuebner Signed-off-by: Boris Brezillon Link: https://patchwork.freedesktop.org/patch/msgid/20240920102802.2483367-1-liviu.dudau@arm.com --- drivers/gpu/drm/panthor/panthor_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c index 34182f67136c..c520f156e2d7 100644 --- a/drivers/gpu/drm/panthor/panthor_drv.c +++ b/drivers/gpu/drm/panthor/panthor_drv.c @@ -1383,6 +1383,7 @@ static const struct file_operations panthor_drm_driver_fops = { .read = drm_read, .llseek = noop_llseek, .mmap = panthor_mmap, + .fop_flags = FOP_UNSIGNED_OFFSET, }; #ifdef CONFIG_DEBUG_FS -- cgit From fa998a9eac8809da4f219aad49836fcad2a9bf5c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 13 Sep 2024 13:27:22 +0200 Subject: drm/panthor: Lock the VM resv before calling drm_gpuvm_bo_obtain_prealloc() drm_gpuvm_bo_obtain_prealloc() will call drm_gpuvm_bo_put() on our pre-allocated BO if the association exists. Given we only have one ref on preallocated_vm_bo, drm_gpuvm_bo_destroy() will be called immediately, and we have to hold the VM resv lock when calling this function. Fixes: 647810ec2476 ("drm/panthor: Add the MMU/VM logical block") Signed-off-by: Boris Brezillon Reviewed-by: Liviu Dudau Reviewed-by: Steven Price Link: https://patchwork.freedesktop.org/patch/msgid/20240913112722.492144-1-boris.brezillon@collabora.com --- drivers/gpu/drm/panthor/panthor_mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c index bbc12728437f..3cd2bce59edc 100644 --- a/drivers/gpu/drm/panthor/panthor_mmu.c +++ b/drivers/gpu/drm/panthor/panthor_mmu.c @@ -1251,9 +1251,17 @@ static int panthor_vm_prepare_map_op_ctx(struct panthor_vm_op_ctx *op_ctx, goto err_cleanup; } + /* drm_gpuvm_bo_obtain_prealloc() will call drm_gpuvm_bo_put() on our + * pre-allocated BO if the association exists. Given we + * only have one ref on preallocated_vm_bo, drm_gpuvm_bo_destroy() will + * be called immediately, and we have to hold the VM resv lock when + * calling this function. + */ + dma_resv_lock(panthor_vm_resv(vm), NULL); mutex_lock(&bo->gpuva_list_lock); op_ctx->map.vm_bo = drm_gpuvm_bo_obtain_prealloc(preallocated_vm_bo); mutex_unlock(&bo->gpuva_list_lock); + dma_resv_unlock(panthor_vm_resv(vm)); /* If the a vm_bo for this combination exists, it already * retains a pin ref, and we can release the one we took earlier. -- cgit From 282864cc5d3f144af0cdea1868ee2dc2c5110f0d Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 30 Sep 2024 18:37:42 +0200 Subject: drm/panthor: Fix access to uninitialized variable in tick_ctx_cleanup() The group variable can't be used to retrieve ptdev in our second loop, because it points to the previously iterated list_head, not a valid group. Get the ptdev object from the scheduler instead. Cc: Fixes: d72f049087d4 ("drm/panthor: Allow driver compilation") Reported-by: kernel test robot Reported-by: Julia Lawall Closes: https://lore.kernel.org/r/202409302306.UDikqa03-lkp@intel.com/ Signed-off-by: Boris Brezillon Reviewed-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20240930163742.87036-1-boris.brezillon@collabora.com --- drivers/gpu/drm/panthor/panthor_sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c index a8a939a9fb51..145d983bb129 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.c +++ b/drivers/gpu/drm/panthor/panthor_sched.c @@ -2046,6 +2046,7 @@ static void tick_ctx_cleanup(struct panthor_scheduler *sched, struct panthor_sched_tick_ctx *ctx) { + struct panthor_device *ptdev = sched->ptdev; struct panthor_group *group, *tmp; u32 i; @@ -2054,7 +2055,7 @@ tick_ctx_cleanup(struct panthor_scheduler *sched, /* If everything went fine, we should only have groups * to be terminated in the old_groups lists. */ - drm_WARN_ON(&group->ptdev->base, !ctx->csg_upd_failed_mask && + drm_WARN_ON(&ptdev->base, !ctx->csg_upd_failed_mask && group_can_run(group)); if (!group_can_run(group)) { @@ -2077,7 +2078,7 @@ tick_ctx_cleanup(struct panthor_scheduler *sched, /* If everything went fine, the groups to schedule lists should * be empty. */ - drm_WARN_ON(&group->ptdev->base, + drm_WARN_ON(&ptdev->base, !ctx->csg_upd_failed_mask && !list_empty(&ctx->groups[i])); list_for_each_entry_safe(group, tmp, &ctx->groups[i], run_node) { -- cgit From 7a1f30afe97294281a2ba05977688385744f9844 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 5 Sep 2024 09:19:14 +0200 Subject: drm/panthor: Don't declare a queue blocked if deferred operations are pending If deferred operations are pending, we want to wait for those to land before declaring the queue blocked on a SYNC_WAIT. We need this to deal with the case where the sync object is signalled through a deferred SYNC_{ADD,SET} from the same queue. If we don't do that and the group gets scheduled out before the deferred SYNC_{SET,ADD} is executed, we'll end up with a timeout, because no external SYNC_{SET,ADD} will make the scheduler reconsider the group for execution. Fixes: de8548813824 ("drm/panthor: Add the scheduler logical block") Cc: Signed-off-by: Boris Brezillon Reviewed-by: Steven Price Reviewed-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20240905071914.3278599-1-boris.brezillon@collabora.com --- drivers/gpu/drm/panthor/panthor_sched.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c index 145d983bb129..2aff02ba6949 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.c +++ b/drivers/gpu/drm/panthor/panthor_sched.c @@ -1103,7 +1103,13 @@ cs_slot_sync_queue_state_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs list_move_tail(&group->wait_node, &group->ptdev->scheduler->groups.waiting); } - group->blocked_queues |= BIT(cs_id); + + /* The queue is only blocked if there's no deferred operation + * pending, which can be checked through the scoreboard status. + */ + if (!cs_iface->output->status_scoreboards) + group->blocked_queues |= BIT(cs_id); + queue->syncwait.gpu_va = cs_iface->output->status_wait_sync_ptr; queue->syncwait.ref = cs_iface->output->status_wait_sync_value; status_wait_cond = cs_iface->output->status_wait & CS_STATUS_WAIT_SYNC_COND_MASK; -- cgit From f9e7ac6e2e9986c2ee63224992cb5c8276e46b2a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 5 Sep 2024 09:01:54 +0200 Subject: drm/panthor: Don't add write fences to the shared BOs The only user (the mesa gallium driver) is already assuming explicit synchronization and doing the export/import dance on shared BOs. The only reason we were registering ourselves as writers on external BOs is because Xe, which was the reference back when we developed Panthor, was doing so. Turns out Xe was wrong, and we really want bookkeep on all registered fences, so userspace can explicitly upgrade those to read/write when needed. Fixes: 4bdca1150792 ("drm/panthor: Add the driver frontend block") Cc: Matthew Brost Cc: Simona Vetter Cc: Signed-off-by: Boris Brezillon Reviewed-by: Steven Price Reviewed-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20240905070155.3254011-1-boris.brezillon@collabora.com --- drivers/gpu/drm/panthor/panthor_sched.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c index 2aff02ba6949..aee362abb710 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.c +++ b/drivers/gpu/drm/panthor/panthor_sched.c @@ -3443,13 +3443,8 @@ void panthor_job_update_resvs(struct drm_exec *exec, struct drm_sched_job *sched { struct panthor_job *job = container_of(sched_job, struct panthor_job, base); - /* Still not sure why we want USAGE_WRITE for external objects, since I - * was assuming this would be handled through explicit syncs being imported - * to external BOs with DMA_BUF_IOCTL_IMPORT_SYNC_FILE, but other drivers - * seem to pass DMA_RESV_USAGE_WRITE, so there must be a good reason. - */ panthor_vm_update_resvs(job->group->vm, exec, &sched_job->s_fence->finished, - DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_WRITE); + DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP); } void panthor_sched_unplug(struct panthor_device *ptdev) -- cgit