From 588b9828f0744ca13555c4a35cd0251ac8ad8ad2 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Fri, 25 Oct 2019 11:51:56 +0100 Subject: drm: Don't free jobs in wait_event_interruptible() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_sched_cleanup_jobs() attempts to free finished jobs, however because it is called as the condition of wait_event_interruptible() it must not sleep. Unfortunately some free callbacks (notably for Panfrost) do sleep. Instead let's rename drm_sched_cleanup_jobs() to drm_sched_get_cleanup_job() and simply return a job for processing if there is one. The caller can then call the free_job() callback outside the wait_event_interruptible() where sleeping is possible before re-checking and returning to sleep if necessary. Tested-by: Christian Gmeiner Fixes: 5918045c4ed4 ("drm/scheduler: rework job destruction") Signed-off-by: Steven Price Reviewed-by: Christian König Signed-off-by: Christian König Link: https://patchwork.freedesktop.org/patch/337652/ --- drivers/gpu/drm/scheduler/sched_main.c | 43 +++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'drivers/gpu/drm/scheduler/sched_main.c') diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 9a0ee74d82dc..d4cc7289147e 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -622,43 +622,41 @@ static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb) } /** - * drm_sched_cleanup_jobs - destroy finished jobs + * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed * * @sched: scheduler instance * - * Remove all finished jobs from the mirror list and destroy them. + * Returns the next finished job from the mirror list (if there is one) + * ready for it to be destroyed. */ -static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched) +static struct drm_sched_job * +drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched) { + struct drm_sched_job *job; unsigned long flags; /* Don't destroy jobs while the timeout worker is running */ if (sched->timeout != MAX_SCHEDULE_TIMEOUT && !cancel_delayed_work(&sched->work_tdr)) - return; - + return NULL; - while (!list_empty(&sched->ring_mirror_list)) { - struct drm_sched_job *job; + spin_lock_irqsave(&sched->job_list_lock, flags); - job = list_first_entry(&sched->ring_mirror_list, + job = list_first_entry_or_null(&sched->ring_mirror_list, struct drm_sched_job, node); - if (!dma_fence_is_signaled(&job->s_fence->finished)) - break; - spin_lock_irqsave(&sched->job_list_lock, flags); + if (job && dma_fence_is_signaled(&job->s_fence->finished)) { /* remove job from ring_mirror_list */ list_del_init(&job->node); - spin_unlock_irqrestore(&sched->job_list_lock, flags); - - sched->ops->free_job(job); + } else { + job = NULL; + /* queue timeout for next job */ + drm_sched_start_timeout(sched); } - /* queue timeout for next job */ - spin_lock_irqsave(&sched->job_list_lock, flags); - drm_sched_start_timeout(sched); spin_unlock_irqrestore(&sched->job_list_lock, flags); + return job; } /** @@ -698,12 +696,19 @@ static int drm_sched_main(void *param) struct drm_sched_fence *s_fence; struct drm_sched_job *sched_job; struct dma_fence *fence; + struct drm_sched_job *cleanup_job = NULL; wait_event_interruptible(sched->wake_up_worker, - (drm_sched_cleanup_jobs(sched), + (cleanup_job = drm_sched_get_cleanup_job(sched)) || (!drm_sched_blocked(sched) && (entity = drm_sched_select_entity(sched))) || - kthread_should_stop())); + kthread_should_stop()); + + if (cleanup_job) { + sched->ops->free_job(cleanup_job); + /* queue timeout for next job */ + drm_sched_start_timeout(sched); + } if (!entity) continue; -- cgit From e91e5f080e03e0c28742f0aa4051c7cefafd7cdd Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 24 Oct 2019 15:39:06 -0400 Subject: drm/sched: Set error to s_fence if HW job submission failed. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: When run_job fails and HW fence returned is NULL we still signal the s_fence to avoid hangs but the user has no way of knowing if the actual HW job was ran and finished. Fix: Allow .run_job implementations to return ERR_PTR in the fence pointer returned and then set this error for s_fence->finished fence so whoever wait on this fence can inspect the signaled fence for an error. Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/scheduler/sched_main.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/scheduler/sched_main.c') diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 9a0ee74d82dc..f39b97ed4ade 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -479,6 +479,7 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) struct drm_sched_job *s_job, *tmp; uint64_t guilty_context; bool found_guilty = false; + struct dma_fence *fence; list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { struct drm_sched_fence *s_fence = s_job->s_fence; @@ -492,7 +493,16 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) dma_fence_set_error(&s_fence->finished, -ECANCELED); dma_fence_put(s_job->s_fence->parent); - s_job->s_fence->parent = sched->ops->run_job(s_job); + fence = sched->ops->run_job(s_job); + + if (IS_ERR_OR_NULL(fence)) { + s_job->s_fence->parent = NULL; + dma_fence_set_error(&s_fence->finished, PTR_ERR(fence)); + } else { + s_job->s_fence->parent = fence; + } + + } } EXPORT_SYMBOL(drm_sched_resubmit_jobs); @@ -720,7 +730,7 @@ static int drm_sched_main(void *param) fence = sched->ops->run_job(sched_job); drm_sched_fence_scheduled(s_fence); - if (fence) { + if (!IS_ERR_OR_NULL(fence)) { s_fence->parent = dma_fence_get(fence); r = dma_fence_add_callback(fence, &sched_job->cb, drm_sched_process_job); @@ -730,8 +740,11 @@ static int drm_sched_main(void *param) DRM_ERROR("fence add callback failed (%d)\n", r); dma_fence_put(fence); - } else + } else { + + dma_fence_set_error(&s_fence->finished, PTR_ERR(fence)); drm_sched_process_job(NULL, &sched_job->cb); + } wake_up(&sched->job_scheduled); } -- cgit From d7c5782acd354bdb5ed0fa10e1e397eaed558390 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 29 Oct 2019 11:03:05 -0400 Subject: drm/sched: Fix passing zero to 'PTR_ERR' warning v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a static code checker warning. v2: Drop PTR_ERR_OR_ZERO. Signed-off-by: Andrey Grodzovsky Reviewed-by: Emily Deng Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/scheduler/sched_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/scheduler/sched_main.c') diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 2af64459b3d7..fbb6446c4f69 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -496,8 +496,10 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) fence = sched->ops->run_job(s_job); if (IS_ERR_OR_NULL(fence)) { + if (IS_ERR(fence)) + dma_fence_set_error(&s_fence->finished, PTR_ERR(fence)); + s_job->s_fence->parent = NULL; - dma_fence_set_error(&s_fence->finished, PTR_ERR(fence)); } else { s_job->s_fence->parent = fence; } @@ -746,8 +748,9 @@ static int drm_sched_main(void *param) r); dma_fence_put(fence); } else { + if (IS_ERR(fence)) + dma_fence_set_error(&s_fence->finished, PTR_ERR(fence)); - dma_fence_set_error(&s_fence->finished, PTR_ERR(fence)); drm_sched_process_job(NULL, &sched_job->cb); } -- cgit From 83a7772ba223333755d8afd90ab8b2ea3f57d4e6 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Mon, 4 Nov 2019 16:30:05 -0500 Subject: drm/sched: Use completion to wait for sched->thread idle v2. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes thread park/unpark hack from drm_sched_entity_fini and by this fixes reactivation of scheduler thread while the thread is supposed to be stopped. v2: Per sched entity completion. Signed-off-by: Andrey Grodzovsky Suggested-by: Christian König Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/scheduler/sched_entity.c | 12 ++++++++---- drivers/gpu/drm/scheduler/sched_main.c | 6 ++++++ include/drm/gpu_scheduler.h | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/scheduler/sched_main.c') diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 1a5153197fe9..461a7a8129f4 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -68,6 +69,8 @@ int drm_sched_entity_init(struct drm_sched_entity *entity, if (!entity->rq_list) return -ENOMEM; + init_completion(&entity->entity_idle); + for (i = 0; i < num_rq_list; ++i) entity->rq_list[i] = rq_list[i]; @@ -286,11 +289,12 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity) */ if (spsc_queue_count(&entity->job_queue)) { if (sched) { - /* Park the kernel for a moment to make sure it isn't processing - * our enity. + /* + * Wait for thread to idle to make sure it isn't processing + * this entity. */ - kthread_park(sched->thread); - kthread_unpark(sched->thread); + wait_for_completion(&entity->entity_idle); + } if (entity->dependency) { dma_fence_remove_callback(entity->dependency, diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index fbb6446c4f69..362cf1aa13b2 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -134,6 +135,7 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq) list_for_each_entry_continue(entity, &rq->entities, list) { if (drm_sched_entity_is_ready(entity)) { rq->current_entity = entity; + reinit_completion(&entity->entity_idle); spin_unlock(&rq->lock); return entity; } @@ -144,6 +146,7 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq) if (drm_sched_entity_is_ready(entity)) { rq->current_entity = entity; + reinit_completion(&entity->entity_idle); spin_unlock(&rq->lock); return entity; } @@ -726,6 +729,9 @@ static int drm_sched_main(void *param) continue; sched_job = drm_sched_entity_pop_job(entity); + + complete(&entity->entity_idle); + if (!sched_job) continue; diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 57b4121c750a..6619d2ac6fa3 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -71,6 +71,7 @@ enum drm_sched_priority { * @last_scheduled: points to the finished fence of the last scheduled job. * @last_user: last group leader pushing a job into the entity. * @stopped: Marks the enity as removed from rq and destined for termination. + * @entity_idle: Signals when enityt is not in use * * Entities will emit jobs in order to their corresponding hardware * ring, and the scheduler will alternate between entities based on @@ -94,6 +95,7 @@ struct drm_sched_entity { struct dma_fence *last_scheduled; struct task_struct *last_user; bool stopped; + struct completion entity_idle; }; /** -- cgit From 2b6f717c33ef40257383478479a97d9ba2da8dab Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 7 Nov 2019 17:55:15 -0500 Subject: drm/sched: Avoid job cleanup if sched thread is parked. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the sched thread is parked we assume ring_mirror_list is not accessed from here. Signed-off-by: Andrey Grodzovsky Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/scheduler/sched_main.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/scheduler/sched_main.c') diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 362cf1aa13b2..3c57e84222ca 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -650,9 +650,13 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched) struct drm_sched_job *job; unsigned long flags; - /* Don't destroy jobs while the timeout worker is running */ - if (sched->timeout != MAX_SCHEDULE_TIMEOUT && - !cancel_delayed_work(&sched->work_tdr)) + /* + * Don't destroy jobs while the timeout worker is running OR thread + * is being parked and hence assumed to not touch ring_mirror_list + */ + if ((sched->timeout != MAX_SCHEDULE_TIMEOUT && + !cancel_delayed_work(&sched->work_tdr)) || + __kthread_should_park(sched->thread)) return NULL; spin_lock_irqsave(&sched->job_list_lock, flags); -- cgit