diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 607 |
1 files changed, 338 insertions, 269 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index e104d7ef3c3d..1bbd39b3b0fc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -64,11 +64,51 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, return 0; } +static int amdgpu_cs_job_idx(struct amdgpu_cs_parser *p, + struct drm_amdgpu_cs_chunk_ib *chunk_ib) +{ + struct drm_sched_entity *entity; + unsigned int i; + int r; + + r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type, + chunk_ib->ip_instance, + chunk_ib->ring, &entity); + if (r) + return r; + + /* + * Abort if there is no run queue associated with this entity. + * Possibly because of disabled HW IP. + */ + if (entity->rq == NULL) + return -EINVAL; + + /* Check if we can add this IB to some existing job */ + for (i = 0; i < p->gang_size; ++i) + if (p->entities[i] == entity) + return i; + + /* If not increase the gang size if possible */ + if (i == AMDGPU_CS_GANG_SIZE) + return -EINVAL; + + p->entities[i] = entity; + p->gang_size = i + 1; + return i; +} + static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p, struct drm_amdgpu_cs_chunk_ib *chunk_ib, unsigned int *num_ibs) { - ++(*num_ibs); + int r; + + r = amdgpu_cs_job_idx(p, chunk_ib); + if (r < 0) + return r; + + ++(num_ibs[r]); return 0; } @@ -142,11 +182,12 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs) { struct amdgpu_fpriv *fpriv = p->filp->driver_priv; + unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { }; struct amdgpu_vm *vm = &fpriv->vm; uint64_t *chunk_array_user; uint64_t *chunk_array; - unsigned size, num_ibs = 0; uint32_t uf_offset = 0; + unsigned int size; int ret; int i; @@ -209,7 +250,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, if (size < sizeof(struct drm_amdgpu_cs_chunk_ib)) goto free_partial_kdata; - ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, &num_ibs); + ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs); if (ret) goto free_partial_kdata; break; @@ -246,17 +287,28 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, } } - ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm); - if (ret) - goto free_all_kdata; + if (!p->gang_size) + return -EINVAL; + + for (i = 0; i < p->gang_size; ++i) { + ret = amdgpu_job_alloc(p->adev, num_ibs[i], &p->jobs[i], vm); + if (ret) + goto free_all_kdata; + + ret = drm_sched_job_init(&p->jobs[i]->base, p->entities[i], + &fpriv->vm); + if (ret) + goto free_all_kdata; + } + p->gang_leader = p->jobs[p->gang_size - 1]; - if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) { + if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) { ret = -ECANCELED; goto free_all_kdata; } if (p->uf_entry.tv.bo) - p->job->uf_addr = uf_offset; + p->gang_leader->uf_addr = uf_offset; kvfree(chunk_array); /* Use this opportunity to fill in task info for the vm */ @@ -278,93 +330,69 @@ free_chunk: return ret; } -static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, - struct amdgpu_cs_parser *parser) +static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p, + struct amdgpu_cs_chunk *chunk, + unsigned int *ce_preempt, + unsigned int *de_preempt) { - struct amdgpu_fpriv *fpriv = parser->filp->driver_priv; + struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata; + struct amdgpu_fpriv *fpriv = p->filp->driver_priv; struct amdgpu_vm *vm = &fpriv->vm; - int r, ce_preempt = 0, de_preempt = 0; struct amdgpu_ring *ring; - int i, j; - - for (i = 0, j = 0; i < parser->nchunks && j < parser->job->num_ibs; i++) { - struct amdgpu_cs_chunk *chunk; - struct amdgpu_ib *ib; - struct drm_amdgpu_cs_chunk_ib *chunk_ib; - struct drm_sched_entity *entity; - - chunk = &parser->chunks[i]; - ib = &parser->job->ibs[j]; - chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata; - - if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB) - continue; - - if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX && - chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) { - if (chunk_ib->flags & AMDGPU_IB_FLAG_CE) - ce_preempt++; - else - de_preempt++; + struct amdgpu_job *job; + struct amdgpu_ib *ib; + int r; - /* each GFX command submit allows 0 or 1 IB preemptible for CE & DE */ - if (ce_preempt > 1 || de_preempt > 1) - return -EINVAL; - } + r = amdgpu_cs_job_idx(p, chunk_ib); + if (r < 0) + return r; - r = amdgpu_ctx_get_entity(parser->ctx, chunk_ib->ip_type, - chunk_ib->ip_instance, chunk_ib->ring, - &entity); - if (r) - return r; + job = p->jobs[r]; + ring = amdgpu_job_ring(job); + ib = &job->ibs[job->num_ibs++]; - if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE) - parser->job->preamble_status |= - AMDGPU_PREAMBLE_IB_PRESENT; + /* MM engine doesn't support user fences */ + if (p->uf_entry.tv.bo && ring->funcs->no_user_fence) + return -EINVAL; - if (parser->entity && parser->entity != entity) - return -EINVAL; + if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX && + chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) { + if (chunk_ib->flags & AMDGPU_IB_FLAG_CE) + (*ce_preempt)++; + else + (*de_preempt)++; - /* Return if there is no run queue associated with this entity. - * Possibly because of disabled HW IP*/ - if (entity->rq == NULL) + /* Each GFX command submit allows only 1 IB max + * preemptible for CE & DE */ + if (*ce_preempt > 1 || *de_preempt > 1) return -EINVAL; + } - parser->entity = entity; - - ring = to_amdgpu_ring(entity->rq->sched); - r = amdgpu_ib_get(adev, vm, ring->funcs->parse_cs ? - chunk_ib->ib_bytes : 0, - AMDGPU_IB_POOL_DELAYED, ib); - if (r) { - DRM_ERROR("Failed to get ib !\n"); - return r; - } - - ib->gpu_addr = chunk_ib->va_start; - ib->length_dw = chunk_ib->ib_bytes / 4; - ib->flags = chunk_ib->flags; + if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE) + job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT; - j++; + r = amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ? + chunk_ib->ib_bytes : 0, + AMDGPU_IB_POOL_DELAYED, ib); + if (r) { + DRM_ERROR("Failed to get ib !\n"); + return r; } - /* MM engine doesn't support user fences */ - ring = to_amdgpu_ring(parser->entity->rq->sched); - if (parser->job->uf_addr && ring->funcs->no_user_fence) - return -EINVAL; - + ib->gpu_addr = chunk_ib->va_start; + ib->length_dw = chunk_ib->ib_bytes / 4; + ib->flags = chunk_ib->flags; return 0; } -static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, - struct amdgpu_cs_chunk *chunk) +static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p, + struct amdgpu_cs_chunk *chunk) { + struct drm_amdgpu_cs_chunk_dep *deps = chunk->kdata; struct amdgpu_fpriv *fpriv = p->filp->driver_priv; unsigned num_deps; int i, r; - struct drm_amdgpu_cs_chunk_dep *deps; - deps = (struct drm_amdgpu_cs_chunk_dep *)chunk->kdata; num_deps = chunk->length_dw * 4 / sizeof(struct drm_amdgpu_cs_chunk_dep); @@ -402,7 +430,7 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, dma_fence_put(old); } - r = amdgpu_sync_fence(&p->job->sync, fence); + r = amdgpu_sync_fence(&p->gang_leader->sync, fence); dma_fence_put(fence); if (r) return r; @@ -410,9 +438,9 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p, return 0; } -static int amdgpu_syncobj_lookup_and_add_to_sync(struct amdgpu_cs_parser *p, - uint32_t handle, u64 point, - u64 flags) +static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p, + uint32_t handle, u64 point, + u64 flags) { struct dma_fence *fence; int r; @@ -424,25 +452,23 @@ static int amdgpu_syncobj_lookup_and_add_to_sync(struct amdgpu_cs_parser *p, return r; } - r = amdgpu_sync_fence(&p->job->sync, fence); + r = amdgpu_sync_fence(&p->gang_leader->sync, fence); dma_fence_put(fence); return r; } -static int amdgpu_cs_process_syncobj_in_dep(struct amdgpu_cs_parser *p, - struct amdgpu_cs_chunk *chunk) +static int amdgpu_cs_p2_syncobj_in(struct amdgpu_cs_parser *p, + struct amdgpu_cs_chunk *chunk) { - struct drm_amdgpu_cs_chunk_sem *deps; + struct drm_amdgpu_cs_chunk_sem *deps = chunk->kdata; unsigned num_deps; int i, r; - deps = (struct drm_amdgpu_cs_chunk_sem *)chunk->kdata; num_deps = chunk->length_dw * 4 / sizeof(struct drm_amdgpu_cs_chunk_sem); for (i = 0; i < num_deps; ++i) { - r = amdgpu_syncobj_lookup_and_add_to_sync(p, deps[i].handle, - 0, 0); + r = amdgpu_syncobj_lookup_and_add(p, deps[i].handle, 0, 0); if (r) return r; } @@ -450,21 +476,19 @@ static int amdgpu_cs_process_syncobj_in_dep(struct amdgpu_cs_parser *p, return 0; } -static int amdgpu_cs_process_syncobj_timeline_in_dep(struct amdgpu_cs_parser *p, - struct amdgpu_cs_chunk *chunk) +static int amdgpu_cs_p2_syncobj_timeline_wait(struct amdgpu_cs_parser *p, + struct amdgpu_cs_chunk *chunk) { - struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps; + struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps = chunk->kdata; unsigned num_deps; int i, r; - syncobj_deps = (struct drm_amdgpu_cs_chunk_syncobj *)chunk->kdata; num_deps = chunk->length_dw * 4 / sizeof(struct drm_amdgpu_cs_chunk_syncobj); for (i = 0; i < num_deps; ++i) { - r = amdgpu_syncobj_lookup_and_add_to_sync(p, - syncobj_deps[i].handle, - syncobj_deps[i].point, - syncobj_deps[i].flags); + r = amdgpu_syncobj_lookup_and_add(p, syncobj_deps[i].handle, + syncobj_deps[i].point, + syncobj_deps[i].flags); if (r) return r; } @@ -472,14 +496,13 @@ static int amdgpu_cs_process_syncobj_timeline_in_dep(struct amdgpu_cs_parser *p, return 0; } -static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p, - struct amdgpu_cs_chunk *chunk) +static int amdgpu_cs_p2_syncobj_out(struct amdgpu_cs_parser *p, + struct amdgpu_cs_chunk *chunk) { - struct drm_amdgpu_cs_chunk_sem *deps; + struct drm_amdgpu_cs_chunk_sem *deps = chunk->kdata; unsigned num_deps; int i; - deps = (struct drm_amdgpu_cs_chunk_sem *)chunk->kdata; num_deps = chunk->length_dw * 4 / sizeof(struct drm_amdgpu_cs_chunk_sem); @@ -507,15 +530,13 @@ static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p, return 0; } - -static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p, - struct amdgpu_cs_chunk *chunk) +static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p, + struct amdgpu_cs_chunk *chunk) { - struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps; + struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps = chunk->kdata; unsigned num_deps; int i; - syncobj_deps = (struct drm_amdgpu_cs_chunk_syncobj *)chunk->kdata; num_deps = chunk->length_dw * 4 / sizeof(struct drm_amdgpu_cs_chunk_syncobj); @@ -552,9 +573,9 @@ static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p return 0; } -static int amdgpu_cs_dependencies(struct amdgpu_device *adev, - struct amdgpu_cs_parser *p) +static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p) { + unsigned int ce_preempt = 0, de_preempt = 0; int i, r; for (i = 0; i < p->nchunks; ++i) { @@ -563,29 +584,34 @@ static int amdgpu_cs_dependencies(struct amdgpu_device *adev, chunk = &p->chunks[i]; switch (chunk->chunk_id) { + case AMDGPU_CHUNK_ID_IB: + r = amdgpu_cs_p2_ib(p, chunk, &ce_preempt, &de_preempt); + if (r) + return r; + break; case AMDGPU_CHUNK_ID_DEPENDENCIES: case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES: - r = amdgpu_cs_process_fence_dep(p, chunk); + r = amdgpu_cs_p2_dependencies(p, chunk); if (r) return r; break; case AMDGPU_CHUNK_ID_SYNCOBJ_IN: - r = amdgpu_cs_process_syncobj_in_dep(p, chunk); + r = amdgpu_cs_p2_syncobj_in(p, chunk); if (r) return r; break; case AMDGPU_CHUNK_ID_SYNCOBJ_OUT: - r = amdgpu_cs_process_syncobj_out_dep(p, chunk); + r = amdgpu_cs_p2_syncobj_out(p, chunk); if (r) return r; break; case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT: - r = amdgpu_cs_process_syncobj_timeline_in_dep(p, chunk); + r = amdgpu_cs_p2_syncobj_timeline_wait(p, chunk); if (r) return r; break; case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL: - r = amdgpu_cs_process_syncobj_timeline_out_dep(p, chunk); + r = amdgpu_cs_p2_syncobj_timeline_signal(p, chunk); if (r) return r; break; @@ -830,6 +856,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_bo_list_entry *e; struct list_head duplicates; + unsigned int i; int r; INIT_LIST_HEAD(&p->validated); @@ -913,16 +940,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, e->bo_va = amdgpu_vm_bo_find(vm, bo); } - /* Move fence waiting after getting reservation lock of - * PD root. Then there is no need on a ctx mutex lock. - */ - r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity); - if (unlikely(r != 0)) { - if (r != -ERESTARTSYS) - DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n"); - goto error_validate; - } - amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold, &p->bytes_moved_vis_threshold); p->bytes_moved = 0; @@ -943,124 +960,138 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, if (r) goto error_validate; - amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved, - p->bytes_moved_vis); - - amdgpu_job_set_resources(p->job, p->bo_list->gds_obj, - p->bo_list->gws_obj, p->bo_list->oa_obj); - - if (!r && p->uf_entry.tv.bo) { + if (p->uf_entry.tv.bo) { struct amdgpu_bo *uf = ttm_to_amdgpu_bo(p->uf_entry.tv.bo); r = amdgpu_ttm_alloc_gart(&uf->tbo); - p->job->uf_addr += amdgpu_bo_gpu_offset(uf); + if (r) + goto error_validate; + + p->gang_leader->uf_addr += amdgpu_bo_gpu_offset(uf); } + amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved, + p->bytes_moved_vis); + + for (i = 0; i < p->gang_size; ++i) + amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj, + p->bo_list->gws_obj, + p->bo_list->oa_obj); + return 0; + error_validate: - if (r) - ttm_eu_backoff_reservation(&p->ticket, &p->validated); + ttm_eu_backoff_reservation(&p->ticket, &p->validated); out_free_user_pages: - if (r) { - amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) { - struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo); + amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) { + struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo); - if (!e->user_pages) - continue; - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm); - kvfree(e->user_pages); - e->user_pages = NULL; - } - mutex_unlock(&p->bo_list->bo_list_mutex); + if (!e->user_pages) + continue; + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm); + kvfree(e->user_pages); + e->user_pages = NULL; } return r; } -static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser) +static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p) { - int i; + int i, j; if (!trace_amdgpu_cs_enabled()) return; - for (i = 0; i < parser->job->num_ibs; i++) - trace_amdgpu_cs(parser, i); + for (i = 0; i < p->gang_size; ++i) { + struct amdgpu_job *job = p->jobs[i]; + + for (j = 0; j < job->num_ibs; ++j) + trace_amdgpu_cs(p, job, &job->ibs[j]); + } } -static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) +static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p, + struct amdgpu_job *job) { - struct amdgpu_ring *ring = to_amdgpu_ring(p->entity->rq->sched); - struct amdgpu_fpriv *fpriv = p->filp->driver_priv; - struct amdgpu_device *adev = p->adev; - struct amdgpu_vm *vm = &fpriv->vm; - struct amdgpu_bo_list_entry *e; - struct amdgpu_bo_va *bo_va; - struct amdgpu_bo *bo; + struct amdgpu_ring *ring = amdgpu_job_ring(job); + unsigned int i; int r; /* Only for UVD/VCE VM emulation */ - if (ring->funcs->parse_cs || ring->funcs->patch_cs_in_place) { - unsigned i, j; - - for (i = 0, j = 0; i < p->nchunks && j < p->job->num_ibs; i++) { - struct drm_amdgpu_cs_chunk_ib *chunk_ib; - struct amdgpu_bo_va_mapping *m; - struct amdgpu_bo *aobj = NULL; - struct amdgpu_cs_chunk *chunk; - uint64_t offset, va_start; - struct amdgpu_ib *ib; - uint8_t *kptr; - - chunk = &p->chunks[i]; - ib = &p->job->ibs[j]; - chunk_ib = chunk->kdata; - - if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB) - continue; + if (!ring->funcs->parse_cs && !ring->funcs->patch_cs_in_place) + return 0; - va_start = chunk_ib->va_start & AMDGPU_GMC_HOLE_MASK; - r = amdgpu_cs_find_mapping(p, va_start, &aobj, &m); - if (r) { - DRM_ERROR("IB va_start is invalid\n"); - return r; - } + for (i = 0; i < job->num_ibs; ++i) { + struct amdgpu_ib *ib = &job->ibs[i]; + struct amdgpu_bo_va_mapping *m; + struct amdgpu_bo *aobj; + uint64_t va_start; + uint8_t *kptr; - if ((va_start + chunk_ib->ib_bytes) > - (m->last + 1) * AMDGPU_GPU_PAGE_SIZE) { - DRM_ERROR("IB va_start+ib_bytes is invalid\n"); - return -EINVAL; - } + va_start = ib->gpu_addr & AMDGPU_GMC_HOLE_MASK; + r = amdgpu_cs_find_mapping(p, va_start, &aobj, &m); + if (r) { + DRM_ERROR("IB va_start is invalid\n"); + return r; + } - /* the IB should be reserved at this point */ - r = amdgpu_bo_kmap(aobj, (void **)&kptr); - if (r) { - return r; - } + if ((va_start + ib->length_dw * 4) > + (m->last + 1) * AMDGPU_GPU_PAGE_SIZE) { + DRM_ERROR("IB va_start+ib_bytes is invalid\n"); + return -EINVAL; + } - offset = m->start * AMDGPU_GPU_PAGE_SIZE; - kptr += va_start - offset; - - if (ring->funcs->parse_cs) { - memcpy(ib->ptr, kptr, chunk_ib->ib_bytes); - amdgpu_bo_kunmap(aobj); - - r = amdgpu_ring_parse_cs(ring, p, p->job, ib); - if (r) - return r; - } else { - ib->ptr = (uint32_t *)kptr; - r = amdgpu_ring_patch_cs_in_place(ring, p, p->job, ib); - amdgpu_bo_kunmap(aobj); - if (r) - return r; - } + /* the IB should be reserved at this point */ + r = amdgpu_bo_kmap(aobj, (void **)&kptr); + if (r) { + return r; + } + + kptr += va_start - (m->start * AMDGPU_GPU_PAGE_SIZE); - j++; + if (ring->funcs->parse_cs) { + memcpy(ib->ptr, kptr, ib->length_dw * 4); + amdgpu_bo_kunmap(aobj); + + r = amdgpu_ring_parse_cs(ring, p, job, ib); + if (r) + return r; + } else { + ib->ptr = (uint32_t *)kptr; + r = amdgpu_ring_patch_cs_in_place(ring, p, job, ib); + amdgpu_bo_kunmap(aobj); + if (r) + return r; } } - if (!p->job->vm) - return 0; + return 0; +} + +static int amdgpu_cs_patch_jobs(struct amdgpu_cs_parser *p) +{ + unsigned int i; + int r; + + for (i = 0; i < p->gang_size; ++i) { + r = amdgpu_cs_patch_ibs(p, p->jobs[i]); + if (r) + return r; + } + return 0; +} + +static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) +{ + struct amdgpu_fpriv *fpriv = p->filp->driver_priv; + struct amdgpu_job *job = p->gang_leader; + struct amdgpu_device *adev = p->adev; + struct amdgpu_vm *vm = &fpriv->vm; + struct amdgpu_bo_list_entry *e; + struct amdgpu_bo_va *bo_va; + struct amdgpu_bo *bo; + unsigned int i; + int r; r = amdgpu_vm_clear_freed(adev, vm, NULL); if (r) @@ -1070,7 +1101,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_fence(&p->job->sync, fpriv->prt_va->last_pt_update); + r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update); if (r) return r; @@ -1081,7 +1112,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update); + r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); if (r) return r; } @@ -1100,7 +1131,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update); + r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); if (r) return r; } @@ -1113,11 +1144,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_fence(&p->job->sync, vm->last_update); + r = amdgpu_sync_fence(&job->sync, vm->last_update); if (r) return r; - p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo); + for (i = 0; i < p->gang_size; ++i) { + job = p->jobs[i]; + + if (!job->vm) + continue; + + job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo); + } if (amdgpu_vm_debug) { /* Invalidate all BOs to test for userspace bugs */ @@ -1138,7 +1176,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) { struct amdgpu_fpriv *fpriv = p->filp->driver_priv; + struct amdgpu_job *leader = p->gang_leader; struct amdgpu_bo_list_entry *e; + unsigned int i; int r; list_for_each_entry(e, &p->validated, tv.head) { @@ -1148,12 +1188,23 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) sync_mode = amdgpu_bo_explicit_sync(bo) ? AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER; - r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode, + r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode, &fpriv->vm); if (r) return r; } - return 0; + + for (i = 0; i < p->gang_size - 1; ++i) { + r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync); + if (r) + return r; + } + + r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_size - 1]); + if (r && r != -ERESTARTSYS) + DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n"); + + return r; } static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p) @@ -1177,20 +1228,28 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs) { struct amdgpu_fpriv *fpriv = p->filp->driver_priv; - struct drm_sched_entity *entity = p->entity; + struct amdgpu_job *leader = p->gang_leader; struct amdgpu_bo_list_entry *e; - struct amdgpu_job *job; + unsigned int i; uint64_t seq; int r; - job = p->job; - p->job = NULL; + for (i = 0; i < p->gang_size; ++i) + drm_sched_job_arm(&p->jobs[i]->base); - r = drm_sched_job_init(&job->base, entity, &fpriv->vm); - if (r) - goto error_unlock; + for (i = 0; i < (p->gang_size - 1); ++i) { + struct dma_fence *fence; - drm_sched_job_arm(&job->base); + fence = &p->jobs[i]->base.s_fence->scheduled; + r = amdgpu_sync_fence(&leader->sync, fence); + if (r) + goto error_cleanup; + } + + if (p->gang_size > 1) { + for (i = 0; i < p->gang_size; ++i) + amdgpu_job_set_gang_leader(p->jobs[i], leader); + } /* No memory allocation is allowed while holding the notifier lock. * The lock is held until amdgpu_cs_submit is finished and fence is @@ -1201,6 +1260,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, /* If userptr are invalidated after amdgpu_cs_parser_bos(), return * -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl. */ + r = 0; amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) { struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo); @@ -1208,62 +1268,65 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, } if (r) { r = -EAGAIN; - goto error_abort; + goto error_unlock; } - p->fence = dma_fence_get(&job->base.s_fence->finished); + p->fence = dma_fence_get(&leader->base.s_fence->finished); + list_for_each_entry(e, &p->validated, tv.head) { - seq = amdgpu_ctx_add_fence(p->ctx, entity, p->fence); + /* Everybody except for the gang leader uses READ */ + for (i = 0; i < (p->gang_size - 1); ++i) { + dma_resv_add_fence(e->tv.bo->base.resv, + &p->jobs[i]->base.s_fence->finished, + DMA_RESV_USAGE_READ); + } + + /* The gang leader is remembered as writer */ + e->tv.num_shared = 0; + } + + seq = amdgpu_ctx_add_fence(p->ctx, p->entities[p->gang_size - 1], + p->fence); amdgpu_cs_post_dependencies(p); - if ((job->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) && + if ((leader->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) && !p->ctx->preamble_presented) { - job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST; + leader->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST; p->ctx->preamble_presented = true; } cs->out.handle = seq; - job->uf_sequence = seq; - - amdgpu_job_free_resources(job); + leader->uf_sequence = seq; - trace_amdgpu_cs_ioctl(job); amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->ticket); - drm_sched_entity_push_job(&job->base); + for (i = 0; i < p->gang_size; ++i) { + amdgpu_job_free_resources(p->jobs[i]); + trace_amdgpu_cs_ioctl(p->jobs[i]); + drm_sched_entity_push_job(&p->jobs[i]->base); + p->jobs[i] = NULL; + } amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm); - - /* Make sure all BOs are remembered as writers */ - amdgpu_bo_list_for_each_entry(e, p->bo_list) - e->tv.num_shared = 0; - ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence); + mutex_unlock(&p->adev->notifier_lock); mutex_unlock(&p->bo_list->bo_list_mutex); - return 0; -error_abort: - drm_sched_job_cleanup(&job->base); +error_unlock: mutex_unlock(&p->adev->notifier_lock); -error_unlock: - amdgpu_job_free(job); +error_cleanup: + for (i = 0; i < p->gang_size; ++i) + drm_sched_job_cleanup(&p->jobs[i]->base); return r; } /* Cleanup the parser structure */ -static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, - bool backoff) +static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser) { unsigned i; - if (error && backoff) { - ttm_eu_backoff_reservation(&parser->ticket, - &parser->validated); - mutex_unlock(&parser->bo_list->bo_list_mutex); - } - for (i = 0; i < parser->num_post_deps; i++) { drm_syncobj_put(parser->post_deps[i].syncobj); kfree(parser->post_deps[i].chain); @@ -1280,8 +1343,10 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, for (i = 0; i < parser->nchunks; i++) kvfree(parser->chunks[i].kdata); kvfree(parser->chunks); - if (parser->job) - amdgpu_job_free(parser->job); + for (i = 0; i < parser->gang_size; ++i) { + if (parser->jobs[i]) + amdgpu_job_free(parser->jobs[i]); + } if (parser->uf_entry.tv.bo) { struct amdgpu_bo *uf = ttm_to_amdgpu_bo(parser->uf_entry.tv.bo); @@ -1293,7 +1358,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_cs_parser parser; - bool reserved_buffers = false; int r; if (amdgpu_ras_intr_triggered()) @@ -1306,22 +1370,16 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) if (r) { if (printk_ratelimit()) DRM_ERROR("Failed to initialize parser %d!\n", r); - goto out; + return r; } r = amdgpu_cs_pass1(&parser, data); if (r) - goto out; + goto error_fini; - r = amdgpu_cs_ib_fill(adev, &parser); + r = amdgpu_cs_pass2(&parser); if (r) - goto out; - - r = amdgpu_cs_dependencies(adev, &parser); - if (r) { - DRM_ERROR("Failed in the dependencies handling %d!\n", r); - goto out; - } + goto error_fini; r = amdgpu_cs_parser_bos(&parser, data); if (r) { @@ -1329,25 +1387,36 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) DRM_ERROR("Not enough memory for command submission!\n"); else if (r != -ERESTARTSYS && r != -EAGAIN) DRM_ERROR("Failed to process the buffer list %d!\n", r); - goto out; + goto error_fini; } - reserved_buffers = true; - - trace_amdgpu_cs_ibs(&parser); + r = amdgpu_cs_patch_jobs(&parser); + if (r) + goto error_backoff; r = amdgpu_cs_vm_handling(&parser); if (r) - goto out; + goto error_backoff; r = amdgpu_cs_sync_rings(&parser); if (r) - goto out; + goto error_backoff; + + trace_amdgpu_cs_ibs(&parser); r = amdgpu_cs_submit(&parser, data); -out: - amdgpu_cs_parser_fini(&parser, r, reserved_buffers); + if (r) + goto error_backoff; + + amdgpu_cs_parser_fini(&parser); + return 0; + +error_backoff: + ttm_eu_backoff_reservation(&parser.ticket, &parser.validated); + mutex_unlock(&parser.bo_list->bo_list_mutex); +error_fini: + amdgpu_cs_parser_fini(&parser); return r; } |