From e77673d14f2cec6d47d2da4e58dce87c2d66e54f Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 9 Jun 2023 11:11:53 -0400 Subject: drm/amdgpu: Update invalid PTE flag setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the invalid PTE flag setting with TF enabled. This is to ensure, in addition to transitioning the retry fault to a no-retry fault, it also causes the wavefront to enter the trap handler. With the current setting, the fault only transitions to a no-retry fault. Additionally, have 2 sets of invalid PTE settings, one for TF enabled, the other for TF disabled. The setting with TF disabled, doesn't work with TF enabled. Signed-off-by: Mukul Joshi Acked-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index dea1a64be44d..24ddf6a0512a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -778,6 +778,27 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params *params, 1, 0, flags); } +/** + * amdgpu_vm_pte_update_noretry_flags - Update PTE no-retry flags + * + * @adev - amdgpu_device pointer + * @flags: pointer to PTE flags + * + * Update PTE no-retry flags when TF is enabled. + */ +static void amdgpu_vm_pte_update_noretry_flags(struct amdgpu_device *adev, + uint64_t *flags) +{ + /* + * Update no-retry flags with the corresponding TF + * no-retry combination. + */ + if ((*flags & AMDGPU_VM_NORETRY_FLAGS) == AMDGPU_VM_NORETRY_FLAGS) { + *flags &= ~AMDGPU_VM_NORETRY_FLAGS; + *flags |= adev->gmc.noretry_flags; + } +} + /* * amdgpu_vm_pte_update_flags - figure out flags for PTE updates * @@ -804,6 +825,16 @@ static void amdgpu_vm_pte_update_flags(struct amdgpu_vm_update_params *params, flags |= AMDGPU_PTE_EXECUTABLE; } + /* + * Update no-retry flags to use the no-retry flag combination + * with TF enabled. The AMDGPU_VM_NORETRY_FLAGS flag combination + * does not work when TF is enabled. So, replace them with + * AMDGPU_VM_NORETRY_FLAGS_TF flag combination which works for + * all cases. + */ + if (level == AMDGPU_VM_PTB) + amdgpu_vm_pte_update_noretry_flags(adev, &flags); + /* APUs mapping system memory may need different MTYPEs on different * NUMA nodes. Only do this for contiguous ranges that can be assumed * to be on the same NUMA node. -- cgit From eb58ad143dab0c9d649d702cc929f6bd4b62b455 Mon Sep 17 00:00:00 2001 From: Xiaogang Chen Date: Fri, 30 Jun 2023 11:38:35 -0500 Subject: drm/amdgpu: have bos for PDs/PTS cpu accessible when kfd uses cpu to update vm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When kfd uses cpu to update vm iterates all current PDs/PTs bos, adds AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED flag and kmap them to kernel virtual address space before kfd updates the vm that was created by gfx. Signed-off-by: Xiaogang Chen Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 11 ++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 35 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 8eda8f7ac612..92a84e7b0db8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2279,16 +2279,13 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm) goto unreserve_bo; vm->update_funcs = &amdgpu_vm_cpu_funcs; + r = amdgpu_vm_pt_map_tables(adev, vm); + if (r) + goto unreserve_bo; + } else { vm->update_funcs = &amdgpu_vm_sdma_funcs; } - /* - * Make sure root PD gets mapped. As vm_update_mode could be changed - * when turning a GFX VM into a compute VM. - */ - r = vm->update_funcs->map_table(to_amdgpu_bo_vm(vm->root.bo)); - if (r) - goto unreserve_bo; dma_fence_put(vm->last_update); vm->last_update = dma_fence_get_stub(); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index b81fcb962d8f..88ee4507f6b6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -497,6 +497,8 @@ void amdgpu_vm_pt_free_work(struct work_struct *work); void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m); #endif +int amdgpu_vm_pt_map_tables(struct amdgpu_device *adev, struct amdgpu_vm *vm); + /** * amdgpu_vm_tlb_seq - return tlb flush sequence number * @vm: the amdgpu_vm structure to query diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c index 31913ae86de6..6e31621452de 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c @@ -31,6 +31,7 @@ */ static int amdgpu_vm_cpu_map_table(struct amdgpu_bo_vm *table) { + table->bo.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; return amdgpu_bo_kmap(&table->bo, NULL); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 24ddf6a0512a..70fc5856a5b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -1075,3 +1075,31 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, return 0; } + +/** + * amdgpu_vm_pt_map_tables - have bo of root PD cpu accessible + * @adev: amdgpu device structure + * @vm: amdgpu vm structure + * + * make root page directory and everything below it cpu accessible. + */ +int amdgpu_vm_pt_map_tables(struct amdgpu_device *adev, struct amdgpu_vm *vm) +{ + struct amdgpu_vm_pt_cursor cursor; + struct amdgpu_vm_bo_base *entry; + + for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) { + + struct amdgpu_bo_vm *bo; + int r; + + if (entry->bo) { + bo = to_amdgpu_bo_vm(entry->bo); + r = vm->update_funcs->map_table(bo); + if (r) + return r; + } + } + + return 0; +} -- cgit From 5003ca63bce63b20c02c8049be46c44135939a64 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Thu, 13 Jul 2023 15:09:37 +0800 Subject: drm/amdgpu: fix slab-out-of-bounds issue in amdgpu_vm_pt_create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent code set xcp_id stored from file private data when opening device to amdgpu bo for accounting memory usage etc, but not all VMs are attached to this fpriv structure like the vm cases in amdgpu_mes_self_test, otherwise, KASAN will complain below out of bound access. And more importantly, VM code should not touch fpriv structure, so drop fpriv code handling from amdgpu_vm_pt. [ 77.292314] BUG: KASAN: slab-out-of-bounds in amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] [ 77.293845] Read of size 4 at addr ffff888102c48a48 by task modprobe/1069 [ 77.294146] Call Trace: [ 77.294178] [ 77.294208] dump_stack_lvl+0x49/0x63 [ 77.294260] print_report+0x16f/0x4a6 [ 77.294307] ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] [ 77.295979] ? kasan_complete_mode_report_info+0x3c/0x200 [ 77.296057] ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] [ 77.297556] kasan_report+0xb4/0x130 [ 77.297609] ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] [ 77.299202] __asan_load4+0x6f/0x90 [ 77.299272] amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] [ 77.300796] ? amdgpu_init+0x6e/0x1000 [amdgpu] [ 77.302222] ? amdgpu_vm_pt_clear+0x750/0x750 [amdgpu] [ 77.303721] ? preempt_count_sub+0x18/0xc0 [ 77.303786] amdgpu_vm_init+0x39e/0x870 [amdgpu] [ 77.305186] ? amdgpu_vm_wait_idle+0x90/0x90 [amdgpu] [ 77.306683] ? kasan_set_track+0x25/0x30 [ 77.306737] ? kasan_save_alloc_info+0x1b/0x30 [ 77.306795] ? __kasan_kmalloc+0x87/0xa0 [ 77.306852] amdgpu_mes_self_test+0x169/0x620 [amdgpu] v2: without specifying xcp partition for PD/PT bo, the xcp id is -1. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2686 Fixes: 3ebfd221c1a8 ("drm/amdkfd: Store xcp partition id to amdgpu bo") Signed-off-by: Guchun Chen Tested-by: Mikhail Gavrilov Reviewed-by: Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 11 ++++++----- 5 files changed, 14 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 53a024cf0544..cab2fdd5b76a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1236,7 +1236,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) if (r) goto error_pasid; - r = amdgpu_vm_init(adev, &fpriv->vm); + r = amdgpu_vm_init(adev, &fpriv->vm, fpriv->xcp_id); if (r) goto error_pasid; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index e9091ebfe230..f808841310fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -1382,7 +1382,7 @@ int amdgpu_mes_self_test(struct amdgpu_device *adev) goto error_pasid; } - r = amdgpu_vm_init(adev, vm); + r = amdgpu_vm_init(adev, vm, -1); if (r) { DRM_ERROR("failed to initialize vm\n"); goto error_pasid; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 32adc31c093d..74380b21e7a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2121,13 +2121,14 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout) * * @adev: amdgpu_device pointer * @vm: requested vm + * @xcp_id: GPU partition selection id * * Init @vm fields. * * Returns: * 0 for success, error for failure. */ -int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm) +int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t xcp_id) { struct amdgpu_bo *root_bo; struct amdgpu_bo_vm *root; @@ -2177,7 +2178,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm) vm->evicting = false; r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level, - false, &root); + false, &root, xcp_id); if (r) goto error_free_delayed; root_bo = &root->bo; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 88ee4507f6b6..bca258c38919 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -398,7 +398,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, u32 pasid); long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout); -int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm); +int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t xcp_id); int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); @@ -481,7 +481,8 @@ void amdgpu_vm_get_memory(struct amdgpu_vm *vm, int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct amdgpu_bo_vm *vmbo, bool immediate); int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, - int level, bool immediate, struct amdgpu_bo_vm **vmbo); + int level, bool immediate, struct amdgpu_bo_vm **vmbo, + int32_t xcp_id); void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm); bool amdgpu_vm_pt_is_root_clean(struct amdgpu_device *adev, struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 70fc5856a5b9..eb52dfe64948 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -498,11 +498,12 @@ exit: * @level: the page table level * @immediate: use a immediate update * @vmbo: pointer to the buffer object pointer + * @xcp_id: GPU partition id */ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, - int level, bool immediate, struct amdgpu_bo_vm **vmbo) + int level, bool immediate, struct amdgpu_bo_vm **vmbo, + int32_t xcp_id) { - struct amdgpu_fpriv *fpriv = container_of(vm, struct amdgpu_fpriv, vm); struct amdgpu_bo_param bp; struct amdgpu_bo *bo; struct dma_resv *resv; @@ -535,7 +536,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, bp.type = ttm_bo_type_kernel; bp.no_wait_gpu = immediate; - bp.xcp_id_plus1 = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id + 1; + bp.xcp_id_plus1 = xcp_id + 1; if (vm->root.bo) bp.resv = vm->root.bo->tbo.base.resv; @@ -561,7 +562,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, bp.type = ttm_bo_type_kernel; bp.resv = bo->tbo.base.resv; bp.bo_ptr_size = sizeof(struct amdgpu_bo); - bp.xcp_id_plus1 = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id + 1; + bp.xcp_id_plus1 = xcp_id + 1; r = amdgpu_bo_create(adev, &bp, &(*vmbo)->shadow); @@ -606,7 +607,7 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev, return 0; amdgpu_vm_eviction_unlock(vm); - r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, &pt); + r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, &pt, 0); amdgpu_vm_eviction_lock(vm); if (r) return r; -- cgit From e379b5e7dc7e934940290b38d033907930d37c99 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Thu, 13 Jul 2023 15:55:58 +0800 Subject: drm/amdgpu/vm: use the same xcp_id from root PD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Other PDs/PTs allocation should just use the same xcp_id as that stored in root PD. Suggested-by: Christian König Signed-off-by: Guchun Chen Reviewed-by: Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index eb52dfe64948..83e1923f6775 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -607,7 +607,8 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev, return 0; amdgpu_vm_eviction_unlock(vm); - r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, &pt, 0); + r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, &pt, + vm->root.bo->xcp_id); amdgpu_vm_eviction_lock(vm); if (r) return r; -- cgit From f135b0fc3110e39b7ac79a932bd0b7eadb337896 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 20 Jul 2023 09:05:14 +0800 Subject: drm/amdgpu: Fix one kernel-doc comment Use colon to separate parameter name from their specific meaning. silence the warning: drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c:793: warning: Function parameter or member 'adev' not described in 'amdgpu_vm_pte_update_noretry_flags' Reviewed-by: Randy Dunlap Signed-off-by: Yang Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 83e1923f6775..96d601e209b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -783,7 +783,7 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params *params, /** * amdgpu_vm_pte_update_noretry_flags - Update PTE no-retry flags * - * @adev - amdgpu_device pointer + * @adev: amdgpu_device pointer * @flags: pointer to PTE flags * * Update PTE no-retry flags when TF is enabled. -- cgit