aboutsummaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_svm.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_svm.c156
1 files changed, 110 insertions, 46 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index b691c8495d66..3cb4681c5f53 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1496,9 +1496,11 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
next = min(vma->vm_end, end);
npages = (next - addr) >> PAGE_SHIFT;
+ WRITE_ONCE(p->svms.faulting_task, current);
r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
addr, npages, &hmm_range,
readonly, true, owner);
+ WRITE_ONCE(p->svms.faulting_task, NULL);
if (r) {
pr_debug("failed %d to get svm range pages\n", r);
goto unreserve_out;
@@ -1572,7 +1574,6 @@ retry_flush_work:
static void svm_range_restore_work(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
- struct amdkfd_process_info *process_info;
struct svm_range_list *svms;
struct svm_range *prange;
struct kfd_process *p;
@@ -1592,12 +1593,10 @@ static void svm_range_restore_work(struct work_struct *work)
* the lifetime of this thread, kfd_process and mm will be valid.
*/
p = container_of(svms, struct kfd_process, svms);
- process_info = p->kgd_process_info;
mm = p->mm;
if (!mm)
return;
- mutex_lock(&process_info->lock);
svm_range_list_lock_and_flush_work(svms, mm);
mutex_lock(&svms->lock);
@@ -1650,7 +1649,6 @@ static void svm_range_restore_work(struct work_struct *work)
out_reschedule:
mutex_unlock(&svms->lock);
mmap_write_unlock(mm);
- mutex_unlock(&process_info->lock);
/* If validation failed, reschedule another attempt */
if (evicted_ranges) {
@@ -1966,10 +1964,16 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
struct kfd_process_device *pdd;
struct amdgpu_device *adev;
struct kfd_process *p;
+ int drain;
uint32_t i;
p = container_of(svms, struct kfd_process, svms);
+restart:
+ drain = atomic_read(&svms->drain_pagefaults);
+ if (!drain)
+ return;
+
for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
pdd = p->pdds[i];
if (!pdd)
@@ -1981,6 +1985,8 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
amdgpu_ih_wait_on_checkpoint_process(adev, &adev->irq.ih1);
pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
}
+ if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain)
+ goto restart;
}
static void svm_range_deferred_list_work(struct work_struct *work)
@@ -1988,35 +1994,41 @@ static void svm_range_deferred_list_work(struct work_struct *work)
struct svm_range_list *svms;
struct svm_range *prange;
struct mm_struct *mm;
+ struct kfd_process *p;
svms = container_of(work, struct svm_range_list, deferred_list_work);
pr_debug("enter svms 0x%p\n", svms);
+ p = container_of(svms, struct kfd_process, svms);
+ /* Avoid mm is gone when inserting mmu notifier */
+ mm = get_task_mm(p->lead_thread);
+ if (!mm) {
+ pr_debug("svms 0x%p process mm gone\n", svms);
+ return;
+ }
+retry:
+ mmap_write_lock(mm);
+
+ /* Checking for the need to drain retry faults must be inside
+ * mmap write lock to serialize with munmap notifiers.
+ */
+ if (unlikely(atomic_read(&svms->drain_pagefaults))) {
+ mmap_write_unlock(mm);
+ svm_range_drain_retry_fault(svms);
+ goto retry;
+ }
+
spin_lock(&svms->deferred_list_lock);
while (!list_empty(&svms->deferred_range_list)) {
prange = list_first_entry(&svms->deferred_range_list,
struct svm_range, deferred_list);
+ list_del_init(&prange->deferred_list);
spin_unlock(&svms->deferred_list_lock);
+
pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
prange->start, prange->last, prange->work_item.op);
- /* Make sure no stale retry fault coming after range is freed */
- if (prange->work_item.op == SVM_OP_UNMAP_RANGE)
- svm_range_drain_retry_fault(prange->svms);
-
- mm = prange->work_item.mm;
- mmap_write_lock(mm);
mutex_lock(&svms->lock);
-
- /* Remove from deferred_list must be inside mmap write lock,
- * otherwise, svm_range_list_lock_and_flush_work may hold mmap
- * write lock, and continue because deferred_list is empty, then
- * deferred_list handle is blocked by mmap write lock.
- */
- spin_lock(&svms->deferred_list_lock);
- list_del_init(&prange->deferred_list);
- spin_unlock(&svms->deferred_list_lock);
-
mutex_lock(&prange->migrate_mutex);
while (!list_empty(&prange->child_list)) {
struct svm_range *pchild;
@@ -2032,12 +2044,13 @@ static void svm_range_deferred_list_work(struct work_struct *work)
svm_range_handle_list_op(svms, prange);
mutex_unlock(&svms->lock);
- mmap_write_unlock(mm);
spin_lock(&svms->deferred_list_lock);
}
spin_unlock(&svms->deferred_list_lock);
+ mmap_write_unlock(mm);
+ mmput(mm);
pr_debug("exit svms 0x%p\n", svms);
}
@@ -2124,6 +2137,12 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
prange, prange->start, prange->last, start, last);
+ /* Make sure pending page faults are drained in the deferred worker
+ * before the range is freed to avoid straggler interrupts on
+ * unmapped memory causing "phantom faults".
+ */
+ atomic_inc(&svms->drain_pagefaults);
+
unmap_parent = start <= prange->start && last >= prange->last;
list_for_each_entry(pchild, &prange->child_list, child_list) {
@@ -2261,7 +2280,7 @@ svm_range_from_addr(struct svm_range_list *svms, unsigned long addr,
* migration if actual loc is not best location, then update GPU page table
* mapping to the best location.
*
- * If vm fault gpu is range preferred loc, the best_loc is preferred loc.
+ * If the preferred loc is accessible by faulting GPU, use preferred loc.
* If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu
* If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then
* if range actual loc is cpu, best_loc is cpu
@@ -2278,7 +2297,7 @@ svm_range_best_restore_location(struct svm_range *prange,
struct amdgpu_device *adev,
int32_t *gpuidx)
{
- struct amdgpu_device *bo_adev;
+ struct amdgpu_device *bo_adev, *preferred_adev;
struct kfd_process *p;
uint32_t gpuid;
int r;
@@ -2291,8 +2310,16 @@ svm_range_best_restore_location(struct svm_range *prange,
return -1;
}
- if (prange->preferred_loc == gpuid)
+ if (prange->preferred_loc == gpuid ||
+ prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) {
return prange->preferred_loc;
+ } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
+ preferred_adev = svm_range_get_adev_by_id(prange,
+ prange->preferred_loc);
+ if (amdgpu_xgmi_same_hive(adev, preferred_adev))
+ return prange->preferred_loc;
+ /* fall through */
+ }
if (test_bit(*gpuidx, prange->bitmap_access))
return gpuid;
@@ -2313,7 +2340,8 @@ svm_range_best_restore_location(struct svm_range *prange,
static int
svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
- unsigned long *start, unsigned long *last)
+ unsigned long *start, unsigned long *last,
+ bool *is_heap_stack)
{
struct vm_area_struct *vma;
struct interval_tree_node *node;
@@ -2324,6 +2352,12 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
pr_debug("VMA does not exist in address [0x%llx]\n", addr);
return -EFAULT;
}
+
+ *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk &&
+ vma->vm_end >= vma->vm_mm->start_brk) ||
+ (vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack);
+
start_limit = max(vma->vm_start >> PAGE_SHIFT,
(unsigned long)ALIGN_DOWN(addr, 2UL << 8));
end_limit = min(vma->vm_end >> PAGE_SHIFT,
@@ -2353,9 +2387,9 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
*start = start_limit;
*last = end_limit - 1;
- pr_debug("vma start: 0x%lx start: 0x%lx vma end: 0x%lx last: 0x%lx\n",
- vma->vm_start >> PAGE_SHIFT, *start,
- vma->vm_end >> PAGE_SHIFT, *last);
+ pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n",
+ vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT,
+ *start, *last, *is_heap_stack);
return 0;
}
@@ -2420,11 +2454,13 @@ svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
struct svm_range *prange = NULL;
unsigned long start, last;
uint32_t gpuid, gpuidx;
+ bool is_heap_stack;
uint64_t bo_s = 0;
uint64_t bo_l = 0;
int r;
- if (svm_range_get_range_boundaries(p, addr, &start, &last))
+ if (svm_range_get_range_boundaries(p, addr, &start, &last,
+ &is_heap_stack))
return NULL;
r = svm_range_check_vm(p, start, last, &bo_s, &bo_l);
@@ -2451,6 +2487,9 @@ svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
return NULL;
}
+ if (is_heap_stack)
+ prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM;
+
svm_range_add_to_svms(prange);
svm_range_add_notifier_locked(mm, prange);
@@ -2523,20 +2562,13 @@ svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
}
static bool
-svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
+svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
{
unsigned long requested = VM_READ;
- struct vm_area_struct *vma;
if (write_fault)
requested |= VM_WRITE;
- vma = find_vma(mm, addr << PAGE_SHIFT);
- if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
- pr_debug("address 0x%llx VMA is removed\n", addr);
- return true;
- }
-
pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
vma->vm_flags);
return (vma->vm_flags & requested) == requested;
@@ -2554,6 +2586,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
int32_t best_loc;
int32_t gpuidx = MAX_GPU_INSTANCE;
bool write_locked = false;
+ struct vm_area_struct *vma;
int r = 0;
if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
@@ -2564,7 +2597,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
p = kfd_lookup_process_by_pasid(pasid);
if (!p) {
pr_debug("kfd process not founded pasid 0x%x\n", pasid);
- return -ESRCH;
+ return 0;
}
if (!p->xnack_enabled) {
pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
@@ -2575,10 +2608,19 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
+ if (atomic_read(&svms->drain_pagefaults)) {
+ pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
+ r = 0;
+ goto out;
+ }
+
+ /* p->lead_thread is available as kfd_process_wq_release flush the work
+ * before releasing task ref.
+ */
mm = get_task_mm(p->lead_thread);
if (!mm) {
pr_debug("svms 0x%p failed to get mm\n", svms);
- r = -ESRCH;
+ r = 0;
goto out;
}
@@ -2616,6 +2658,7 @@ retry_write_locked:
if (svm_range_skip_recover(prange)) {
amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
+ r = 0;
goto out_unlock_range;
}
@@ -2624,10 +2667,21 @@ retry_write_locked:
if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
svms, prange->start, prange->last);
+ r = 0;
+ goto out_unlock_range;
+ }
+
+ /* __do_munmap removed VMA, return success as we are handling stale
+ * retry fault.
+ */
+ vma = find_vma(mm, addr << PAGE_SHIFT);
+ if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
+ pr_debug("address 0x%llx VMA is removed\n", addr);
+ r = 0;
goto out_unlock_range;
}
- if (!svm_fault_allowed(mm, addr, write_fault)) {
+ if (!svm_fault_allowed(vma, write_fault)) {
pr_debug("fault addr 0x%llx no %s permission\n", addr,
write_fault ? "write" : "read");
r = -EPERM;
@@ -2705,6 +2759,14 @@ void svm_range_list_fini(struct kfd_process *p)
/* Ensure list work is finished before process is destroyed */
flush_work(&p->svms.deferred_list_work);
+ /*
+ * Ensure no retry fault comes in afterwards, as page fault handler will
+ * not find kfd process and take mm lock to recover fault.
+ */
+ atomic_inc(&p->svms.drain_pagefaults);
+ svm_range_drain_retry_fault(&p->svms);
+
+
list_for_each_entry_safe(prange, next, &p->svms.list, list) {
svm_range_unlink(prange);
svm_range_remove_notifier(prange);
@@ -2725,6 +2787,7 @@ int svm_range_list_init(struct kfd_process *p)
mutex_init(&svms->lock);
INIT_LIST_HEAD(&svms->list);
atomic_set(&svms->evicted_ranges, 0);
+ atomic_set(&svms->drain_pagefaults, 0);
INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
INIT_LIST_HEAD(&svms->deferred_range_list);
@@ -3076,6 +3139,8 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
struct svm_range *prange =
list_first_entry(&svm_bo->range_list,
struct svm_range, svm_bo_list);
+ int retries = 3;
+
list_del_init(&prange->svm_bo_list);
spin_unlock(&svm_bo->list_lock);
@@ -3083,7 +3148,11 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
prange->start, prange->last);
mutex_lock(&prange->migrate_mutex);
- svm_migrate_vram_to_ram(prange, svm_bo->eviction_fence->mm);
+ do {
+ svm_migrate_vram_to_ram(prange,
+ svm_bo->eviction_fence->mm);
+ } while (prange->actual_loc && --retries);
+ WARN(prange->actual_loc, "Migration failed during eviction");
mutex_lock(&prange->lock);
prange->svm_bo = NULL;
@@ -3108,7 +3177,6 @@ static int
svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,
uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
- struct amdkfd_process_info *process_info = p->kgd_process_info;
struct mm_struct *mm = current->mm;
struct list_head update_list;
struct list_head insert_list;
@@ -3127,8 +3195,6 @@ svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,
svms = &p->svms;
- mutex_lock(&process_info->lock);
-
svm_range_list_lock_and_flush_work(svms, mm);
r = svm_range_is_valid(p, start, size);
@@ -3204,8 +3270,6 @@ out_unlock_range:
mutex_unlock(&svms->lock);
mmap_read_unlock(mm);
out:
- mutex_unlock(&process_info->lock);
-
pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
&p->svms, start, start + size - 1, r);