 - Alistair Popple has a series which addresses a race which causes page
   refcounting errors in ZONE_DEVICE pages.
 
 - Peter Xu fixes some userfaultfd test harness instability.
 
 - Various other patches in MM, mainly fixes.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCY0j6igAKCRDdBJ7gKXxA
 jnGxAP99bV39ZtOsoY4OHdZlWU16BUjKuf/cb3bZlC2G849vEwD+OKlij86SG20j
 MGJQ6TfULJ8f1dnQDd6wvDfl3FMl7Qc=
 =tbdp
 -----END PGP SIGNATURE-----

Merge tag 'mm-stable-2022-10-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull more MM updates from Andrew Morton:

 - fix a race which causes page refcounting errors in ZONE_DEVICE pages
   (Alistair Popple)

 - fix userfaultfd test harness instability (Peter Xu)

 - various other patches in MM, mainly fixes

* tag 'mm-stable-2022-10-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (29 commits)
  highmem: fix kmap_to_page() for kmap_local_page() addresses
  mm/page_alloc: fix incorrect PGFREE and PGALLOC for high-order page
  mm/selftest: uffd: explain the write missing fault check
  mm/hugetlb: use hugetlb_pte_stable in migration race check
  mm/hugetlb: fix race condition of uffd missing/minor handling
  zram: always expose rw_page
  LoongArch: update local TLB if PTE entry exists
  mm: use update_mmu_tlb() on the second thread
  kasan: fix array-bounds warnings in tests
  hmm-tests: add test for migrate_device_range()
  nouveau/dmem: evict device private memory during release
  nouveau/dmem: refactor nouveau_dmem_fault_copy_one()
  mm/migrate_device.c: add migrate_device_range()
  mm/migrate_device.c: refactor migrate_vma and migrate_deivce_coherent_page()
  mm/memremap.c: take a pgmap reference on page allocation
  mm: free device private pages have zero refcount
  mm/memory.c: fix race when faulting a device private page
  mm/damon: use damon_sz_region() in appropriate place
  mm/damon: move sz_damon_region to damon_sz_region
  lib/test_meminit: add checks for the allocation functions
  ...
This commit is contained in:
Linus Torvalds 2022-10-14 12:28:43 -07:00
commit 5e714bf171
32 changed files with 723 additions and 257 deletions

View file

@ -412,6 +412,9 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
__update_tlb(vma, address, ptep);
}
#define __HAVE_ARCH_UPDATE_MMU_TLB
#define update_mmu_tlb update_mmu_cache
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{

View file

@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
unsigned long start,
unsigned long end, unsigned long page_shift,
struct kvm *kvm, unsigned long gpa)
struct kvm *kvm, unsigned long gpa, struct page *fault_page)
{
unsigned long src_pfn, dst_pfn = 0;
struct migrate_vma mig;
struct migrate_vma mig = { 0 };
struct page *dpage, *spage;
struct kvmppc_uvmem_page_pvt *pvt;
unsigned long pfn;
@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
mig.dst = &dst_pfn;
mig.pgmap_owner = &kvmppc_uvmem_pgmap;
mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
mig.fault_page = fault_page;
/* The requested page is already paged-out, nothing to do */
if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
@ -580,12 +581,14 @@ out_finalize:
static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long page_shift,
struct kvm *kvm, unsigned long gpa)
struct kvm *kvm, unsigned long gpa,
struct page *fault_page)
{
int ret;
mutex_lock(&kvm->arch.uvmem_lock);
ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
fault_page);
mutex_unlock(&kvm->arch.uvmem_lock);
return ret;
@ -634,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
pvt->remove_gfn = true;
if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE,
PAGE_SHIFT, kvm, pvt->gpa))
PAGE_SHIFT, kvm, pvt->gpa, NULL))
pr_err("Can't page out gpa:0x%lx addr:0x%lx\n",
pvt->gpa, addr);
} else {
@ -715,7 +718,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
dpage = pfn_to_page(uvmem_pfn);
dpage->zone_device_data = pvt;
lock_page(dpage);
zone_device_page_init(dpage);
return dpage;
out_clear:
spin_lock(&kvmppc_uvmem_bitmap_lock);
@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
bool pagein)
{
unsigned long src_pfn, dst_pfn = 0;
struct migrate_vma mig;
struct migrate_vma mig = { 0 };
struct page *spage;
unsigned long pfn;
struct page *dpage;
@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
if (kvmppc_svm_page_out(vmf->vma, vmf->address,
vmf->address + PAGE_SIZE, PAGE_SHIFT,
pvt->kvm, pvt->gpa))
pvt->kvm, pvt->gpa, vmf->page))
return VM_FAULT_SIGBUS;
else
return 0;
@ -1065,7 +1068,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
if (!vma || vma->vm_start > start || vma->vm_end < end)
goto out;
if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa))
if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL))
ret = H_SUCCESS;
out:
mmap_read_unlock(kvm->mm);

View file

@ -52,9 +52,6 @@ static unsigned int num_devices = 1;
static size_t huge_class_size;
static const struct block_device_operations zram_devops;
#ifdef CONFIG_ZRAM_WRITEBACK
static const struct block_device_operations zram_wb_devops;
#endif
static void zram_free_page(struct zram *zram, size_t index);
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
@ -546,17 +543,6 @@ static ssize_t backing_dev_store(struct device *dev,
zram->backing_dev = backing_dev;
zram->bitmap = bitmap;
zram->nr_pages = nr_pages;
/*
* With writeback feature, zram does asynchronous IO so it's no longer
* synchronous device so let's remove synchronous io flag. Othewise,
* upper layer(e.g., swap) could wait IO completion rather than
* (submit and return), which will cause system sluggish.
* Furthermore, when the IO function returns(e.g., swap_readpage),
* upper layer expects IO was done so it could deallocate the page
* freely but in fact, IO is going on so finally could cause
* use-after-free when the IO is really done.
*/
zram->disk->fops = &zram_wb_devops;
up_write(&zram->init_lock);
pr_info("setup backing device %s\n", file_name);
@ -1270,6 +1256,9 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
struct bio_vec bvec;
zram_slot_unlock(zram, index);
/* A null bio means rw_page was used, we must fallback to bio */
if (!bio)
return -EOPNOTSUPP;
bvec.bv_page = page;
bvec.bv_len = PAGE_SIZE;
@ -1856,15 +1845,6 @@ static const struct block_device_operations zram_devops = {
.owner = THIS_MODULE
};
#ifdef CONFIG_ZRAM_WRITEBACK
static const struct block_device_operations zram_wb_devops = {
.open = zram_open,
.submit_bio = zram_submit_bio,
.swap_slot_free_notify = zram_slot_free_notify,
.owner = THIS_MODULE
};
#endif
static DEVICE_ATTR_WO(compact);
static DEVICE_ATTR_RW(disksize);
static DEVICE_ATTR_RO(initstate);

View file

@ -223,7 +223,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
page = pfn_to_page(pfn);
svm_range_bo_ref(prange->svm_bo);
page->zone_device_data = prange->svm_bo;
lock_page(page);
zone_device_page_init(page);
}
static void
@ -410,7 +410,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
uint64_t npages = (end - start) >> PAGE_SHIFT;
struct kfd_process_device *pdd;
struct dma_fence *mfence = NULL;
struct migrate_vma migrate;
struct migrate_vma migrate = { 0 };
unsigned long cpages = 0;
dma_addr_t *scratch;
void *buf;
@ -666,7 +666,7 @@ out_oom:
static long
svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
struct vm_area_struct *vma, uint64_t start, uint64_t end,
uint32_t trigger)
uint32_t trigger, struct page *fault_page)
{
struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
uint64_t npages = (end - start) >> PAGE_SHIFT;
@ -674,7 +674,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
unsigned long cpages = 0;
struct kfd_process_device *pdd;
struct dma_fence *mfence = NULL;
struct migrate_vma migrate;
struct migrate_vma migrate = { 0 };
dma_addr_t *scratch;
void *buf;
int r = -ENOMEM;
@ -697,6 +697,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
migrate.src = buf;
migrate.dst = migrate.src + npages;
migrate.fault_page = fault_page;
scratch = (dma_addr_t *)(migrate.dst + npages);
kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid,
@ -764,7 +765,7 @@ out:
* 0 - OK, otherwise error code
*/
int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
uint32_t trigger)
uint32_t trigger, struct page *fault_page)
{
struct amdgpu_device *adev;
struct vm_area_struct *vma;
@ -805,7 +806,8 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
}
next = min(vma->vm_end, end);
r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger);
r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger,
fault_page);
if (r < 0) {
pr_debug("failed %ld to migrate prange %p\n", r, prange);
break;
@ -849,7 +851,7 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc);
do {
r = svm_migrate_vram_to_ram(prange, mm, trigger);
r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL);
if (r)
return r;
} while (prange->actual_loc && --retries);
@ -950,7 +952,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
}
r = svm_migrate_vram_to_ram(prange, vmf->vma->vm_mm,
KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU);
KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
vmf->page);
if (r)
pr_debug("failed %d migrate svms 0x%p range 0x%p [0x%lx 0x%lx]\n",
r, prange->svms, prange, prange->start, prange->last);

View file

@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR {
int svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc,
struct mm_struct *mm, uint32_t trigger);
int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
uint32_t trigger);
uint32_t trigger, struct page *fault_page);
unsigned long
svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr);

View file

@ -2913,13 +2913,15 @@ retry_write_locked:
*/
if (prange->actual_loc)
r = svm_migrate_vram_to_ram(prange, mm,
KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
NULL);
else
r = 0;
}
} else {
r = svm_migrate_vram_to_ram(prange, mm,
KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
NULL);
}
if (r) {
pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
@ -3278,7 +3280,8 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
return 0;
if (!best_loc) {
r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
r = svm_migrate_vram_to_ram(prange, mm,
KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
*migrated = !r;
return r;
}
@ -3339,7 +3342,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
mutex_lock(&prange->migrate_mutex);
do {
r = svm_migrate_vram_to_ram(prange, mm,
KFD_MIGRATE_TRIGGER_TTM_EVICTION);
KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
} while (!r && prange->actual_loc && --retries);
if (!r && prange->actual_loc)

View file

@ -139,44 +139,24 @@ static void nouveau_dmem_fence_done(struct nouveau_fence **fence)
}
}
static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
struct vm_fault *vmf, struct migrate_vma *args,
dma_addr_t *dma_addr)
static int nouveau_dmem_copy_one(struct nouveau_drm *drm, struct page *spage,
struct page *dpage, dma_addr_t *dma_addr)
{
struct device *dev = drm->dev->dev;
struct page *dpage, *spage;
struct nouveau_svmm *svmm;
spage = migrate_pfn_to_page(args->src[0]);
if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE))
return 0;
dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address);
if (!dpage)
return VM_FAULT_SIGBUS;
lock_page(dpage);
*dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
if (dma_mapping_error(dev, *dma_addr))
goto error_free_page;
return -EIO;
svmm = spage->zone_device_data;
mutex_lock(&svmm->mutex);
nouveau_svmm_invalidate(svmm, args->start, args->end);
if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr,
NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage)))
goto error_dma_unmap;
mutex_unlock(&svmm->mutex);
NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) {
dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
return -EIO;
}
args->dst[0] = migrate_pfn(page_to_pfn(dpage));
return 0;
error_dma_unmap:
mutex_unlock(&svmm->mutex);
dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
error_free_page:
__free_page(dpage);
return VM_FAULT_SIGBUS;
}
static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
@ -184,9 +164,11 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
struct nouveau_drm *drm = page_to_drm(vmf->page);
struct nouveau_dmem *dmem = drm->dmem;
struct nouveau_fence *fence;
struct nouveau_svmm *svmm;
struct page *spage, *dpage;
unsigned long src = 0, dst = 0;
dma_addr_t dma_addr = 0;
vm_fault_t ret;
vm_fault_t ret = 0;
struct migrate_vma args = {
.vma = vmf->vma,
.start = vmf->address,
@ -207,10 +189,26 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
if (!args.cpages)
return 0;
ret = nouveau_dmem_fault_copy_one(drm, vmf, &args, &dma_addr);
if (ret || dst == 0)
spage = migrate_pfn_to_page(src);
if (!spage || !(src & MIGRATE_PFN_MIGRATE))
goto done;
dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address);
if (!dpage)
goto done;
dst = migrate_pfn(page_to_pfn(dpage));
svmm = spage->zone_device_data;
mutex_lock(&svmm->mutex);
nouveau_svmm_invalidate(svmm, args.start, args.end);
ret = nouveau_dmem_copy_one(drm, spage, dpage, &dma_addr);
mutex_unlock(&svmm->mutex);
if (ret) {
ret = VM_FAULT_SIGBUS;
goto done;
}
nouveau_fence_new(dmem->migrate.chan, false, &fence);
migrate_vma_pages(&args);
nouveau_dmem_fence_done(&fence);
@ -326,7 +324,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
return NULL;
}
lock_page(page);
zone_device_page_init(page);
return page;
}
@ -369,6 +367,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm)
mutex_unlock(&drm->dmem->mutex);
}
/*
* Evict all pages mapping a chunk.
*/
static void
nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
{
unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
unsigned long *src_pfns, *dst_pfns;
dma_addr_t *dma_addrs;
struct nouveau_fence *fence;
src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
npages);
for (i = 0; i < npages; i++) {
if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
struct page *dpage;
/*
* _GFP_NOFAIL because the GPU is going away and there
* is nothing sensible we can do if we can't copy the
* data back.
*/
dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
nouveau_dmem_copy_one(chunk->drm,
migrate_pfn_to_page(src_pfns[i]), dpage,
&dma_addrs[i]);
}
}
nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
migrate_device_pages(src_pfns, dst_pfns, npages);
nouveau_dmem_fence_done(&fence);
migrate_device_finalize(src_pfns, dst_pfns, npages);
kfree(src_pfns);
kfree(dst_pfns);
for (i = 0; i < npages; i++)
dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
kfree(dma_addrs);
}
void
nouveau_dmem_fini(struct nouveau_drm *drm)
{
@ -380,8 +424,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm)
mutex_lock(&drm->dmem->mutex);
list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) {
nouveau_dmem_evict_chunk(chunk);
nouveau_bo_unpin(chunk->bo);
nouveau_bo_ref(NULL, &chunk->bo);
WARN_ON(chunk->callocated);
list_del(&chunk->list);
memunmap_pages(&chunk->pagemap);
release_mem_region(chunk->pagemap.range.start,

View file

@ -363,13 +363,14 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
struct page *page;
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
if (page)
put_page(page);
else if (num_ra_pages > 1)

View file

@ -258,13 +258,14 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
struct page *page;
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
if (page)
put_page(page);
else if (num_ra_pages > 1)

View file

@ -484,6 +484,12 @@ static inline struct damon_region *damon_first_region(struct damon_target *t)
return list_first_entry(&t->regions_list, struct damon_region, list);
}
/* Length of the address range covered by @r, i.e. its end minus its start. */
static inline unsigned long damon_sz_region(struct damon_region *r)
{
	unsigned long size = r->ar.end - r->ar.start;

	return size;
}
#define damon_for_each_region(r, t) \
list_for_each_entry(r, &t->regions_list, list)

View file

@ -187,6 +187,7 @@ static inline bool folio_is_device_coherent(const struct folio *folio)
}
#ifdef CONFIG_ZONE_DEVICE
void zone_device_page_init(struct page *page);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);

View file

@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES];
#ifdef CONFIG_MIGRATION
extern void putback_movable_pages(struct list_head *l);
int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode, int extra_count);
int migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode);
extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
@ -197,11 +199,24 @@ struct migrate_vma {
*/
void *pgmap_owner;
unsigned long flags;
/*
* Set to vmf->page if this is being called to migrate a page as part of
* a migrate_to_ram() callback.
*/
struct page *fault_page;
};
int migrate_vma_setup(struct migrate_vma *args);
void migrate_vma_pages(struct migrate_vma *migrate);
void migrate_vma_finalize(struct migrate_vma *migrate);
int migrate_device_range(unsigned long *src_pfns, unsigned long start,
unsigned long npages);
void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
unsigned long npages);
void migrate_device_finalize(unsigned long *src_pfns,
unsigned long *dst_pfns, unsigned long npages);
#endif /* CONFIG_MIGRATION */
#endif /* _LINUX_MIGRATE_H */

View file

@ -870,8 +870,6 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
/* Per-thread vma caching: */
#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
#endif

View file

@ -100,6 +100,7 @@ struct dmirror {
/*
 * One chunk of device memory.  A page is mapped back to its chunk through
 * page->pgmap with container_of() on the embedded pagemap.
 */
struct dmirror_chunk {
/* Embedded pagemap; passed to memunmap_pages() when the chunk is removed. */
struct dev_pagemap pagemap;
/* Device this chunk belongs to. */
struct dmirror_device *mdevice;
/*
 * Set to true (under mdevice->lock) once the chunk is being removed;
 * dmirror_devmem_free() then stops returning its pages to the free list.
 */
bool remove;
};
/*
@ -192,11 +193,15 @@ static int dmirror_fops_release(struct inode *inode, struct file *filp)
return 0;
}
/* Map a device page to the dmirror_chunk that embeds the page's pagemap. */
static struct dmirror_chunk *dmirror_page_to_chunk(struct page *page)
{
	struct dev_pagemap *pgmap = page->pgmap;

	return container_of(pgmap, struct dmirror_chunk, pagemap);
}
static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
return container_of(page->pgmap, struct dmirror_chunk,
pagemap)->mdevice;
return dmirror_page_to_chunk(page)->mdevice;
}
static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
@ -627,8 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
goto error;
}
zone_device_page_init(dpage);
dpage->zone_device_data = rpage;
lock_page(dpage);
return dpage;
error:
@ -907,7 +912,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror,
struct vm_area_struct *vma;
unsigned long src_pfns[64] = { 0 };
unsigned long dst_pfns[64] = { 0 };
struct migrate_vma args;
struct migrate_vma args = { 0 };
unsigned long next;
int ret;
@ -968,7 +973,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
unsigned long src_pfns[64] = { 0 };
unsigned long dst_pfns[64] = { 0 };
struct dmirror_bounce bounce;
struct migrate_vma args;
struct migrate_vma args = { 0 };
unsigned long next;
int ret;
@ -1218,6 +1223,85 @@ static int dmirror_snapshot(struct dmirror *dmirror,
return ret;
}
/*
 * Migrate every in-use device page of @chunk back to system memory.
 *
 * Pages that migrate_device_range() flags with MIGRATE_PFN_MIGRATE are copied
 * into a newly allocated, locked system page, and the whole range is then
 * handed to migrate_device_pages() and migrate_device_finalize().
 */
static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
{
	unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT;
	unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT;
	/* +1: the range end is treated as inclusive here. */
	unsigned long npages = end_pfn - start_pfn + 1;
	unsigned long i;
	unsigned long *src_pfns;
	unsigned long *dst_pfns;

	/*
	 * The function cannot report failure and both arrays are dereferenced
	 * unconditionally below, so the allocations must not return NULL.
	 */
	src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
	dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);

	migrate_device_range(src_pfns, start_pfn, npages);
	for (i = 0; i < npages; i++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(src_pfns[i]);
		if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
			continue;

		/* Only device private/coherent pages are expected here. */
		if (WARN_ON(!is_device_private_page(spage) &&
			    !is_device_coherent_page(spage)))
			continue;

		/* Copy from the page that BACKING_PAGE() yields for spage. */
		spage = BACKING_PAGE(spage);
		dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
		lock_page(dpage);
		copy_highpage(dpage, spage);
		dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
		/* Carry the write permission over to the destination entry. */
		if (src_pfns[i] & MIGRATE_PFN_WRITE)
			dst_pfns[i] |= MIGRATE_PFN_WRITE;
	}
	migrate_device_pages(src_pfns, dst_pfns, npages);
	migrate_device_finalize(src_pfns, dst_pfns, npages);
	kvfree(src_pfns);
	kvfree(dst_pfns);
}
/* Removes free pages from the free list so they can't be re-allocated */
static void dmirror_remove_free_pages(struct dmirror_chunk *devmem)
{
struct dmirror_device *mdevice = devmem->mdevice;
struct page *page;
for (page = mdevice->free_pages; page; page = page->zone_device_data)
if (dmirror_page_to_chunk(page) == devmem)
mdevice->free_pages = page->zone_device_data;
}
static void dmirror_device_remove_chunks(struct dmirror_device *mdevice)
{
unsigned int i;
mutex_lock(&mdevice->devmem_lock);
if (mdevice->devmem_chunks) {
for (i = 0; i < mdevice->devmem_count; i++) {
struct dmirror_chunk *devmem =
mdevice->devmem_chunks[i];
spin_lock(&mdevice->lock);
devmem->remove = true;
dmirror_remove_free_pages(devmem);
spin_unlock(&mdevice->lock);
dmirror_device_evict_chunk(devmem);
memunmap_pages(&devmem->pagemap);
if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
release_mem_region(devmem->pagemap.range.start,
range_len(&devmem->pagemap.range));
kfree(devmem);
}
mdevice->devmem_count = 0;
mdevice->devmem_capacity = 0;
mdevice->free_pages = NULL;
kfree(mdevice->devmem_chunks);
mdevice->devmem_chunks = NULL;
}
mutex_unlock(&mdevice->devmem_lock);
}
static long dmirror_fops_unlocked_ioctl(struct file *filp,
unsigned int command,
unsigned long arg)
@ -1272,6 +1356,11 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
ret = dmirror_snapshot(dmirror, &cmd);
break;
case HMM_DMIRROR_RELEASE:
dmirror_device_remove_chunks(dmirror->mdevice);
ret = 0;
break;
default:
return -EINVAL;
}
@ -1326,15 +1415,19 @@ static void dmirror_devmem_free(struct page *page)
mdevice = dmirror_page_to_device(page);
spin_lock(&mdevice->lock);
mdevice->cfree++;
page->zone_device_data = mdevice->free_pages;
mdevice->free_pages = page;
/* Return page to our allocator if not freeing the chunk */
if (!dmirror_page_to_chunk(page)->remove) {
mdevice->cfree++;
page->zone_device_data = mdevice->free_pages;
mdevice->free_pages = page;
}
spin_unlock(&mdevice->lock);
}
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
struct migrate_vma args;
struct migrate_vma args = { 0 };
unsigned long src_pfns = 0;
unsigned long dst_pfns = 0;
struct page *rpage;
@ -1357,6 +1450,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
args.dst = &dst_pfns;
args.pgmap_owner = dmirror->mdevice;
args.flags = dmirror_select_device(dmirror);
args.fault_page = vmf->page;
if (migrate_vma_setup(&args))
return VM_FAULT_SIGBUS;
@ -1407,22 +1501,7 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id)
static void dmirror_device_remove(struct dmirror_device *mdevice)
{
unsigned int i;
if (mdevice->devmem_chunks) {
for (i = 0; i < mdevice->devmem_count; i++) {
struct dmirror_chunk *devmem =
mdevice->devmem_chunks[i];
memunmap_pages(&devmem->pagemap);
if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
release_mem_region(devmem->pagemap.range.start,
range_len(&devmem->pagemap.range));
kfree(devmem);
}
kfree(mdevice->devmem_chunks);
}
dmirror_device_remove_chunks(mdevice);
cdev_device_del(&mdevice->cdevice, &mdevice->device);
}

View file

@ -36,6 +36,7 @@ struct hmm_dmirror_cmd {
#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_RELEASE _IOWR('H', 0x07, struct hmm_dmirror_cmd)
/*
* Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.

View file

@ -67,17 +67,24 @@ static int __init do_alloc_pages_order(int order, int *total_failures)
size_t size = PAGE_SIZE << order;
page = alloc_pages(GFP_KERNEL, order);
if (!page)
goto err;
buf = page_address(page);
fill_with_garbage(buf, size);
__free_pages(page, order);
page = alloc_pages(GFP_KERNEL, order);
if (!page)
goto err;
buf = page_address(page);
if (count_nonzero_bytes(buf, size))
(*total_failures)++;
fill_with_garbage(buf, size);
__free_pages(page, order);
return 1;
err:
(*total_failures)++;
return 1;
}
/* Test the page allocator by calling alloc_pages with different orders. */
@ -100,15 +107,22 @@ static int __init do_kmalloc_size(size_t size, int *total_failures)
void *buf;
buf = kmalloc(size, GFP_KERNEL);
if (!buf)
goto err;
fill_with_garbage(buf, size);
kfree(buf);
buf = kmalloc(size, GFP_KERNEL);
if (!buf)
goto err;
if (count_nonzero_bytes(buf, size))
(*total_failures)++;
fill_with_garbage(buf, size);
kfree(buf);
return 1;
err:
(*total_failures)++;
return 1;
}
/* Test vmalloc() with given parameters. */
@ -117,15 +131,22 @@ static int __init do_vmalloc_size(size_t size, int *total_failures)
void *buf;
buf = vmalloc(size);
if (!buf)
goto err;
fill_with_garbage(buf, size);
vfree(buf);
buf = vmalloc(size);
if (!buf)
goto err;
if (count_nonzero_bytes(buf, size))
(*total_failures)++;
fill_with_garbage(buf, size);
vfree(buf);
return 1;
err:
(*total_failures)++;
return 1;
}
/* Test kmalloc()/vmalloc() by allocating objects of different sizes. */

View file

@ -1847,7 +1847,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
set_pageblock_skip(freepage);
break;
}
}

View file

@ -491,7 +491,7 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
sz += r->ar.end - r->ar.start;
sz += damon_sz_region(r);
}
if (ctx->attrs.min_nr_regions)
@ -674,7 +674,7 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s)
{
unsigned long sz;
sz = r->ar.end - r->ar.start;
sz = damon_sz_region(r);
return s->pattern.min_sz_region <= sz &&
sz <= s->pattern.max_sz_region &&
s->pattern.min_nr_accesses <= r->nr_accesses &&
@ -702,7 +702,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
unsigned long sz = r->ar.end - r->ar.start;
unsigned long sz = damon_sz_region(r);
struct timespec64 begin, end;
unsigned long sz_applied = 0;
@ -731,14 +731,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
sz = ALIGN_DOWN(quota->charge_addr_from -
r->ar.start, DAMON_MIN_REGION);
if (!sz) {
if (r->ar.end - r->ar.start <=
DAMON_MIN_REGION)
if (damon_sz_region(r) <=
DAMON_MIN_REGION)
continue;
sz = DAMON_MIN_REGION;
}
damon_split_region_at(t, r, sz);
r = damon_next_region(r);
sz = r->ar.end - r->ar.start;
sz = damon_sz_region(r);
}
quota->charge_target_from = NULL;
quota->charge_addr_from = 0;
@ -843,8 +843,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
continue;
score = c->ops.get_scheme_score(
c, t, r, s);
quota->histogram[score] +=
r->ar.end - r->ar.start;
quota->histogram[score] += damon_sz_region(r);
if (score > max_score)
max_score = score;
}
@ -865,18 +864,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
}
}
static inline unsigned long sz_damon_region(struct damon_region *r)
{
return r->ar.end - r->ar.start;
}
/*
* Merge two adjacent regions into one region
*/
static void damon_merge_two_regions(struct damon_target *t,
struct damon_region *l, struct damon_region *r)
{
unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r);
unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r);
l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
(sz_l + sz_r);
@ -905,7 +899,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
if (prev && prev->ar.end == r->ar.start &&
abs(prev->nr_accesses - r->nr_accesses) <= thres &&
sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
damon_sz_region(prev) + damon_sz_region(r) <= sz_limit)
damon_merge_two_regions(t, prev, r);
else
prev = r;
@ -963,7 +957,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs)
int i;
damon_for_each_region_safe(r, next, t) {
sz_region = r->ar.end - r->ar.start;
sz_region = damon_sz_region(r);
for (i = 0; i < nr_subs - 1 &&
sz_region > 2 * DAMON_MIN_REGION; i++) {

View file

@ -72,7 +72,7 @@ static int damon_va_evenly_split_region(struct damon_target *t,
return -EINVAL;
orig_end = r->ar.end;
sz_orig = r->ar.end - r->ar.start;
sz_orig = damon_sz_region(r);
sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);
if (!sz_piece)
@ -618,7 +618,7 @@ static unsigned long damos_madvise(struct damon_target *target,
{
struct mm_struct *mm;
unsigned long start = PAGE_ALIGN(r->ar.start);
unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start);
unsigned long len = PAGE_ALIGN(damon_sz_region(r));
unsigned long applied;
mm = damon_get_mm(target);

View file

@ -30,6 +30,17 @@
#include <asm/tlbflush.h>
#include <linux/vmalloc.h>
#ifdef CONFIG_KMAP_LOCAL
/*
 * Turn a per-task kmap_local index into a fixmap slot index: each CPU gets
 * its own window of KM_MAX_IDX slots, selected by smp_processor_id().
 */
static inline int kmap_local_calc_idx(int idx)
{
	return KM_MAX_IDX * smp_processor_id() + idx;
}
#ifndef arch_kmap_local_map_idx
#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
#endif
#endif /* CONFIG_KMAP_LOCAL */
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@ -142,12 +153,29 @@ pte_t *pkmap_page_table;
struct page *__kmap_to_page(void *vaddr)
{
unsigned long base = (unsigned long) vaddr & PAGE_MASK;
struct kmap_ctrl *kctrl = &current->kmap_ctrl;
unsigned long addr = (unsigned long)vaddr;
int i;
if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
int i = PKMAP_NR(addr);
/* kmap() mappings */
if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) &&
addr < PKMAP_ADDR(LAST_PKMAP)))
return pte_page(pkmap_page_table[PKMAP_NR(addr)]);
return pte_page(pkmap_page_table[i]);
/* kmap_local_page() mappings */
if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) &&
base < __fix_to_virt(FIX_KMAP_BEGIN))) {
for (i = 0; i < kctrl->idx; i++) {
unsigned long base_addr;
int idx;
idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
if (base_addr == base)
return pte_page(kctrl->pteval[i]);
}
}
return virt_to_page(vaddr);
@ -462,10 +490,6 @@ static inline void kmap_local_idx_pop(void)
# define arch_kmap_local_post_unmap(vaddr) do { } while (0)
#endif
#ifndef arch_kmap_local_map_idx
#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
#endif
#ifndef arch_kmap_local_unmap_idx
#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx)
#endif
@ -494,11 +518,6 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr)
return false;
}
/*
 * Map slot number @idx to an index that is offset by KM_MAX_IDX entries
 * for every CPU number up to the one returned by smp_processor_id().
 */
static inline int kmap_local_calc_idx(int idx)
{
	return KM_MAX_IDX * smp_processor_id() + idx;
}
static pte_t *__kmap_pte;
static pte_t *kmap_get_pte(unsigned long vaddr, int idx)

View file

@ -5096,6 +5096,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
#ifdef CONFIG_PTE_MARKER_UFFD_WP
/*
* If the pte was wr-protected by uffd-wp in any of the
* swap forms, meanwhile the caller does not want to
@ -5107,6 +5108,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
else
#endif
huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
@ -5135,11 +5137,13 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
#endif
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, vma, true);
@ -5531,6 +5535,23 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
return handle_userfault(&vmf, reason);
}
/*
 * Re-read the huge pte at @ptep while holding its page table lock and
 * compare it against @old_pte, the value the caller sampled earlier.
 *
 * Returns true if the pte didn't change, or false if the pte changed or
 * is changing.
 */
static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
			       pte_t *ptep, pte_t old_pte)
{
	/* Take the lock first; the comparison below must happen under it. */
	spinlock_t *lock = huge_pte_lock(h, mm, ptep);
	bool unchanged = pte_same(huge_ptep_get(ptep), old_pte);

	spin_unlock(lock);
	return unchanged;
}
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
@ -5571,10 +5592,33 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (idx >= size)
goto out;
/* Check for page in userfault range */
if (userfaultfd_missing(vma))
return hugetlb_handle_userfault(vma, mapping, idx,
flags, haddr, address,
VM_UFFD_MISSING);
if (userfaultfd_missing(vma)) {
/*
* Since hugetlb_no_page() was examining pte
* without pgtable lock, we need to re-test under
* lock because the pte may not be stable and could
* have changed from under us. Try to detect
* either changed or during-changing ptes and retry
* properly when needed.
*
* Note that userfaultfd is actually fine with
* false positives (e.g. caused by pte changed),
* but not wrong logical events (e.g. caused by
* reading a pte during changing). The latter can
* confuse the userspace, so the strictness is very
* much preferred. E.g., MISSING event should
* never happen on the page after UFFDIO_COPY has
* correctly installed the page and returned.
*/
if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
ret = 0;
goto out;
}
return hugetlb_handle_userfault(vma, mapping, idx, flags,
haddr, address,
VM_UFFD_MISSING);
}
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
@ -5590,11 +5634,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
ptl = huge_pte_lock(h, mm, ptep);
ret = 0;
if (huge_pte_none(huge_ptep_get(ptep)))
if (hugetlb_pte_stable(h, mm, ptep, old_pte))
ret = vmf_error(PTR_ERR(page));
spin_unlock(ptl);
else
ret = 0;
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@ -5640,9 +5683,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (userfaultfd_minor(vma)) {
unlock_page(page);
put_page(page);
return hugetlb_handle_userfault(vma, mapping, idx,
flags, haddr, address,
VM_UFFD_MINOR);
/* See comment in userfaultfd_missing() block above */
if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
ret = 0;
goto out;
}
return hugetlb_handle_userfault(vma, mapping, idx, flags,
haddr, address,
VM_UFFD_MINOR);
}
}
@ -6804,7 +6852,7 @@ void hugetlb_vma_lock_release(struct kref *kref)
kfree(vma_lock);
}
void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
{
struct vm_area_struct *vma = vma_lock->vma;

View file

@ -295,6 +295,9 @@ static void krealloc_more_oob_helper(struct kunit *test,
ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
/* Suppress -Warray-bounds warnings. */
OPTIMIZER_HIDE_VAR(ptr2);
/* All offsets up to size2 must be accessible. */
ptr2[size1 - 1] = 'x';
ptr2[size1] = 'x';
@ -327,6 +330,9 @@ static void krealloc_less_oob_helper(struct kunit *test,
ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
/* Suppress -Warray-bounds warnings. */
OPTIMIZER_HIDE_VAR(ptr2);
/* Must be accessible for all modes. */
ptr2[size2 - 1] = 'x';
@ -540,13 +546,14 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
{
char *ptr;
size_t size = 64;
volatile size_t invalid_size = size;
size_t invalid_size = size;
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
memset((char *)ptr, 0, 64);
OPTIMIZER_HIDE_VAR(ptr);
OPTIMIZER_HIDE_VAR(invalid_size);
KUNIT_EXPECT_KASAN_FAIL(test,
memmove((char *)ptr, (char *)ptr + 4, invalid_size));
kfree(ptr);

View file

@ -1393,10 +1393,12 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte,
struct zap_details *details, pte_t pteval)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (zap_drop_file_uffd_wp(details))
return;
pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
#endif
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@ -3748,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
vmf->page = pfn_swap_entry_to_page(entry);
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
spin_unlock(vmf->ptl);
goto out;
}
/*
* Get a page reference while we know the page can't be
* freed.
*/
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->page->pgmap->ops->migrate_to_ram(vmf);
put_page(vmf->page);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_swapin_error_entry(entry)) {
@ -4118,7 +4134,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (!pte_none(*vmf->pte)) {
update_mmu_cache(vma, vmf->address, vmf->pte);
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
}

View file

@ -138,8 +138,11 @@ void memunmap_pages(struct dev_pagemap *pgmap)
int i;
percpu_ref_kill(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
pgmap->type != MEMORY_DEVICE_COHERENT)
for (i = 0; i < pgmap->nr_range; i++)
percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
wait_for_completion(&pgmap->done);
for (i = 0; i < pgmap->nr_range; i++)
@ -264,7 +267,9 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), pgmap);
percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
pgmap->type != MEMORY_DEVICE_COHERENT)
percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
return 0;
err_add_memory:
@ -502,12 +507,29 @@ void free_zone_device_page(struct page *page)
page->mapping = NULL;
page->pgmap->ops->page_free(page);
/*
* Reset the page count to 1 to prepare for handing out the page again.
*/
set_page_count(page, 1);
if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
page->pgmap->type != MEMORY_DEVICE_COHERENT)
/*
* Reset the page count to 1 to prepare for handing out the page
* again.
*/
set_page_count(page, 1);
else
put_dev_pagemap(page->pgmap);
}
/*
* zone_device_page_init() - prepare a ZONE_DEVICE page for handing out
* @page: the page being allocated
*
* Tries to take a live reference on page->pgmap->ref, sets the page
* refcount to 1 and locks the page, in that order.
*/
void zone_device_page_init(struct page *page)
{
/*
* Drivers shouldn't be allocating pages after calling
* memunmap_pages().
*/
/* The tryget is performed inside the WARN; only its failure is reported. */
WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
set_page_count(page, 1);
lock_page(page);
}
EXPORT_SYMBOL_GPL(zone_device_page_init);
#ifdef CONFIG_FS_DAX
bool __put_devmap_managed_page_refs(struct page *page, int refs)
{

View file

@ -625,6 +625,25 @@ EXPORT_SYMBOL(folio_migrate_copy);
* Migration functions
***********************************************************/
/*
 * migrate_folio_extra() - folio migration with a caller-supplied extra count
 * @mapping: passed through to folio_migrate_mapping()
 * @dst: destination folio
 * @src: source folio; must not be under writeback
 * @mode: MIGRATE_SYNC_NO_COPY transfers only the flags, any other mode
 *        goes through folio_migrate_copy()
 * @extra_count: forwarded to folio_migrate_mapping()
 *
 * Returns MIGRATEPAGE_SUCCESS, or the non-success value returned by
 * folio_migrate_mapping().
 */
int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
		struct folio *src, enum migrate_mode mode, int extra_count)
{
	int status;

	/* Writeback must be complete */
	BUG_ON(folio_test_writeback(src));

	status = folio_migrate_mapping(mapping, dst, src, extra_count);
	if (status == MIGRATEPAGE_SUCCESS) {
		if (mode == MIGRATE_SYNC_NO_COPY)
			folio_migrate_flags(dst, src);
		else
			folio_migrate_copy(dst, src);
	}

	return status;
}
/**
* migrate_folio() - Simple folio migration.
* @mapping: The address_space containing the folio.
@ -640,20 +659,7 @@ EXPORT_SYMBOL(folio_migrate_copy);
int migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode)
{
int rc;
BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
rc = folio_migrate_mapping(mapping, dst, src, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
if (mode != MIGRATE_SYNC_NO_COPY)
folio_migrate_copy(dst, src);
else
folio_migrate_flags(dst, src);
return MIGRATEPAGE_SUCCESS;
return migrate_folio_extra(mapping, dst, src, mode, 0);
}
EXPORT_SYMBOL(migrate_folio);

View file

@ -325,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
* folio_migrate_mapping(), except that here we allow migration of a
* ZONE_DEVICE page.
*/
static bool migrate_vma_check_page(struct page *page)
static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
{
/*
* One extra ref because caller holds an extra reference, either from
* isolate_lru_page() for a regular page, or migrate_vma_collect() for
* a device page.
*/
int extra = 1;
int extra = 1 + (page == fault_page);
/*
* FIXME support THP (transparent huge page), it is bit more complex to
@ -357,26 +357,20 @@ static bool migrate_vma_check_page(struct page *page)
}
/*
* migrate_vma_unmap() - replace page mapping with special migration pte entry
* @migrate: migrate struct containing all migration information
*
* Isolate pages from the LRU and replace mappings (CPU page table pte) with a
* special migration pte entry and check if it has been pinned. Pinned pages are
* restored because we cannot migrate them.
*
* This is the last step before we call the device driver callback to allocate
* destination memory and copy contents of original page over to new page.
* Unmaps pages for migration. Returns number of unmapped pages.
*/
static void migrate_vma_unmap(struct migrate_vma *migrate)
static unsigned long migrate_device_unmap(unsigned long *src_pfns,
unsigned long npages,
struct page *fault_page)
{
const unsigned long npages = migrate->npages;
unsigned long i, restore = 0;
bool allow_drain = true;
unsigned long unmapped = 0;
lru_add_drain();
for (i = 0; i < npages; i++) {
struct page *page = migrate_pfn_to_page(migrate->src[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
if (!page)
@ -391,8 +385,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
}
if (isolate_lru_page(page)) {
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
migrate->cpages--;
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
@ -405,34 +398,55 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
if (folio_mapped(folio))
try_to_migrate(folio, 0);
if (page_mapped(page) || !migrate_vma_check_page(page)) {
if (page_mapped(page) ||
!migrate_vma_check_page(page, fault_page)) {
if (!is_zone_device_page(page)) {
get_page(page);
putback_lru_page(page);
}
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
migrate->cpages--;
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
unmapped++;
}
for (i = 0; i < npages && restore; i++) {
struct page *page = migrate_pfn_to_page(migrate->src[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE))
continue;
folio = page_folio(page);
remove_migration_ptes(folio, folio, false);
migrate->src[i] = 0;
src_pfns[i] = 0;
folio_unlock(folio);
folio_put(folio);
restore--;
}
return unmapped;
}
/*
* migrate_vma_unmap() - replace page mapping with special migration pte entry
* @migrate: migrate struct containing all migration information
*
* Isolate pages from the LRU and replace mappings (CPU page table pte) with a
* special migration pte entry and check if it has been pinned. Pinned pages are
* restored because we cannot migrate them.
*
* This is the last step before we call the device driver callback to allocate
* destination memory and copy contents of original page over to new page.
*
* The work is delegated to migrate_device_unmap(); its return value (the
* number of pages it unmapped) is stored in @migrate->cpages.
*/
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages,
migrate->fault_page);
}
/**
@ -517,6 +531,8 @@ int migrate_vma_setup(struct migrate_vma *args)
return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
if (args->fault_page && !is_device_private_page(args->fault_page))
return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages);
args->cpages = 0;
@ -677,42 +693,38 @@ abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
/**
* migrate_vma_pages() - migrate meta-data from src page to dst page
* @migrate: migrate struct containing all migration information
*
* This migrates struct page meta-data from source struct page to destination
* struct page. This effectively finishes the migration from source page to the
* destination page.
*/
void migrate_vma_pages(struct migrate_vma *migrate)
static void __migrate_device_pages(unsigned long *src_pfns,
unsigned long *dst_pfns, unsigned long npages,
struct migrate_vma *migrate)
{
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
struct mmu_notifier_range range;
unsigned long addr, i;
unsigned long i;
bool notified = false;
for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
struct page *page = migrate_pfn_to_page(migrate->src[i]);
for (i = 0; i < npages; i++) {
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
int r;
if (!newpage) {
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
if (!page) {
unsigned long addr;
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
continue;
/*
* The only time there is no vma is when called from
* migrate_device_coherent_page(). However this isn't
* called if the page could not be unmapped.
*/
VM_BUG_ON(!migrate->vma);
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
VM_BUG_ON(!migrate);
addr = migrate->start + i*PAGE_SIZE;
if (!notified) {
notified = true;
@ -723,7 +735,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
mmu_notifier_invalidate_range_start(&range);
}
migrate_vma_insert_page(migrate, addr, newpage,
&migrate->src[i]);
&src_pfns[i]);
continue;
}
@ -736,21 +748,26 @@ void migrate_vma_pages(struct migrate_vma *migrate)
* device private or coherent memory.
*/
if (mapping) {
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
} else if (is_zone_device_page(newpage)) {
/*
* Other types of ZONE_DEVICE page are not supported.
*/
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
r = migrate_folio(mapping, page_folio(newpage),
page_folio(page), MIGRATE_SYNC_NO_COPY);
if (migrate && migrate->fault_page == page)
r = migrate_folio_extra(mapping, page_folio(newpage),
page_folio(page),
MIGRATE_SYNC_NO_COPY, 1);
else
r = migrate_folio(mapping, page_folio(newpage),
page_folio(page), MIGRATE_SYNC_NO_COPY);
if (r != MIGRATEPAGE_SUCCESS)
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
}
/*
@ -761,28 +778,56 @@ void migrate_vma_pages(struct migrate_vma *migrate)
if (notified)
mmu_notifier_invalidate_range_only_end(&range);
}
EXPORT_SYMBOL(migrate_vma_pages);
/**
* migrate_vma_finalize() - restore CPU page table entry
* migrate_device_pages() - migrate meta-data from src page to dst page
* @src_pfns: src_pfns returned from migrate_device_range()
* @dst_pfns: array of pfns allocated by the driver to migrate memory to
* @npages: number of pages in the range
*
* Equivalent to migrate_vma_pages(). This is called to migrate struct page
* meta-data from source struct page to destination.
*/
void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
unsigned long npages)
{
/* This entry point has no struct migrate_vma, so NULL is passed for it. */
__migrate_device_pages(src_pfns, dst_pfns, npages, NULL);
}
EXPORT_SYMBOL(migrate_device_pages);
/**
* migrate_vma_pages() - migrate meta-data from src page to dst page
* @migrate: migrate struct containing all migration information
*
* This replaces the special migration pte entry with either a mapping to the
* new page if migration was successful for that page, or to the original page
* otherwise.
*
* This also unlocks the pages and puts them back on the lru, or drops the extra
* refcount, for device pages.
* This migrates struct page meta-data from source struct page to destination
* struct page. This effectively finishes the migration from source page to the
* destination page.
*/
void migrate_vma_finalize(struct migrate_vma *migrate)
void migrate_vma_pages(struct migrate_vma *migrate)
{
/*
* Forward the pfn arrays and page count held in @migrate, and pass
* @migrate itself so the helper can use migrate->start and
* migrate->fault_page.
*/
__migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate);
}
EXPORT_SYMBOL(migrate_vma_pages);
/*
* migrate_device_finalize() - complete page migration
* @src_pfns: src_pfns returned from migrate_device_range()
* @dst_pfns: array of pfns allocated by the driver to migrate memory to
* @npages: number of pages in the range
*
* Completes migration of the page by removing special migration entries.
* Drivers must ensure copying of page data is complete and visible to the CPU
* before calling this.
*/
void migrate_device_finalize(unsigned long *src_pfns,
unsigned long *dst_pfns, unsigned long npages)
{
const unsigned long npages = migrate->npages;
unsigned long i;
for (i = 0; i < npages; i++) {
struct folio *dst, *src;
struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
struct page *page = migrate_pfn_to_page(migrate->src[i]);
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
if (!page) {
if (newpage) {
@ -792,7 +837,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
continue;
}
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
if (newpage) {
unlock_page(newpage);
put_page(newpage);
@ -819,8 +864,72 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
}
}
EXPORT_SYMBOL(migrate_device_finalize);
/**
* migrate_vma_finalize() - restore CPU page table entry
* @migrate: migrate struct containing all migration information
*
* This replaces the special migration pte entry with either a mapping to the
* new page if migration was successful for that page, or to the original page
* otherwise.
*
* This also unlocks the pages and puts them back on the lru, or drops the extra
* refcount, for device pages.
*/
void migrate_vma_finalize(struct migrate_vma *migrate)
{
/* Forward the pfn arrays and page count to the pfn-based implementation. */
migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
}
EXPORT_SYMBOL(migrate_vma_finalize);
/**
* migrate_device_range() - migrate device private pfns to normal memory.
* @src_pfns: array large enough to hold migrating source device private pfns.
* @start: starting pfn in the range to migrate.
* @npages: number of pages to migrate.
*
* migrate_device_range() is similar in concept to migrate_vma_setup() except
* that instead of looking up pages based on virtual address mappings a range of
* device pfns that should be migrated to system memory is used instead.
*
* This is useful when a driver needs to free device memory but doesn't know the
* virtual mappings of every page that may be in device memory. For example this
* is often the case when a driver is being unloaded or unbound from a device.
*
* Like migrate_vma_setup() this function will take a reference and lock any
* migrating pages that aren't free before unmapping them. Drivers may then
* allocate destination pages and start copying data from the device to CPU
* memory before calling migrate_device_pages().
*
* Entries of @src_pfns are set to 0 for pages whose refcount is already zero
* or for which trylock_page() fails; the caller must skip those entries.
*
* Return: always 0.
*/
int migrate_device_range(unsigned long *src_pfns, unsigned long start,
unsigned long npages)
{
unsigned long i, pfn;
for (pfn = start, i = 0; i < npages; pfn++, i++) {
struct page *page = pfn_to_page(pfn);
/* A zero refcount means there is nothing to migrate for this pfn. */
if (!get_page_unless_zero(page)) {
src_pfns[i] = 0;
continue;
}
/* Could not lock the page: drop the reference just taken and skip it. */
if (!trylock_page(page)) {
src_pfns[i] = 0;
put_page(page);
continue;
}
src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
}
/* No faulting page on this path; the unmapped count is not used here. */
migrate_device_unmap(src_pfns, npages, NULL);
return 0;
}
EXPORT_SYMBOL(migrate_device_range);
/*
* Migrate a device coherent page back to normal memory. The caller should have
* a reference on page which will be copied to the new page if migration is
@ -829,25 +938,19 @@ EXPORT_SYMBOL(migrate_vma_finalize);
int migrate_device_coherent_page(struct page *page)
{
unsigned long src_pfn, dst_pfn = 0;
struct migrate_vma args;
struct page *dpage;
WARN_ON_ONCE(PageCompound(page));
lock_page(page);
src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
args.src = &src_pfn;
args.dst = &dst_pfn;
args.cpages = 1;
args.npages = 1;
args.vma = NULL;
/*
* We don't have a VMA and don't need to walk the page tables to find
* the source page. So call migrate_vma_unmap() directly to unmap the
* page as migrate_vma_setup() will fail if args.vma == NULL.
*/
migrate_vma_unmap(&args);
migrate_device_unmap(&src_pfn, 1, NULL);
if (!(src_pfn & MIGRATE_PFN_MIGRATE))
return -EBUSY;
@ -857,10 +960,10 @@ int migrate_device_coherent_page(struct page *page)
dst_pfn = migrate_pfn(page_to_pfn(dpage));
}
migrate_vma_pages(&args);
migrate_device_pages(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
copy_highpage(dpage, page);
migrate_vma_finalize(&args);
migrate_device_finalize(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
return 0;

View file

@ -2673,7 +2673,7 @@ cannot_expand:
if (!arch_validate_flags(vma->vm_flags)) {
error = -EINVAL;
if (file)
goto unmap_and_free_vma;
goto close_and_free_vma;
else
goto free_vma;
}
@ -2742,6 +2742,9 @@ expanded:
validate_mm(mm);
return addr;
close_and_free_vma:
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
unmap_and_free_vma:
fput(vma->vm_file);
vma->vm_file = NULL;
@ -2942,17 +2945,18 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
if (vma &&
(!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
mas->index = vma->vm_start;
mas->last = addr + len - 1;
vma_adjust_trans_huge(vma, addr, addr + len, 0);
mas_set_range(mas, vma->vm_start, addr + len - 1);
if (mas_preallocate(mas, vma, GFP_KERNEL))
return -ENOMEM;
vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
if (vma->anon_vma) {
anon_vma_lock_write(vma->anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
}
vma->vm_end = addr + len;
vma->vm_flags |= VM_SOFTDIRTY;
if (mas_store_gfp(mas, vma, GFP_KERNEL))
goto mas_expand_failed;
mas_store_prealloc(mas, vma);
if (vma->anon_vma) {
anon_vma_interval_tree_post_update_vma(vma);
@ -2993,13 +2997,6 @@ mas_store_fail:
vma_alloc_fail:
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
mas_expand_failed:
if (vma->anon_vma) {
anon_vma_interval_tree_post_update_vma(vma);
anon_vma_unlock_write(vma->anon_vma);
}
return -ENOMEM;
}
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
@ -3240,6 +3237,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
out_vma_link:
if (new_vma->vm_ops && new_vma->vm_ops->close)
new_vma->vm_ops->close(new_vma);
if (new_vma->vm_file)
fput(new_vma->vm_file);
unlink_anon_vmas(new_vma);
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:

View file

@ -1,6 +1,7 @@
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/kmsan-checks.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
@ -265,6 +266,15 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
bool fullmm)
{
/*
* struct mmu_gather contains 7 1-bit fields packed into a 32-bit
* unsigned int value. The remaining 25 bits remain uninitialized
* and are never used, but KMSAN updates the origin for them in
* zap_pXX_range() in mm/memory.c, thus creating very long origin
* chains. This is technically correct, but consumes too much memory.
* Unpoisoning the whole structure will prevent creating such chains.
*/
kmsan_unpoison_memory(tlb, sizeof(*tlb));
tlb->mm = mm;
tlb->fullmm = fullmm;

View file

@ -267,6 +267,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
} else {
/* It must be a none page, or what else?.. */
WARN_ON_ONCE(!pte_none(oldpte));
#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
/*
* For file-backed mem, we need to be able to
@ -278,6 +279,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
make_pte_marker(PTE_MARKER_UFFD_WP));
pages++;
}
#endif
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();

View file

@ -3446,7 +3446,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
int pindex;
bool free_high;
__count_vm_event(PGFREE);
__count_vm_events(PGFREE, 1 << order);
pindex = order_to_pindex(migratetype, order);
list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
@ -3803,7 +3803,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp_spin_unlock_irqrestore(pcp, flags);
pcp_trylock_finish(UP_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
}
return page;
@ -6823,6 +6823,14 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
/*
* ZONE_DEVICE pages are released directly to the driver page allocator
* which will set the page count to 1 when allocating the page.
*/
if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
pgmap->type == MEMORY_DEVICE_COHERENT)
set_page_count(page, 0);
}
/*

View file

@ -1054,6 +1054,55 @@ TEST_F(hmm, migrate_fault)
hmm_buffer_free(buffer);
}
/*
* Migrate anonymous private memory to the device, ask the driver to release
* its device memory (HMM_DMIRROR_RELEASE), then read the buffer back from the
* CPU side and check its contents.
*/
TEST_F(hmm, migrate_release)
{
struct hmm_buffer *buffer;
unsigned long npages;
unsigned long size;
unsigned long i;
int *ptr;
int ret;
npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
ASSERT_NE(npages, 0);
size = npages << self->page_shift;
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
buffer->fd = -1;
buffer->size = size;
buffer->mirror = malloc(size);
ASSERT_NE(buffer->mirror, NULL);
buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
ASSERT_NE(buffer->ptr, MAP_FAILED);
/* Initialize buffer in system memory. */
for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
ptr[i] = i;
/* Migrate memory to device. */
ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
/* Check what the device read. */
for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
ASSERT_EQ(ptr[i], i);
/* Release device memory. */
ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages);
ASSERT_EQ(ret, 0);
/* Fault pages back to system memory and check them. */
/* NOTE(review): only the first half of the buffer is read back here - confirm this is intended. */
for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
ASSERT_EQ(ptr[i], i);
hmm_buffer_free(buffer);
}
/*
* Migrate anonymous shared memory to device private memory.
*/

View file

@ -774,7 +774,27 @@ static void uffd_handle_page_fault(struct uffd_msg *msg,
continue_range(uffd, msg->arg.pagefault.address, page_size);
stats->minor_faults++;
} else {
/* Missing page faults */
/*
* Missing page faults.
*
* Here we force a write check for each of the missing mode
* faults. It's guaranteed because the only threads that
* will trigger uffd faults are the locking threads, and
* their first instruction to touch the missing page will
* always be pthread_mutex_lock().
*
* Note that here we relied on an NPTL glibc impl detail to
* always read the lock type at the entry of the lock op
* (pthread_mutex_t.__data.__type, offset 0x10) before
* doing any locking operations to guarantee that. It's
* actually not good to rely on this impl detail because
* logically a pthread-compatible lib can implement the
* locks without types and we can fail when linking with
* them. However since we used to find bugs with this
* strict check we still keep it around. Hopefully this
* could be a good hint when it fails again. If one day
* it'll break on some other impl of glibc we'll revisit.
*/
if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
err("unexpected write fault");