aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/book3s32/hash_low.S2
-rw-r--r--arch/powerpc/mm/book3s32/kuap.c20
-rw-r--r--arch/powerpc/mm/book3s32/mmu_context.c2
-rw-r--r--arch/powerpc/mm/book3s64/hash_pgtable.c2
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c11
-rw-r--r--arch/powerpc/mm/book3s64/mmu_context.c10
-rw-r--r--arch/powerpc/mm/book3s64/pgtable.c111
-rw-r--r--arch/powerpc/mm/book3s64/pkeys.c2
-rw-r--r--arch/powerpc/mm/book3s64/radix_hugetlbpage.c1
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c639
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c283
-rw-r--r--arch/powerpc/mm/book3s64/slb.c1
-rw-r--r--arch/powerpc/mm/cacheflush.c41
-rw-r--r--arch/powerpc/mm/fault.c7
-rw-r--r--arch/powerpc/mm/init_32.c2
-rw-r--r--arch/powerpc/mm/init_64.c164
-rw-r--r--arch/powerpc/mm/ioremap.c26
-rw-r--r--arch/powerpc/mm/ioremap_32.c19
-rw-r--r--arch/powerpc/mm/ioremap_64.c12
-rw-r--r--arch/powerpc/mm/mmu_context.c8
-rw-r--r--arch/powerpc/mm/mmu_decl.h1
-rw-r--r--arch/powerpc/mm/nohash/e500_hugetlbpage.c3
-rw-r--r--arch/powerpc/mm/nohash/kup.c8
-rw-r--r--arch/powerpc/mm/nohash/tlb.c19
-rw-r--r--arch/powerpc/mm/numa.c1
-rw-r--r--arch/powerpc/mm/pgtable-frag.c73
-rw-r--r--arch/powerpc/mm/pgtable.c61
27 files changed, 1102 insertions, 427 deletions
diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S
index a5a21d444e72..8b804e1a9fa4 100644
--- a/arch/powerpc/mm/book3s32/hash_low.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -14,6 +14,7 @@
* hash table, so this file is not used on them.)
*/
+#include <linux/export.h>
#include <linux/pgtable.h>
#include <linux/init.h>
#include <asm/reg.h>
@@ -22,7 +23,6 @@
#include <asm/ppc_asm.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
-#include <asm/export.h>
#include <asm/feature-fixups.h>
#include <asm/code-patching-asm.h>
diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c
index 28676cabb005..3a8815555a48 100644
--- a/arch/powerpc/mm/book3s32/kuap.c
+++ b/arch/powerpc/mm/book3s32/kuap.c
@@ -3,25 +3,11 @@
#include <asm/kup.h>
#include <asm/smp.h>
-struct static_key_false disable_kuap_key;
-EXPORT_SYMBOL(disable_kuap_key);
-
-void kuap_lock_all_ool(void)
-{
- kuap_lock_all();
-}
-EXPORT_SYMBOL(kuap_lock_all_ool);
-
-void kuap_unlock_all_ool(void)
-{
- kuap_unlock_all();
-}
-EXPORT_SYMBOL(kuap_unlock_all_ool);
-
void setup_kuap(bool disabled)
{
if (!disabled) {
- kuap_lock_all_ool();
+ update_user_segments(mfsr(0) | SR_KS);
+ isync(); /* Context sync required after mtsr() */
init_mm.context.sr0 |= SR_KS;
current->thread.sr0 |= SR_KS;
}
@@ -30,7 +16,7 @@ void setup_kuap(bool disabled)
return;
if (disabled)
- static_branch_enable(&disable_kuap_key);
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP;
else
pr_info("Activating Kernel Userspace Access Protection\n");
}
diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c
index 269a3eb25a73..1922f9a6b058 100644
--- a/arch/powerpc/mm/book3s32/mmu_context.c
+++ b/arch/powerpc/mm/book3s32/mmu_context.c
@@ -71,7 +71,7 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
mm->context.id = __init_new_context();
mm->context.sr0 = CTX_TO_VSID(mm->context.id, 0);
- if (!kuep_is_disabled())
+ if (IS_ENABLED(CONFIG_PPC_KUEP))
mm->context.sr0 |= SR_NX;
if (!kuap_is_disabled())
mm->context.sr0 |= SR_KS;
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 51f48984abca..988948d69bc1 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -214,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr
old = be64_to_cpu(old_be);
- trace_hugepage_update(addr, old, clr, set);
+ trace_hugepage_update_pmd(addr, old, clr, set);
if (old & H_PAGE_HASHPTE)
hpte_do_hugepage_flush(mm, addr, pmdp, old);
return old;
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index fedffe3ae136..ad2afa08e62e 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1307,18 +1307,19 @@ void hash__early_init_mmu_secondary(void)
*/
unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
{
- struct page *page;
+ struct folio *folio;
if (!pfn_valid(pte_pfn(pte)))
return pp;
- page = pte_page(pte);
+ folio = page_folio(pte_page(pte));
/* page is dirty */
- if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) {
+ if (!test_bit(PG_dcache_clean, &folio->flags) &&
+ !folio_test_reserved(folio)) {
if (trap == INTERRUPT_INST_STORAGE) {
- flush_dcache_icache_page(page);
- set_bit(PG_dcache_clean, &page->flags);
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags);
} else
pp |= HPTE_R_N;
}
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index c766e4c26e42..1715b07c630c 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -246,15 +246,15 @@ static void destroy_contexts(mm_context_t *ctx)
static void pmd_frag_destroy(void *pmd_frag)
{
int count;
- struct page *page;
+ struct ptdesc *ptdesc;
- page = virt_to_page(pmd_frag);
+ ptdesc = virt_to_ptdesc(pmd_frag);
/* drop all the pending references */
count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
- if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
+ if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 85c84e89e3ea..8f8a62d3ff4d 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -9,6 +9,7 @@
#include <linux/memremap.h>
#include <linux/pkeys.h>
#include <linux/debugfs.h>
+#include <linux/proc_fs.h>
#include <misc/cxl-base.h>
#include <asm/pgalloc.h>
@@ -64,11 +65,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
return changed;
}
+int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp, pud_t entry, int dirty)
+{
+ int changed;
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pud_devmap(*pudp));
+ assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
+#endif
+ changed = !pud_same(*(pudp), entry);
+ if (changed) {
+ /*
+ * We can use MMU_PAGE_1G here, because only radix
+ * path look at the psize.
+ */
+ __ptep_set_access_flags(vma, pudp_ptep(pudp),
+ pud_pte(entry), address, MMU_PAGE_1G);
+ }
+ return changed;
+}
+
+
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}
+
+int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp)
+{
+ return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
+}
+
/*
* set a new huge pmd. We should not be called for updating
* an existing pmd entry. That should go via pmd_hugepage_update.
@@ -90,6 +119,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}
+void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
+{
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+
+ WARN_ON(pte_hw_valid(pud_pte(*pudp)));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+ WARN_ON(!(pud_large(pud)));
+#endif
+ trace_hugepage_set_pud(addr, pud_val(pud));
+ return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
+}
+
static void do_serialize(void *arg)
{
/* We've taken the IPI, so try to trim the mask while here */
@@ -147,11 +193,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
return pmd;
}
+pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp, int full)
+{
+ pud_t pud;
+
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
+ !pud_present(*pudp));
+ pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
+ /*
+ * if it not a fullmm flush, then we can possibly end up converting
+ * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
+ * Make sure we flush the tlb in this case.
+ */
+ if (!full)
+ flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
+ return pud;
+}
+
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}
+static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
+{
+ return __pud(pud_val(pud) | pgprot_val(pgprot));
+}
+
/*
* At some point we should be able to get rid of
* pmd_mkhuge() and mk_huge_pmd() when we update all the
@@ -166,6 +236,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
}
+pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
+{
+ unsigned long pudv;
+
+ pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
+
+ return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
+}
+
pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
return pfn_pmd(page_to_pfn(page), pgprot);
@@ -306,22 +385,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
{
void *ret = NULL;
- struct page *page;
+ struct ptdesc *ptdesc;
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
- page = alloc_page(gfp);
- if (!page)
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_pages(page, 0);
+ if (!pagetable_pmd_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- atomic_set(&page->pt_frag_refcount, 1);
+ atomic_set(&ptdesc->pt_frag_refcount, 1);
- ret = page_address(page);
+ ret = ptdesc_address(ptdesc);
/*
* if we support only one fragment just return the
* allocated page.
@@ -331,12 +410,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
spin_lock(&mm->page_table_lock);
/*
- * If we find pgtable_page set, we return
+ * If we find ptdesc_page set, we return
* the allocated page with single fragment
* count.
*/
if (likely(!mm->context.pmd_frag)) {
- atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
+ atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR);
mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
}
spin_unlock(&mm->page_table_lock);
@@ -357,15 +436,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
void pmd_fragment_free(unsigned long *pmd)
{
- struct page *page = virt_to_page(pmd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
- if (PageReserved(page))
- return free_reserved_page(page);
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
- BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
- if (atomic_dec_and_test(&page->pt_frag_refcount)) {
- pgtable_pmd_page_dtor(page);
- __free_page(page);
+ BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 1d2675ab6711..125733962033 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -291,7 +291,7 @@ void setup_kuap(bool disabled)
if (smp_processor_id() == boot_cpuid) {
pr_info("Activating Kernel Userspace Access Prevention\n");
- cur_cpu_spec->mmu_features |= MMU_FTR_BOOK3S_KUAP;
+ cur_cpu_spec->mmu_features |= MMU_FTR_KUAP;
}
/*
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index 5e3195568525..17075c78d4bc 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -39,6 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st
radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize);
else
radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+ mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
}
void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index e7ea492ac510..c6a4ac766b2b 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -37,7 +37,6 @@
#include <mm/mmu_decl.h>
unsigned int mmu_base_pid;
-unsigned long radix_mem_block_size __ro_after_init;
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
unsigned long region_start, unsigned long region_end)
@@ -300,7 +299,7 @@ static int __meminit create_physical_mapping(unsigned long start,
bool prev_exec, exec = false;
pgprot_t prot;
int psize;
- unsigned long max_mapping_size = radix_mem_block_size;
+ unsigned long max_mapping_size = memory_block_size;
if (debug_pagealloc_enabled_or_kfence())
max_mapping_size = PAGE_SIZE;
@@ -502,58 +501,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
return 1;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int __init probe_memory_block_size(unsigned long node, const char *uname, int
- depth, void *data)
-{
- unsigned long *mem_block_size = (unsigned long *)data;
- const __be32 *prop;
- int len;
-
- if (depth != 1)
- return 0;
-
- if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
- return 0;
-
- prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
-
- if (!prop || len < dt_root_size_cells * sizeof(__be32))
- /*
- * Nothing in the device tree
- */
- *mem_block_size = MIN_MEMORY_BLOCK_SIZE;
- else
- *mem_block_size = of_read_number(prop, dt_root_size_cells);
- return 1;
-}
-
-static unsigned long __init radix_memory_block_size(void)
-{
- unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-
- /*
- * OPAL firmware feature is set by now. Hence we are ok
- * to test OPAL feature.
- */
- if (firmware_has_feature(FW_FEATURE_OPAL))
- mem_block_size = 1UL * 1024 * 1024 * 1024;
- else
- of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
-
- return mem_block_size;
-}
-
-#else /* CONFIG_MEMORY_HOTPLUG */
-
-static unsigned long __init radix_memory_block_size(void)
-{
- return 1UL * 1024 * 1024 * 1024;
-}
-
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-
void __init radix__early_init_devtree(void)
{
int rc;
@@ -577,16 +524,6 @@ void __init radix__early_init_devtree(void)
mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
psize_to_rpti_pgsize(MMU_PAGE_64K);
}
-
- /*
- * Max mapping size used when mapping pages. We don't use
- * ppc_md.memory_block_size() here because this get called
- * early and we don't have machine probe called yet. Also
- * the pseries implementation only check for ibm,lmb-size.
- * All hypervisor supporting radix do expose that device
- * tree node.
- */
- radix_mem_block_size = radix_memory_block_size();
return;
}
@@ -601,17 +538,6 @@ void __init radix__early_init_mmu(void)
#else
mmu_virtual_psize = MMU_PAGE_4K;
#endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* vmemmap mapping */
- if (mmu_psize_defs[MMU_PAGE_2M].shift) {
- /*
- * map vmemmap using 2M if available
- */
- mmu_vmemmap_psize = MMU_PAGE_2M;
- } else
- mmu_vmemmap_psize = mmu_virtual_psize;
-#endif
#endif
/*
* initialize page table size
@@ -744,8 +670,58 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
p4d_clear(p4d);
}
-static void remove_pte_table(pte_t *pte_start, unsigned long addr,
- unsigned long end, bool direct)
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+ return !vmemmap_populated(start, PMD_SIZE);
+}
+
+static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
+
+ return !vmemmap_populated(start, PAGE_SIZE);
+
+}
+#endif
+
+static void __meminit free_vmemmap_pages(struct page *page,
+ struct vmem_altmap *altmap,
+ int order)
+{
+ unsigned int nr_pages = 1 << order;
+
+ if (altmap) {
+ unsigned long alt_start, alt_end;
+ unsigned long base_pfn = page_to_pfn(page);
+
+ /*
+ * with 2M vmemmap mmaping we can have things setup
+ * such that even though atlmap is specified we never
+ * used altmap.
+ */
+ alt_start = altmap->base_pfn;
+ alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
+
+ if (base_pfn >= alt_start && base_pfn < alt_end) {
+ vmem_altmap_free(altmap, nr_pages);
+ return;
+ }
+ }
+
+ if (PageReserved(page)) {
+ /* allocated from memblock */
+ while (nr_pages--)
+ free_reserved_page(page++);
+ } else
+ free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long next, pages = 0;
pte_t *pte;
@@ -759,24 +735,26 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
if (!pte_present(*pte))
continue;
- if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
- /*
- * The vmemmap_free() and remove_section_mapping()
- * codepaths call us with aligned addresses.
- */
- WARN_ONCE(1, "%s: unaligned range\n", __func__);
- continue;
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+ if (!direct)
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ pages++;
}
-
- pte_clear(&init_mm, addr, pte);
- pages++;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_page_is_unused(addr, next)) {
+ free_vmemmap_pages(pte_page(*pte), altmap, 0);
+ pte_clear(&init_mm, addr, pte);
+ }
+#endif
}
if (direct)
update_page_count(mmu_virtual_psize, -pages);
}
static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
- unsigned long end, bool direct)
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long next, pages = 0;
pte_t *pte_base;
@@ -790,18 +768,24 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
continue;
if (pmd_is_leaf(*pmd)) {
- if (!IS_ALIGNED(addr, PMD_SIZE) ||
- !IS_ALIGNED(next, PMD_SIZE)) {
- WARN_ONCE(1, "%s: unaligned range\n", __func__);
- continue;
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ pages++;
}
- pte_clear(&init_mm, addr, (pte_t *)pmd);
- pages++;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
+ free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
+ }
+#endif
continue;
}
pte_base = (pte_t *)pmd_page_vaddr(*pmd);
- remove_pte_table(pte_base, addr, next, direct);
+ remove_pte_table(pte_base, addr, next, direct, altmap);
free_pte_table(pte_base, pmd);
}
if (direct)
@@ -809,7 +793,8 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
}
static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
- unsigned long end, bool direct)
+ unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long next, pages = 0;
pmd_t *pmd_base;
@@ -834,15 +819,16 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
}
pmd_base = pud_pgtable(*pud);
- remove_pmd_table(pmd_base, addr, next, direct);
+ remove_pmd_table(pmd_base, addr, next, direct, altmap);
free_pmd_table(pmd_base, pud);
}
if (direct)
update_page_count(MMU_PAGE_1G, -pages);
}
-static void __meminit remove_pagetable(unsigned long start, unsigned long end,
- bool direct)
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
{
unsigned long addr, next;
pud_t *pud_base;
@@ -871,7 +857,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end,
}
pud_base = p4d_pgtable(*p4d);
- remove_pud_table(pud_base, addr, next, direct);
+ remove_pud_table(pud_base, addr, next, direct, altmap);
free_pud_table(pud_base, p4d);
}
@@ -894,7 +880,7 @@ int __meminit radix__create_section_mapping(unsigned long start,
int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
- remove_pagetable(start, end, true);
+ remove_pagetable(start, end, true, NULL);
return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -926,10 +912,429 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
return 0;
}
+
+bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+{
+ if (radix_enabled())
+ return __vmemmap_can_optimize(altmap, pgmap);
+
+ return false;
+}
+
+int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
+ unsigned long addr, unsigned long next)
+{
+ int large = pmd_large(*pmdp);
+
+ if (large)
+ vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
+
+ return large;
+}
+
+void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+ pte_t entry;
+ pte_t *ptep = pmdp_ptep(pmdp);
+
+ VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, ptep, entry);
+ asm volatile("ptesync": : :"memory");
+
+ vmemmap_verify(ptep, node, addr, next);
+}
+
+static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
+ int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pte_t *pte = pte_offset_kernel(pmdp, addr);
+
+ if (pte_none(*pte)) {
+ pte_t entry;
+ void *p;
+
+ if (!reuse) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
+ altmap = NULL;
+
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+ if (!p && altmap)
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
+ if (!p)
+ return NULL;
+ pr_debug("PAGE_SIZE vmemmap mapping\n");
+ } else {
+ /*
+ * When a PTE/PMD entry is freed from the init_mm
+ * there's a free_pages() call to this page allocated
+ * above. Thus this get_page() is paired with the
+ * put_page_testzero() on the freeing path.
+ * This can only called by certain ZONE_DEVICE path,
+ * and through vmemmap_populate_compound_pages() when
+ * slab is available.
+ */
+ get_page(reuse);
+ p = page_to_virt(reuse);
+ pr_debug("Tail page reuse vmemmap mapping\n");
+ }
+
+ VM_BUG_ON(!PAGE_ALIGNED(addr));
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ asm volatile("ptesync": : :"memory");
+ }
+ return pte;
+}
+
+static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
+ unsigned long address)
+{
+ pud_t *pud;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(p4d_none(*p4dp))) {
+ if (unlikely(!slab_is_available())) {
+ pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ p4d_populate(&init_mm, p4dp, pud);
+ /* go to the pud_offset */
+ } else
+ return pud_alloc(&init_mm, p4dp, address);
+ }
+ return pud_offset(p4dp, address);
+}
+
+static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
+ unsigned long address)
+{
+ pmd_t *pmd;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(pud_none(*pudp))) {
+ if (unlikely(!slab_is_available())) {
+ pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pud_populate(&init_mm, pudp, pmd);
+ } else
+ return pmd_alloc(&init_mm, pudp, address);
+ }
+ return pmd_offset(pudp, address);
+}
+
+static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
+ unsigned long address)
+{
+ pte_t *pte;
+
+ /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */
+ if (unlikely(pmd_none(*pmdp))) {
+ if (unlikely(!slab_is_available())) {
+ pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+ pmd_populate(&init_mm, pmdp, pte);
+ } else
+ return pte_alloc_kernel(pmdp, address);
+ }
+ return pte_offset_kernel(pmdp, address);
+}
+
+
+
+int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_none(READ_ONCE(*pmd))) {
+ void *p;
+
+ /*
+ * keep it simple by checking addr PMD_SIZE alignment
+ * and verifying the device boundary condition.
+ * For us to use a pmd mapping, both addr and pfn should
+ * be aligned. We skip if addr is not aligned and for
+ * pfn we hope we have extra area in the altmap that
+ * can help to find an aligned block. This can result
+ * in altmap block allocation failures, in which case
+ * we fallback to RAM for vmemmap allocation.
+ */
+ if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
+ altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
+ /*
+ * make sure we don't create altmap mappings
+ * covering things outside the device.
+ */
+ goto base_mapping;
+ }
+
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+ if (p) {
+ vmemmap_set_pmd(pmd, p, node, addr, next);
+ pr_debug("PMD_SIZE vmemmap mapping\n");
+ continue;
+ } else if (altmap) {
+ /*
+ * A vmemmap block allocation can fail due to
+ * alignment requirements and we trying to align
+ * things aggressively there by running out of
+ * space. Try base mapping on failure.
+ */
+ goto base_mapping;
+ }
+ } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
+ /*
+ * If a huge mapping exist due to early call to
+ * vmemmap_populate, let's try to use that.
+ */
+ continue;
+ }
+base_mapping:
+ /*
+ * Not able allocate higher order memory to back memmap
+ * or we found a pointer to pte page. Allocate base page
+ * size vmemmap
+ */
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+ next = addr + PAGE_SIZE;
+ }
+ return 0;
+}
+
+static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return NULL;
+ radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ return pte;
+}
+
+static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
+ unsigned long pfn_offset, int node)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long map_addr;
+
+ /* the second vmemmap page which we use for duplication */
+ map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
+ pgd = pgd_offset_k(map_addr);
+ p4d = p4d_offset(pgd, map_addr);
+ pud = vmemmap_pud_alloc(p4d, node, map_addr);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_alloc(pud, node, map_addr);
+ if (!pmd)
+ return NULL;
+ if (pmd_leaf(*pmd))
+ /*
+ * The second page is mapped as a hugepage due to a nearby request.
+ * Force our mapping to page size without deduplication
+ */
+ return NULL;
+ pte = vmemmap_pte_alloc(pmd, node, map_addr);
+ if (!pte)
+ return NULL;
+ /*
+ * Check if there exist a mapping to the left
+ */
+ if (pte_none(*pte)) {
+ /*
+ * Populate the head page vmemmap page.
+ * It can fall in different pmd, hence
+ * vmemmap_populate_address()
+ */
+ pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ /*
+ * Populate the tail pages vmemmap page
+ */
+ pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
+ if (!pte)
+ return NULL;
+ vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
+ return pte;
+ }
+ return pte;
+}
+
+int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+ unsigned long start,
+ unsigned long end, int node,
+ struct dev_pagemap *pgmap)
+{
+ /*
+ * we want to map things as base page size mapping so that
+ * we can save space in vmemmap. We could have huge mapping
+ * covering out both edges.
+ */
+ unsigned long addr;
+ unsigned long addr_pfn = start_pfn;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (addr = start; addr < end; addr = next) {
+
+ pgd = pgd_offset_k(addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = vmemmap_pud_alloc(p4d, node, addr);
+ if (!pud)
+ return -ENOMEM;
+ pmd = vmemmap_pmd_alloc(pud, node, addr);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pmd_leaf(READ_ONCE(*pmd))) {
+ /* existing huge mapping. Skip the range */
+ addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
+ next = pmd_addr_end(addr, end);
+ continue;
+ }
+ pte = vmemmap_pte_alloc(pmd, node, addr);
+ if (!pte)
+ return -ENOMEM;
+ if (!pte_none(*pte)) {
+ /*
+ * This could be because we already have a compound
+ * page whose VMEMMAP_RESERVE_NR pages were mapped and
+ * this request fall in those pages.
+ */
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ } else {
+ unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+ unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
+ pte_t *tail_page_pte;
+
+ /*
+ * if the address is aligned to huge page size it is the
+ * head mapping.
+ */
+ if (pfn_offset == 0) {
+ /* Populate the head page vmemmap page */
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ /*
+ * Populate the tail pages vmemmap page
+ * It can fall in different pmd, hence
+ * vmemmap_populate_address()
+ */
+ pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+
+ addr_pfn += 2;
+ next = addr + 2 * PAGE_SIZE;
+ continue;
+ }
+ /*
+ * get the 2nd mapping details
+ * Also create it if that doesn't exist
+ */
+ tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
+ if (!tail_page_pte) {
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+
+ pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
+ if (!pte)
+ return -ENOMEM;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
+ continue;
+ }
+ }
+ return 0;
+}
+
+
#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
- remove_pagetable(start, start + page_size, false);
+ remove_pagetable(start, start + page_size, true, NULL);
+}
+
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ remove_pagetable(start, end, false, altmap);
}
#endif
#endif
@@ -962,7 +1367,24 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
#endif
old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
- trace_hugepage_update(addr, old, clr, set);
+ trace_hugepage_update_pmd(addr, old, clr, set);
+
+ return old;
+}
+
+unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, unsigned long clr,
+ unsigned long set)
+{
+ unsigned long old;
+
+#ifdef CONFIG_DEBUG_VM
+ WARN_ON(!pud_devmap(*pudp));
+ assert_spin_locked(pud_lockptr(mm, pudp));
+#endif
+
+ old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
+ trace_hugepage_update_pud(addr, old, clr, set);
return old;
}
@@ -1043,6 +1465,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
return old_pmd;
}
+pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t old_pud;
+ unsigned long old;
+
+ old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
+ old_pud = __pud(old);
+ return old_pud;
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 0bd4866d9824..39acc2cbab4c 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -127,21 +127,6 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
-static __always_inline void __tlbie_pid_lpid(unsigned long pid,
- unsigned long lpid,
- unsigned long ric)
-{
- unsigned long rb, rs, prs, r;
-
- rb = PPC_BIT(53); /* IS = 1 */
- rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(0, 0, rb, rs, ric, prs, r);
-}
static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -202,23 +187,6 @@ static __always_inline void __tlbie_va(unsigned long va, unsigned long pid,
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
-static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
- unsigned long lpid,
- unsigned long ap, unsigned long ric)
-{
- unsigned long rb, rs, prs, r;
-
- rb = va & ~(PPC_BITMASK(52, 63));
- rb |= ap << PPC_BITLSHIFT(58);
- rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
- prs = 1; /* process scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(0, 0, rb, rs, ric, prs, r);
-}
-
static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap, unsigned long ric)
{
@@ -264,22 +232,6 @@ static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
}
}
-static inline void fixup_tlbie_va_range_lpid(unsigned long va,
- unsigned long pid,
- unsigned long lpid,
- unsigned long ap)
-{
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
- }
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
- }
-}
-
static inline void fixup_tlbie_pid(unsigned long pid)
{
/*
@@ -299,26 +251,6 @@ static inline void fixup_tlbie_pid(unsigned long pid)
}
}
-static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
-{
- /*
- * We can use any address for the invalidation, pick one which is
- * probably unused as an optimisation.
- */
- unsigned long va = ((1UL << 52) - 1);
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
- }
-
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
- asm volatile("ptesync" : : : "memory");
- __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
- RIC_FLUSH_TLB);
- }
-}
-
static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap)
{
@@ -416,31 +348,6 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
-static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
- unsigned long ric)
-{
- asm volatile("ptesync" : : : "memory");
-
- /*
- * Workaround the fact that the "ric" argument to __tlbie_pid
- * must be a compile-time contraint to match the "i" constraint
- * in the asm statement.
- */
- switch (ric) {
- case RIC_FLUSH_TLB:
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
- fixup_tlbie_pid_lpid(pid, lpid);
- break;
- case RIC_FLUSH_PWC:
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
- break;
- case RIC_FLUSH_ALL:
- default:
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
- fixup_tlbie_pid_lpid(pid, lpid);
- }
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-}
struct tlbiel_pid {
unsigned long pid;
unsigned long ric;
@@ -566,20 +473,6 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end,
fixup_tlbie_va_range(addr - page_size, pid, ap);
}
-static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
- unsigned long pid, unsigned long lpid,
- unsigned long page_size,
- unsigned long psize)
-{
- unsigned long addr;
- unsigned long ap = mmu_get_ap(psize);
-
- for (addr = start; addr < end; addr += page_size)
- __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
-
- fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
-}
-
static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
unsigned long psize, unsigned long ric)
{
@@ -660,18 +553,6 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
-static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
- unsigned long pid, unsigned long lpid,
- unsigned long page_size,
- unsigned long psize, bool also_pwc)
-{
- asm volatile("ptesync" : : : "memory");
- if (also_pwc)
- __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
- __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-}
-
static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
@@ -820,7 +701,7 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
* that's what the caller expects.
*/
if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {
- atomic_dec(&mm->context.active_cpus);
+ dec_mm_active_cpus(mm);
cpumask_clear_cpu(cpu, mm_cpumask(mm));
always_flush = true;
}
@@ -987,6 +868,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
}
}
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -1020,6 +902,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
}
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
void radix__flush_all_mm(struct mm_struct *mm)
@@ -1228,6 +1111,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
}
out:
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
@@ -1313,7 +1197,35 @@ void radix__tlb_flush(struct mmu_gather *tlb)
* See the comment for radix in arch_exit_mmap().
*/
if (tlb->fullmm) {
- __flush_all_mm(mm, true);
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * Shootdown based lazy tlb mm refcounting means we
+ * have to IPI everyone in the mm_cpumask anyway soon
+ * when the mm goes away, so might as well do it as
+ * part of the final flush now.
+ *
+ * If lazy shootdown was improved to reduce IPIs (e.g.,
+ * by batching), then it may end up being better to use
+ * tlbies here instead.
+ */
+ preempt_disable();
+
+ smp_mb(); /* see radix__flush_tlb_mm */
+ exit_flush_lazy_tlbs(mm);
+ _tlbiel_pid(mm->context.id, RIC_FLUSH_ALL);
+
+ /*
+ * It should not be possible to have coprocessors still
+ * attached here.
+ */
+ if (WARN_ON_ONCE(atomic_read(&mm->context.copros) > 0))
+ __flush_all_mm(mm, true);
+
+ preempt_enable();
+ } else {
+ __flush_all_mm(mm, true);
+ }
+
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
if (!tlb->freed_tables)
radix__flush_tlb_mm(mm);
@@ -1392,6 +1304,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
}
out:
preempt_enable();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
@@ -1461,6 +1374,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
}
EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
+}
+EXPORT_SYMBOL(radix__flush_pud_tlb_range);
+
void radix__flush_tlb_all(void)
{
unsigned long rb,prs,r,rs;
@@ -1486,6 +1406,127 @@ void radix__flush_tlb_all(void)
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static __always_inline void __tlbie_pid_lpid(unsigned long pid,
+ unsigned long lpid,
+ unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
+{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
+ RIC_FLUSH_TLB);
+ }
+}
+
+static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
+ unsigned long ric)
+{
+ asm volatile("ptesync" : : : "memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ }
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
+static inline void fixup_tlbie_va_range_lpid(unsigned long va,
+ unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
+
+ fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
+}
+
+static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync" : : : "memory");
+ if (also_pwc)
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
/*
* Performs process-scoped invalidations for a given LPID
* as part of H_RPT_INVALIDATE hcall.
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index 6956f637a38c..f2708c8629a5 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -13,6 +13,7 @@
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/paca.h>
+#include <asm/lppaca.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c
index 0e9b4879c0f9..15189592da09 100644
--- a/arch/powerpc/mm/cacheflush.c
+++ b/arch/powerpc/mm/cacheflush.c
@@ -148,44 +148,31 @@ static void __flush_dcache_icache(void *p)
invalidate_icache_range(addr, addr + PAGE_SIZE);
}
-static void flush_dcache_icache_hugepage(struct page *page)
+void flush_dcache_icache_folio(struct folio *folio)
{
- int i;
- int nr = compound_nr(page);
+ unsigned int i, nr = folio_nr_pages(folio);
- if (!PageHighMem(page)) {
+ if (flush_coherent_icache())
+ return;
+
+ if (!folio_test_highmem(folio)) {
+ void *addr = folio_address(folio);
for (i = 0; i < nr; i++)
- __flush_dcache_icache(lowmem_page_address(page + i));
- } else {
+ __flush_dcache_icache(addr + i * PAGE_SIZE);
+ } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) {
for (i = 0; i < nr; i++) {
- void *start = kmap_local_page(page + i);
+ void *start = kmap_local_folio(folio, i * PAGE_SIZE);
__flush_dcache_icache(start);
kunmap_local(start);
}
- }
-}
-
-void flush_dcache_icache_page(struct page *page)
-{
- if (flush_coherent_icache())
- return;
-
- if (PageCompound(page))
- return flush_dcache_icache_hugepage(page);
-
- if (!PageHighMem(page)) {
- __flush_dcache_icache(lowmem_page_address(page));
- } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) {
- void *start = kmap_local_page(page);
-
- __flush_dcache_icache(start);
- kunmap_local(start);
} else {
- flush_dcache_icache_phys(page_to_phys(page));
+ unsigned long pfn = folio_pfn(folio);
+ for (i = 0; i < nr; i++)
+ flush_dcache_icache_phys((pfn + i) * PAGE_SIZE);
}
}
-EXPORT_SYMBOL(flush_dcache_icache_page);
+EXPORT_SYMBOL(flush_dcache_icache_folio);
void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
{
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 5bfdf6ecfa96..b1723094d464 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -469,7 +469,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (is_exec)
flags |= FAULT_FLAG_INSTRUCTION;
-#ifdef CONFIG_PER_VMA_LOCK
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
@@ -489,7 +488,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
@@ -501,7 +501,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
return user_mode(regs) ? 0 : SIGBUS;
lock_mmap:
-#endif /* CONFIG_PER_VMA_LOCK */
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
@@ -551,9 +550,7 @@ retry:
mmap_read_unlock(current->mm);
-#ifdef CONFIG_PER_VMA_LOCK
done:
-#endif
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index d4cc3749e621..d8adc452f431 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -126,6 +126,8 @@ void __init MMU_init(void)
setup_kup();
+ update_mmu_feature_fixups(MMU_FTR_KUAP);
+
/* Shortly after that, the entire linear mapping will be available */
memblock_set_current_limit(lowmem_end_addr);
}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 0ec5b45b1e86..d96bbc001e73 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -40,6 +40,7 @@
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
#include <linux/memremap.h>
+#include <linux/memory.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
@@ -92,7 +93,7 @@ static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_ad
* a page table lookup here because with the hash translation we don't keep
* vmemmap details in linux page table.
*/
-static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
+int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
{
struct page *start;
unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size;
@@ -183,8 +184,8 @@ static __meminit int vmemmap_list_populate(unsigned long phys,
return 0;
}
-static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
- unsigned long page_size)
+bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+ unsigned long page_size)
{
unsigned long nr_pfn = page_size / sizeof(struct page);
unsigned long start_pfn = page_to_pfn((struct page *)start);
@@ -198,8 +199,8 @@ static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long star
return false;
}
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
- struct vmem_altmap *altmap)
+static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
{
bool altmap_alloc;
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
@@ -272,6 +273,18 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
return 0;
}
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (radix_enabled())
+ return radix__vmemmap_populate(start, end, node, altmap);
+#endif
+
+ return __vmemmap_populate(start, end, node, altmap);
+}
+
#ifdef CONFIG_MEMORY_HOTPLUG
static unsigned long vmemmap_list_free(unsigned long start)
{
@@ -303,8 +316,8 @@ static unsigned long vmemmap_list_free(unsigned long start)
return vmem_back->phys;
}
-void __ref vmemmap_free(unsigned long start, unsigned long end,
- struct vmem_altmap *altmap)
+static void __ref __vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
{
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
unsigned long page_order = get_order(page_size);
@@ -361,6 +374,17 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
vmemmap_remove_mapping(start, page_size);
}
}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (radix_enabled())
+ return radix__vmemmap_free(start, end, altmap);
+#endif
+ return __vmemmap_free(start, end, altmap);
+}
+
#endif
void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
@@ -470,6 +494,130 @@ static int __init dt_scan_mmu_pid_width(unsigned long node,
return 1;
}
+/*
+ * Outside hotplug the kernel uses this value to map the kernel direct map
+ * with radix. To be compatible with older kernels, let's keep this value
+ * as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map
+ * things with 1GB size in the case where we don't support hotplug.
+ */
+#ifndef CONFIG_MEMORY_HOTPLUG
+#define DEFAULT_MEMORY_BLOCK_SIZE SZ_16M
+#else
+#define DEFAULT_MEMORY_BLOCK_SIZE MIN_MEMORY_BLOCK_SIZE
+#endif
+
+static void update_memory_block_size(unsigned long *block_size, unsigned long mem_size)
+{
+ unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE;
+
+ for (; *block_size > min_memory_block_size; *block_size >>= 2) {
+ if ((mem_size & *block_size) == 0)
+ break;
+ }
+}
+
+static int __init probe_memory_block_size(unsigned long node, const char *uname, int
+ depth, void *data)
+{
+ const char *type;
+ unsigned long *block_size = (unsigned long *)data;
+ const __be32 *reg, *endp;
+ int l;
+
+ if (depth != 1)
+ return 0;
+ /*
+ * If we have dynamic-reconfiguration-memory node, use the
+ * lmb value.
+ */
+ if (strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
+
+ const __be32 *prop;
+
+ prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l);
+
+ if (!prop || l < dt_root_size_cells * sizeof(__be32))
+ /*
+ * Nothing in the device tree
+ */
+ *block_size = DEFAULT_MEMORY_BLOCK_SIZE;
+ else
+ *block_size = of_read_number(prop, dt_root_size_cells);
+ /*
+ * We have found the final value. Don't probe further.
+ */
+ return 1;
+ }
+ /*
+ * Find all the device tree nodes of memory type and make sure
+ * the area can be mapped using the memory block size value
+ * we end up using. We start with 1G value and keep reducing
+ * it such that we can map the entire area using memory_block_size.
+ * This will be used on powernv and older pseries that don't
+ * have ibm,lmb-size node.
+ * For ex: with P5 we can end up with
+ * memory@0 -> 128MB
+ * memory@128M -> 64M
+ * This will end up using 64MB memory block size value.
+ */
+ type = of_get_flat_dt_prop(node, "device_type", NULL);
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
+ if (!reg)
+ reg = of_get_flat_dt_prop(node, "reg", &l);
+ if (!reg)
+ return 0;
+
+ endp = reg + (l / sizeof(__be32));
+ while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
+ const char *compatible;
+ u64 size;
+
+ dt_mem_next_cell(dt_root_addr_cells, &reg);
+ size = dt_mem_next_cell(dt_root_size_cells, &reg);
+
+ if (size) {
+ update_memory_block_size(block_size, size);
+ continue;
+ }
+ /*
+ * ibm,coherent-device-memory with linux,usable-memory = 0
+ * Force 256MiB block size. Work around for GPUs on P9 PowerNV
+ * linux,usable-memory == 0 implies driver managed memory and
+ * we can't use large memory block size due to hotplug/unplug
+ * limitations.
+ */
+ compatible = of_get_flat_dt_prop(node, "compatible", NULL);
+ if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) {
+ if (*block_size > SZ_256M)
+ *block_size = SZ_256M;
+ /*
+ * We keep 256M as the upper limit with GPU present.
+ */
+ return 0;
+ }
+ }
+ /* continue looking for other memory device types */
+ return 0;
+}
+
+/*
+ * start with 1G memory block size. Early init will
+ * fix this with correct value.
+ */
+unsigned long memory_block_size __ro_after_init = 1UL << 30;
+static void __init early_init_memory_block_size(void)
+{
+ /*
+ * We need to do memory_block_size probe early so that
+ * radix__early_init_mmu() can use this as limit for
+ * mapping page size.
+ */
+ of_scan_flat_dt(probe_memory_block_size, &memory_block_size);
+}
+
void __init mmu_early_init_devtree(void)
{
bool hvmode = !!(mfmsr() & MSR_HV);
@@ -503,6 +651,8 @@ void __init mmu_early_init_devtree(void)
if (!hvmode)
early_check_vec5();
+ early_init_memory_block_size();
+
if (early_radix_enabled()) {
radix__early_init_devtree();
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index 4f12504fb405..705e8e8ffde4 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -41,7 +41,7 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
return __ioremap_caller(addr, size, prot, caller);
}
-void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags)
{
pte_t pte = __pte(flags);
void *caller = __builtin_return_address(0);
@@ -74,27 +74,3 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa,
return 0;
}
-
-void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
- pgprot_t prot, void *caller)
-{
- struct vm_struct *area;
- int ret;
- unsigned long va;
-
- area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller);
- if (area == NULL)
- return NULL;
-
- area->phys_addr = pa;
- va = (unsigned long)area->addr;
-
- ret = ioremap_page_range(va, va + size, pa, prot);
- if (!ret)
- return (void __iomem *)area->addr + offset;
-
- vunmap_range(va, va + size);
- free_vm_area(area);
-
- return NULL;
-}
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
index 9d13143b8be4..ca5bc6be3e6f 100644
--- a/arch/powerpc/mm/ioremap_32.c
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -22,6 +22,13 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
int err;
/*
+ * If the address lies within the first 16 MB, assume it's in ISA
+ * memory space
+ */
+ if (addr < SZ_16M)
+ addr += _ISA_MEM_BASE;
+
+ /*
* Choose an address to map it to.
* Once the vmalloc system is running, we use it.
* Before then, we use space going down from IOREMAP_TOP
@@ -31,13 +38,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
offset = addr & ~PAGE_MASK;
size = PAGE_ALIGN(addr + size) - p;
- /*
- * If the address lies within the first 16 MB, assume it's in ISA
- * memory space
- */
- if (p < 16 * 1024 * 1024)
- p += _ISA_MEM_BASE;
-
#ifndef CONFIG_CRASH_DUMP
/*
* Don't allow anybody to remap normal RAM that we're using.
@@ -63,7 +63,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call
return (void __iomem *)v + offset;
if (slab_is_available())
- return do_ioremap(p, offset, size, prot, caller);
+ return generic_ioremap_prot(addr, size, prot);
/*
* Should check if it is a candidate for a BAT mapping
@@ -87,7 +87,6 @@ void iounmap(volatile void __iomem *addr)
if (v_block_mapped((unsigned long)addr))
return;
- if (addr > high_memory && (unsigned long)addr < ioremap_bot)
- vunmap((void *)(PAGE_MASK & (unsigned long)addr));
+ generic_iounmap(addr);
}
EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
index 3acece00b33e..d24e5f166723 100644
--- a/arch/powerpc/mm/ioremap_64.c
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -29,7 +29,7 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
return NULL;
if (slab_is_available())
- return do_ioremap(paligned, offset, size, prot, caller);
+ return generic_ioremap_prot(addr, size, prot);
pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller);
@@ -49,17 +49,9 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
*/
void iounmap(volatile void __iomem *token)
{
- void *addr;
-
if (!slab_is_available())
return;
- addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK);
-
- if ((unsigned long)addr < ioremap_bot) {
- pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr);
- return;
- }
- vunmap(addr);
+ generic_iounmap(PCI_FIX_ADDR(token));
}
EXPORT_SYMBOL(iounmap);
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 1fb9c99f8679..b24c19078eb1 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -43,11 +43,13 @@ static inline void switch_mm_pgdir(struct task_struct *tsk,
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
+ int cpu = smp_processor_id();
bool new_on_cpu = false;
/* Mark this context has been used on the new CPU */
- if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
- cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
+ if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ VM_WARN_ON_ONCE(next == &init_mm);
+ cpumask_set_cpu(cpu, mm_cpumask(next));
inc_mm_active_cpus(next);
/*
@@ -100,6 +102,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* sub architectures. Out of line for now
*/
switch_mmu_context(prev, next, tsk);
+
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(prev)));
}
#ifndef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index c6dccb4f06dc..7f9ff0640124 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -110,6 +110,7 @@ extern void MMU_init_hw(void);
void MMU_init_hw_patch(void);
unsigned long mmu_mapin_ram(unsigned long base, unsigned long top);
#endif
+void mmu_init_secondary(int cpu);
#ifdef CONFIG_PPC_E500
extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx,
diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c
index 58c8d9849cb1..6b30e40d4590 100644
--- a/arch/powerpc/mm/nohash/e500_hugetlbpage.c
+++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c
@@ -178,7 +178,8 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte)
*
* This must always be called with the pte lock held.
*/
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
+void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, unsigned int nr)
{
if (is_vm_hugetlb_page(vma))
book3e_hugetlb_preload(vma, address, *ptep);
diff --git a/arch/powerpc/mm/nohash/kup.c b/arch/powerpc/mm/nohash/kup.c
index 552becf90e97..e1f7de2e54ec 100644
--- a/arch/powerpc/mm/nohash/kup.c
+++ b/arch/powerpc/mm/nohash/kup.c
@@ -5,7 +5,6 @@
#include <linux/export.h>
#include <linux/init.h>
-#include <linux/jump_label.h>
#include <linux/printk.h>
#include <linux/smp.h>
@@ -13,21 +12,18 @@
#include <asm/smp.h>
#ifdef CONFIG_PPC_KUAP
-struct static_key_false disable_kuap_key;
-EXPORT_SYMBOL(disable_kuap_key);
-
void setup_kuap(bool disabled)
{
if (disabled) {
if (IS_ENABLED(CONFIG_40x))
disable_kuep = true;
if (smp_processor_id() == boot_cpuid)
- static_branch_enable(&disable_kuap_key);
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_KUAP;
return;
}
pr_info("Activating Kernel Userspace Access Protection\n");
- __prevent_user_access(KUAP_READ_WRITE);
+ prevent_user_access(KUAP_READ_WRITE);
}
#endif
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index a903b308acc5..5ffa0af4328a 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -318,17 +318,6 @@ EXPORT_SYMBOL(flush_tlb_page);
#endif /* CONFIG_SMP */
-#ifdef CONFIG_PPC_47x
-void __init early_init_mmu_47x(void)
-{
-#ifdef CONFIG_SMP
- unsigned long root = of_get_flat_dt_root();
- if (of_get_flat_dt_prop(root, "cooperative-partition", NULL))
- mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST);
-#endif /* CONFIG_SMP */
-}
-#endif /* CONFIG_PPC_47x */
-
/*
* Flush kernel TLB entries in the given range
*/
@@ -746,8 +735,10 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
#else /* ! CONFIG_PPC64 */
void __init early_init_mmu(void)
{
-#ifdef CONFIG_PPC_47x
- early_init_mmu_47x();
-#endif
+ unsigned long root = of_get_flat_dt_root();
+
+ if (IS_ENABLED(CONFIG_PPC_47x) && IS_ENABLED(CONFIG_SMP) &&
+ of_get_flat_dt_prop(root, "cooperative-partition", NULL))
+ mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST);
}
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 9f73d089eac1..f6c4ace3b221 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -34,6 +34,7 @@
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
+#include <asm/vphn.h>
#include <asm/drmem.h>
static int numa_enabled = 1;
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index 20652daa1d7e..8c31802f97e8 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -18,15 +18,15 @@
void pte_frag_destroy(void *pte_frag)
{
int count;
- struct page *page;
+ struct ptdesc *ptdesc;
- page = virt_to_page(pte_frag);
+ ptdesc = virt_to_ptdesc(pte_frag);
/* drop all the pending references */
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
/* We allow PTE_FRAG_NR fragments from a PTE page */
- if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) {
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) {
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
}
@@ -55,25 +55,25 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm)
static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
{
void *ret = NULL;
- struct page *page;
+ struct ptdesc *ptdesc;
if (!kernel) {
- page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT);
- if (!page)
+ ptdesc = pagetable_alloc(PGALLOC_GFP | __GFP_ACCOUNT, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
+ if (!pagetable_pte_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
} else {
- page = alloc_page(PGALLOC_GFP);
- if (!page)
+ ptdesc = pagetable_alloc(PGALLOC_GFP, 0);
+ if (!ptdesc)
return NULL;
}
- atomic_set(&page->pt_frag_refcount, 1);
+ atomic_set(&ptdesc->pt_frag_refcount, 1);
- ret = page_address(page);
+ ret = ptdesc_address(ptdesc);
/*
* if we support only one fragment just return the
* allocated page.
@@ -82,12 +82,12 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
return ret;
spin_lock(&mm->page_table_lock);
/*
- * If we find pgtable_page set, we return
+ * If we find ptdesc_page set, we return
* the allocated page with single fragment
* count.
*/
if (likely(!pte_frag_get(&mm->context))) {
- atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR);
+ atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR);
pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE);
}
spin_unlock(&mm->page_table_lock);
@@ -106,17 +106,40 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel)
return __alloc_for_ptecache(mm, kernel);
}
-void pte_fragment_free(unsigned long *table, int kernel)
+static void pte_free_now(struct rcu_head *head)
{
- struct page *page = virt_to_page(table);
+ struct ptdesc *ptdesc;
- if (PageReserved(page))
- return free_reserved_page(page);
+ ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
- BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
- if (atomic_dec_and_test(&page->pt_frag_refcount)) {
- if (!kernel)
- pgtable_pte_page_dtor(page);
- __free_page(page);
+void pte_fragment_free(unsigned long *table, int kernel)
+{
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
+
+ if (pagetable_is_reserved(ptdesc))
+ return free_reserved_ptdesc(ptdesc);
+
+ BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
+ if (kernel)
+ pagetable_free(ptdesc);
+ else if (folio_test_clear_active(ptdesc_folio(ptdesc)))
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
+ else
+ pte_free_now(&ptdesc->pt_rcu_head);
}
}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+ struct page *page;
+
+ page = virt_to_page(pgtable);
+ SetPageActive(page);
+ pte_fragment_free((unsigned long *)pgtable, 0);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index cb2dcdb18f8e..3f86fd217690 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -58,7 +58,7 @@ static inline int pte_looks_normal(pte_t pte)
return 0;
}
-static struct page *maybe_pte_to_page(pte_t pte)
+static struct folio *maybe_pte_to_folio(pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
struct page *page;
@@ -68,7 +68,7 @@ static struct page *maybe_pte_to_page(pte_t pte)
page = pfn_to_page(pfn);
if (PageReserved(page))
return NULL;
- return page;
+ return page_folio(page);
}
#ifdef CONFIG_PPC_BOOK3S
@@ -84,12 +84,12 @@ static pte_t set_pte_filter_hash(pte_t pte)
pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
cpu_has_feature(CPU_FTR_NOEXECUTE))) {
- struct page *pg = maybe_pte_to_page(pte);
- if (!pg)
+ struct folio *folio = maybe_pte_to_folio(pte);
+ if (!folio)
return pte;
- if (!test_bit(PG_dcache_clean, &pg->flags)) {
- flush_dcache_icache_page(pg);
- set_bit(PG_dcache_clean, &pg->flags);
+ if (!test_bit(PG_dcache_clean, &folio->flags)) {
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags);
}
}
return pte;
@@ -107,7 +107,7 @@ static pte_t set_pte_filter_hash(pte_t pte) { return pte; }
*/
static inline pte_t set_pte_filter(pte_t pte)
{
- struct page *pg;
+ struct folio *folio;
if (radix_enabled())
return pte;
@@ -120,18 +120,18 @@ static inline pte_t set_pte_filter(pte_t pte)
return pte;
/* If you set _PAGE_EXEC on weird pages you're on your own */
- pg = maybe_pte_to_page(pte);
- if (unlikely(!pg))
+ folio = maybe_pte_to_folio(pte);
+ if (unlikely(!folio))
return pte;
/* If the page clean, we move on */
- if (test_bit(PG_dcache_clean, &pg->flags))
+ if (test_bit(PG_dcache_clean, &folio->flags))
return pte;
/* If it's an exec fault, we flush the cache and make it clean */
if (is_exec_fault()) {
- flush_dcache_icache_page(pg);
- set_bit(PG_dcache_clean, &pg->flags);
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags);
return pte;
}
@@ -142,7 +142,7 @@ static inline pte_t set_pte_filter(pte_t pte)
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
int dirty)
{
- struct page *pg;
+ struct folio *folio;
if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
return pte;
@@ -168,17 +168,17 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
#endif /* CONFIG_DEBUG_VM */
/* If you set _PAGE_EXEC on weird pages you're on your own */
- pg = maybe_pte_to_page(pte);
- if (unlikely(!pg))
+ folio = maybe_pte_to_folio(pte);
+ if (unlikely(!folio))
goto bail;
/* If the page is already clean, we move on */
- if (test_bit(PG_dcache_clean, &pg->flags))
+ if (test_bit(PG_dcache_clean, &folio->flags))
goto bail;
/* Clean the page and set PG_dcache_clean */
- flush_dcache_icache_page(pg);
- set_bit(PG_dcache_clean, &pg->flags);
+ flush_dcache_icache_folio(folio);
+ set_bit(PG_dcache_clean, &folio->flags);
bail:
return pte_mkexec(pte);
@@ -187,8 +187,8 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
/*
* set_pte stores a linux PTE into the linux page table.
*/
-void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
- pte_t pte)
+void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte, unsigned int nr)
{
/*
* Make sure hardware valid bit is not set. We don't do
@@ -203,7 +203,16 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte = set_pte_filter(pte);
/* Perform the setting of the PTE */
- __set_pte_at(mm, addr, ptep, pte, 0);
+ arch_enter_lazy_mmu_mode();
+ for (;;) {
+ __set_pte_at(mm, addr, ptep, pte, 0);
+ if (--nr == 0)
+ break;
+ ptep++;
+ pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT));
+ addr += PAGE_SIZE;
+ }
+ arch_leave_lazy_mmu_mode();
}
void unmap_kernel_page(unsigned long va)
@@ -311,6 +320,8 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
if (mm == &init_mm)
return;
@@ -329,8 +340,10 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
*/
if (pmd_none(*pmd))
return;
- BUG_ON(!pmd_present(*pmd));
- assert_spin_locked(pte_lockptr(mm, pmd));
+ pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
+ BUG_ON(!pte);
+ assert_spin_locked(ptl);
+ pte_unmap(pte);
}
#endif /* CONFIG_DEBUG_VM */