From eca499ab3749a4537dee77ffead47a1a2c0dee19 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 16 Jul 2019 16:30:31 -0700 Subject: mm/hotplug: make remove_memory() interface usable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Presently the remove_memory() interface is inherently broken. It tries to remove memory but panics if some memory is not offline. The problem is that it is impossible to ensure that all memory blocks are offline as this function also takes lock_device_hotplug that is required to change memory state via sysfs. So, between calling this function and offlining all memory blocks there is always a window when lock_device_hotplug is released, and therefore, there is always a chance for a panic during this window. Make this interface to return an error if memory removal fails. This way it is safe to call this function without panicking machine, and also makes it symmetric to add_memory() which already returns an error. Link: http://lkml.kernel.org/r/20190517215438.6487-3-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: Dave Hansen Cc: Dave Jiang Cc: Fengguang Wu Cc: Huang Ying Cc: James Morris Cc: Jérôme Glisse Cc: Keith Busch Cc: Ross Zwisler Cc: Sasha Levin Cc: Takashi Iwai Cc: Tom Lendacky Cc: Vishal Verma Cc: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ae892eef8b82..988fde33cd7f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -324,7 +324,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); -extern void remove_memory(int nid, u64 start, u64 size); +extern int remove_memory(int nid, u64 start, u64 size); extern void __remove_memory(int nid, u64 start, u64 size); #else @@ -341,7 +341,11 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) return -EINVAL; } -static inline void remove_memory(int nid, u64 start, u64 size) {} +static inline int remove_memory(int nid, u64 start, u64 size) +{ + return -EBUSY; +} + static inline void __remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit From 80ec922dbd87fd38d15719c86a94457204648aeb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:51 -0700 Subject: mm/memory_hotplug: allow arch_remove_memory() without CONFIG_MEMORY_HOTREMOVE We want to improve error handling while adding memory by allowing to use arch_remove_memory() and __remove_pages() even if CONFIG_MEMORY_HOTREMOVE is not set to e.g., implement something like: arch_add_memory() rc = do_something(); if (rc) { arch_remove_memory(); } We won't get rid of CONFIG_MEMORY_HOTREMOVE for now, as it will require quite some dependencies for memory offlining. Link: http://lkml.kernel.org/r/20190527111152.16324-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Pavel Tatashin Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: David Hildenbrand Cc: Oscar Salvador Cc: "Kirill A. Shutemov" Cc: Alex Deucher Cc: "David S. Miller" Cc: Mark Brown Cc: Chris Wilson Cc: Christophe Leroy Cc: Nicholas Piggin Cc: Vasily Gorbik Cc: Rob Herring Cc: Masahiro Yamada Cc: "mike.travis@hpe.com" Cc: Andrew Banman Cc: Arun KS Cc: Qian Cai Cc: Mathieu Malaterre Cc: Baoquan He Cc: Logan Gunthorpe Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Chintan Pandya Cc: Dan Williams Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: Mark Rutland Cc: Mike Rapoport Cc: Oscar Salvador Cc: Robin Murphy Cc: Wei Yang Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 2 -- arch/ia64/mm/init.c | 2 -- arch/powerpc/mm/mem.c | 2 -- arch/s390/mm/init.c | 2 -- arch/sh/mm/init.c | 2 -- arch/x86/mm/init_32.c | 2 -- arch/x86/mm/init_64.c | 2 -- drivers/base/memory.c | 2 -- include/linux/memory.h | 2 -- include/linux/memory_hotplug.h | 2 -- mm/memory_hotplug.c | 2 -- mm/sparse.c | 6 ------ 12 files changed, 28 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a21fa7e1167d..750a69dde39b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1074,7 +1074,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, restrictions); } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -1093,4 +1092,3 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif -#endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d28e29103bdb..aae75fd7b810 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -681,7 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -693,4 +692,3 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif -#endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 26a8da3723bb..9259337d7374 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -125,7 +125,6 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start_pfn, nr_pages, restrictions); } -#ifdef CONFIG_MEMORY_HOTREMOVE void __ref arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -151,7 +150,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, pr_warn("Hash collision while resizing HPT\n"); } #endif -#endif /* CONFIG_MEMORY_HOTPLUG */ #ifndef CONFIG_NEED_MULTIPLE_NODES void __init mem_topology_setup(void) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 5b1ec2f532e0..4e5bbe328594 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -286,7 +286,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return rc; } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -298,5 +297,4 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); vmem_remove_mapping(start, size); } -#endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 13c6a6bb5fd9..dfdbaa50946e 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -429,7 +429,6 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -440,5 +439,4 @@ void arch_remove_memory(int nid, u64 start, u64 size, zone = page_zone(pfn_to_page(start_pfn)); __remove_pages(zone, start_pfn, nr_pages, altmap); } -#endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f265a4316179..4068abb9427f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -860,7 +860,6 @@ int arch_add_memory(int nid, u64 start, u64 size, return __add_pages(nid, start_pfn, nr_pages, restrictions); } -#ifdef CONFIG_MEMORY_HOTREMOVE void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { @@ -872,7 +871,6 @@ void arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif -#endif int kernel_set_to_readonly __read_mostly; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 08bbf648827b..5a289a2ab108 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1198,7 +1198,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, remove_pagetable(start, end, false, altmap); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void __meminit kernel_physical_mapping_remove(unsigned long start, unsigned long end) { @@ -1219,7 +1218,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, __remove_pages(zone, start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); } -#endif #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; diff --git a/drivers/base/memory.c b/drivers/base/memory.c index e0aa7f9abb36..92459d6f12be 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -723,7 +723,6 @@ out: return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE static void unregister_memory(struct memory_block *memory) { @@ -762,7 +761,6 @@ void unregister_memory_section(struct mem_section *section) out_unlock: mutex_unlock(&mem_sysfs_mutex); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ /* return true if the memory block is offlined, otherwise, return false */ bool is_memblock_offlined(struct memory_block *mem) diff --git a/include/linux/memory.h b/include/linux/memory.h index e1dc1bb2b787..474c7c60c8f2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -112,9 +112,7 @@ extern void unregister_memory_notifier(struct notifier_block *nb); extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int hotplug_memory_register(int nid, struct mem_section *section); -#ifdef CONFIG_MEMORY_HOTREMOVE extern void unregister_memory_section(struct mem_section *); -#endif extern int memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern int memory_isolate_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 988fde33cd7f..87bf9c4a889e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -123,12 +123,10 @@ static inline bool movable_node_is_enabled(void) return movable_node_enabled; } -#ifdef CONFIG_MEMORY_HOTREMOVE extern void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap); extern void __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -#endif /* CONFIG_MEMORY_HOTREMOVE */ /* * Do we want sysfs memblock files created. This will allow userspace to online diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a8c25fd85ee3..bc11888d5d7e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -318,7 +318,6 @@ out: return err; } -#ifdef CONFIG_MEMORY_HOTREMOVE /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, @@ -580,7 +579,6 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, set_zone_contiguous(zone); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ int set_online_page_callback(online_page_callback_t callback) { diff --git a/mm/sparse.c b/mm/sparse.c index fd13166949b5..d1d5e05f5b8d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -604,7 +604,6 @@ static void __kfree_section_memmap(struct page *memmap, vmemmap_free(start, end, altmap); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap) { unsigned long start = (unsigned long)memmap; @@ -612,7 +611,6 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #else static struct page *__kmalloc_section_memmap(void) { @@ -651,7 +649,6 @@ static void __kfree_section_memmap(struct page *memmap, get_order(sizeof(struct page) * PAGES_PER_SECTION)); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap) { unsigned long maps_section_nr, removing_section_nr, i; @@ -681,7 +678,6 @@ static void free_map_bootmem(struct page *memmap) put_page_bootmem(page); } } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ /** @@ -746,7 +742,6 @@ out: return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE #ifdef CONFIG_MEMORY_FAILURE static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { @@ -823,5 +818,4 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, PAGES_PER_SECTION - map_offset); free_section_usemap(memmap, usemap, altmap); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit From 05f800a0bd08e14606ac63e0a5c63ed6880acaab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:01 -0700 Subject: mm/memory_hotplug: drop MHP_MEMBLOCK_API No longer needed, the callers of arch_add_memory() can handle this manually. Link: http://lkml.kernel.org/r/20190527111152.16324-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Joonsoo Kim Cc: Qian Cai Cc: Arun KS Cc: Mathieu Malaterre Cc: Mike Rapoport Cc: Alex Deucher Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 8 -------- mm/memory_hotplug.c | 9 +++------ 2 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 87bf9c4a889e..36c514b80cf1 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -128,14 +128,6 @@ extern void arch_remove_memory(int nid, u64 start, u64 size, extern void __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -/* - * Do we want sysfs memblock files created. This will allow userspace to online - * and offline memory explicitly. Lack of this bit means that the caller has to - * call move_pfn_range_to_zone to finish the initialization. - */ - -#define MHP_MEMBLOCK_API (1<<0) - /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_restrictions *restrictions); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 78291526eb4d..fb9dc3fa1138 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -251,7 +251,7 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, - struct vmem_altmap *altmap, bool want_memblock) + struct vmem_altmap *altmap) { int ret; @@ -294,8 +294,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, section_nr_to_pfn(i), altmap, - restrictions->flags & MHP_MEMBLOCK_API); + err = __add_section(nid, section_nr_to_pfn(i), altmap); /* * EEXIST is finally dealt with by ioresource collision @@ -1065,9 +1064,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { - struct mhp_restrictions restrictions = { - .flags = MHP_MEMBLOCK_API, - }; + struct mhp_restrictions restrictions = {}; u64 start, size; bool new_node = false; int ret; -- cgit From b9bf8d342d9b443c0d19aa57883d8ddb38d965de Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:17 -0700 Subject: mm/memory_hotplug: remove "zone" parameter from sparse_remove_one_section The parameter is unused, so let's drop it. Memory removal paths should never care about zones. This is the job of memory offlining and will require more refactorings. Link: http://lkml.kernel.org/r/20190527111152.16324-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Dan Williams Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Alex Deucher Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Arun KS Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Mathieu Malaterre Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Qian Cai Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 2 +- mm/sparse.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 36c514b80cf1..79e0add6a597 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -350,7 +350,7 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_one_section(int nid, unsigned long start_pfn, struct vmem_altmap *altmap); -extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, +extern void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 37c861e7a717..d1d0ceaaca88 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -524,7 +524,7 @@ static void __remove_section(struct zone *zone, struct mem_section *ms, start_pfn = section_nr_to_pfn((unsigned long)scn_nr); __remove_zone(zone, start_pfn); - sparse_remove_one_section(zone, ms, map_offset, altmap); + sparse_remove_one_section(ms, map_offset, altmap); } /** diff --git a/mm/sparse.c b/mm/sparse.c index d1d5e05f5b8d..1552c855d62a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -800,8 +800,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap, free_map_bootmem(memmap); } -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, - unsigned long map_offset, struct vmem_altmap *altmap) +void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, + struct vmem_altmap *altmap) { struct page *memmap = NULL; unsigned long *usemap = NULL; -- cgit From fbcf73ce65827c3d8935f38b832a43153a0c78d1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:46 -0700 Subject: mm/memory_hotplug: rename walk_memory_range() and pass start+size instead of pfns walk_memory_range() was once used to iterate over sections. Now, it iterates over memory blocks. Rename the function, fixup the documentation. Also, pass start+size instead of PFNs, which is what most callers already have at hand. (we'll rework link_mem_sections() most probably soon) Follow-up patches will rework, simplify, and move walk_memory_blocks() to drivers/base/memory.c. Note: walk_memory_blocks() only works correctly right now if the start_pfn is aligned to a section start. This is the case right now, but we'll generalize the function in a follow up patch so the semantics match the documentation. [akpm@linux-foundation.org: remove unused variable] Link: http://lkml.kernel.org/r/20190614100114.311-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Greg Kroah-Hartman Cc: David Hildenbrand Cc: Rashmica Gupta Cc: Pavel Tatashin Cc: Anshuman Khandual Cc: Michael Neuling Cc: Thomas Gleixner Cc: Oscar Salvador Cc: Michal Hocko Cc: Wei Yang Cc: Juergen Gross Cc: Qian Cai Cc: Arun KS Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/powernv/memtrace.c | 23 +++++++++++------------ drivers/acpi/acpi_memhotplug.c | 19 ++++--------------- drivers/base/node.c | 5 +++-- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 24 +++++++++++++----------- 5 files changed, 32 insertions(+), 41 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 5e53c1392d3b..eb2e75dac369 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -70,23 +70,23 @@ static int change_memblock_state(struct memory_block *mem, void *arg) /* called with device_hotplug_lock held */ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages) { - u64 end_pfn = start_pfn + nr_pages - 1; + const unsigned long start = PFN_PHYS(start_pfn); + const unsigned long size = PFN_PHYS(nr_pages); - if (walk_memory_range(start_pfn, end_pfn, NULL, - check_memblock_online)) + if (walk_memory_blocks(start, size, NULL, check_memblock_online)) return false; - walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE, - change_memblock_state); + walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE, + change_memblock_state); if (offline_pages(start_pfn, nr_pages)) { - walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE, - change_memblock_state); + walk_memory_blocks(start, size, (void *)MEM_ONLINE, + change_memblock_state); return false; } - walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE, - change_memblock_state); + walk_memory_blocks(start, size, (void *)MEM_OFFLINE, + change_memblock_state); return true; @@ -242,9 +242,8 @@ static int memtrace_online(void) */ if (!memhp_auto_online) { lock_device_hotplug(); - walk_memory_range(PFN_DOWN(ent->start), - PFN_UP(ent->start + ent->size - 1), - NULL, online_mem_block); + walk_memory_blocks(ent->start, ent->size, NULL, + online_mem_block); unlock_device_hotplug(); } diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index db013dc21c02..e294f44a7850 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -155,16 +155,6 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device) return 0; } -static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info) -{ - return PFN_DOWN(info->start_addr); -} - -static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info) -{ - return PFN_UP(info->start_addr + info->length-1); -} - static int acpi_bind_memblk(struct memory_block *mem, void *arg) { return acpi_bind_one(&mem->dev, arg); @@ -173,9 +163,8 @@ static int acpi_bind_memblk(struct memory_block *mem, void *arg) static int acpi_bind_memory_blocks(struct acpi_memory_info *info, struct acpi_device *adev) { - return walk_memory_range(acpi_meminfo_start_pfn(info), - acpi_meminfo_end_pfn(info), adev, - acpi_bind_memblk); + return walk_memory_blocks(info->start_addr, info->length, adev, + acpi_bind_memblk); } static int acpi_unbind_memblk(struct memory_block *mem, void *arg) @@ -186,8 +175,8 @@ static int acpi_unbind_memblk(struct memory_block *mem, void *arg) static void acpi_unbind_memory_blocks(struct acpi_memory_info *info) { - walk_memory_range(acpi_meminfo_start_pfn(info), - acpi_meminfo_end_pfn(info), NULL, acpi_unbind_memblk); + walk_memory_blocks(info->start_addr, info->length, NULL, + acpi_unbind_memblk); } static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) diff --git a/drivers/base/node.c b/drivers/base/node.c index 27391f1e8f60..75b7e6f6535b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -834,8 +834,9 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) { - return walk_memory_range(start_pfn, end_pfn, (void *)&nid, - register_mem_sect_under_node); + return walk_memory_blocks(PFN_PHYS(start_pfn), + PFN_PHYS(end_pfn - start_pfn), (void *)&nid, + register_mem_sect_under_node); } #ifdef CONFIG_HUGETLBFS diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 79e0add6a597..d9fffc34949f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -340,7 +340,7 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ extern void __ref free_area_init_core_hotplug(int nid); -extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, +extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, int (*func)(struct memory_block *, void *)); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d1d0ceaaca88..b3ef84e408fa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1124,8 +1124,7 @@ int __ref add_memory_resource(int nid, struct resource *res) /* online pages if requested */ if (memhp_auto_online) - walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), - NULL, online_memory_block); + walk_memory_blocks(start, size, NULL, online_memory_block); return ret; error: @@ -1663,20 +1662,24 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) #endif /* CONFIG_MEMORY_HOTREMOVE */ /** - * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) - * @start_pfn: start pfn of the memory range - * @end_pfn: end pfn of the memory range + * walk_memory_blocks - walk through all present memory blocks overlapped + * by the range [start, start + size) + * + * @start: start address of the memory range + * @size: size of the memory range * @arg: argument passed to func - * @func: callback for each memory section walked + * @func: callback for each memory block walked * - * This function walks through all present mem sections in range - * [start_pfn, end_pfn) and call func on each mem section. + * This function walks through all present memory blocks overlapped by the + * range [start, start + size), calling func on each memory block. * * Returns the return value of func. */ -int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, +int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, int (*func)(struct memory_block *, void *)) { + const unsigned long start_pfn = PFN_DOWN(start); + const unsigned long end_pfn = PFN_UP(start + size - 1); struct memory_block *mem = NULL; struct mem_section *section; unsigned long pfn, section_nr; @@ -1822,8 +1825,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) * whether all memory blocks in question are offline and return error * if this is not the case. */ - rc = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, - check_memblock_offlined_cb); + rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb); if (rc) goto done; -- cgit From ea8846411ad686ff626e00bb2c3821b3db2ab56a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:50 -0700 Subject: mm/memory_hotplug: move and simplify walk_memory_blocks() Let's move walk_memory_blocks() to the place where memory block logic resides and simplify it. While at it, add a type for the callback function. Link: http://lkml.kernel.org/r/20190614100114.311-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: David Hildenbrand Cc: Stephen Rothwell Cc: Pavel Tatashin Cc: Andrew Banman Cc: Mike Travis Cc: Oscar Salvador Cc: Michal Hocko Cc: Wei Yang Cc: Arun KS Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 42 ++++++++++++++++++++++++++++++++ include/linux/memory.h | 3 +++ include/linux/memory_hotplug.h | 2 -- mm/memory_hotplug.c | 55 ------------------------------------------ 4 files changed, 45 insertions(+), 57 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c54e80fd25a8..0204384b4d1d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -44,6 +44,11 @@ static inline unsigned long pfn_to_block_id(unsigned long pfn) return base_memory_block_id(pfn_to_section_nr(pfn)); } +static inline unsigned long phys_to_block_id(unsigned long phys) +{ + return pfn_to_block_id(PFN_DOWN(phys)); +} + static int memory_subsys_online(struct device *dev); static int memory_subsys_offline(struct device *dev); @@ -851,3 +856,40 @@ out: printk(KERN_ERR "%s() failed: %d\n", __func__, ret); return ret; } + +/** + * walk_memory_blocks - walk through all present memory blocks overlapped + * by the range [start, start + size) + * + * @start: start address of the memory range + * @size: size of the memory range + * @arg: argument passed to func + * @func: callback for each memory section walked + * + * This function walks through all present memory blocks overlapped by the + * range [start, start + size), calling func on each memory block. + * + * In case func() returns an error, walking is aborted and the error is + * returned. + */ +int walk_memory_blocks(unsigned long start, unsigned long size, + void *arg, walk_memory_blocks_func_t func) +{ + const unsigned long start_block_id = phys_to_block_id(start); + const unsigned long end_block_id = phys_to_block_id(start + size - 1); + struct memory_block *mem; + unsigned long block_id; + int ret = 0; + + for (block_id = start_block_id; block_id <= end_block_id; block_id++) { + mem = find_memory_block_by_id(block_id, NULL); + if (!mem) + continue; + + ret = func(mem, arg); + put_device(&mem->dev); + if (ret) + break; + } + return ret; +} diff --git a/include/linux/memory.h b/include/linux/memory.h index f26a5417ec5d..b3b388775a30 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -119,6 +119,9 @@ extern int memory_isolate_notify(unsigned long val, void *v); extern struct memory_block *find_memory_block_hinted(struct mem_section *, struct memory_block *); extern struct memory_block *find_memory_block(struct mem_section *); +typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); +extern int walk_memory_blocks(unsigned long start, unsigned long size, + void *arg, walk_memory_blocks_func_t func); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<= mem->start_section_nr) && - (section_nr <= mem->end_section_nr)) - continue; - - mem = find_memory_block_hinted(section, mem); - if (!mem) - continue; - - ret = func(mem, arg); - if (ret) { - kobject_put(&mem->dev.kobj); - return ret; - } - } - - if (mem) - kobject_put(&mem->dev.kobj); - - return 0; -} - -#ifdef CONFIG_MEMORY_HOTREMOVE static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); -- cgit From 7ea6216049ff9cf250a6722cd766d99c8d1424e5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:22 -0700 Subject: mm/sparsemem: prepare for sub-section ranges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare the memory hot-{add,remove} paths for handling sub-section ranges by plumbing the starting page frame and number of pages being handled through arch_{add,remove}_memory() to sparse_{add,remove}_one_section(). This is simply plumbing, small cleanups, and some identifier renames. No intended functional changes. Link: http://lkml.kernel.org/r/156092353780.979959.9713046515562743194.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Pavel Tatashin Tested-by: Aneesh Kumar K.V [ppc64] Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Vlastimil Babka Cc: Logan Gunthorpe Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 5 +- mm/memory_hotplug.c | 114 +++++++++++++++++++++++++---------------- mm/sparse.c | 16 +++--- 3 files changed, 81 insertions(+), 54 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 475aff8efbf8..2d636a7491a4 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -346,9 +346,10 @@ extern int add_memory_resource(int nid, struct resource *resource); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern bool is_memblock_offlined(struct memory_block *mem); -extern int sparse_add_one_section(int nid, unsigned long start_pfn, - struct vmem_altmap *altmap); +extern int sparse_add_section(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap); extern void sparse_remove_one_section(struct mem_section *ms, + unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 11220044b01a..3fbb2cfab126 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -252,51 +252,84 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static int __meminit __add_section(int nid, unsigned long phys_start_pfn, - struct vmem_altmap *altmap) +static int __meminit __add_section(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { int ret; - if (pfn_valid(phys_start_pfn)) + if (pfn_valid(pfn)) return -EEXIST; - ret = sparse_add_one_section(nid, phys_start_pfn, altmap); + ret = sparse_add_section(nid, pfn, nr_pages, altmap); return ret < 0 ? ret : 0; } +static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, + const char *reason) +{ + /* + * Disallow all operations smaller than a sub-section and only + * allow operations smaller than a section for + * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() + * enforces a larger memory_block_size_bytes() granularity for + * memory that will be marked online, so this check should only + * fire for direct arch_{add,remove}_memory() users outside of + * add_memory_resource(). + */ + unsigned long min_align; + + if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + min_align = PAGES_PER_SUBSECTION; + else + min_align = PAGES_PER_SECTION; + if (!IS_ALIGNED(pfn, min_align) + || !IS_ALIGNED(nr_pages, min_align)) { + WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n", + reason, pfn, pfn + nr_pages - 1); + return -EINVAL; + } + return 0; +} + /* * Reasonably generic function for adding memory. It is * expected that archs that support memory hotplug will * call this function after deciding the zone to which to * add the new pages. */ -int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, struct mhp_restrictions *restrictions) +int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, + struct mhp_restrictions *restrictions) { unsigned long i; - int err = 0; - int start_sec, end_sec; + int start_sec, end_sec, err; struct vmem_altmap *altmap = restrictions->altmap; - /* during initialize mem_map, align hot-added range to section */ - start_sec = pfn_to_section_nr(phys_start_pfn); - end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); - if (altmap) { /* * Validate altmap is within bounds of the total request */ - if (altmap->base_pfn != phys_start_pfn + if (altmap->base_pfn != pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); - err = -EINVAL; - goto out; + return -EINVAL; } altmap->alloc = 0; } + err = check_pfn_span(pfn, nr_pages, "add"); + if (err) + return err; + + start_sec = pfn_to_section_nr(pfn); + end_sec = pfn_to_section_nr(pfn + nr_pages - 1); for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, section_nr_to_pfn(i), altmap); + unsigned long pfns; + + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + err = __add_section(nid, pfn, pfns, altmap); + pfn += pfns; + nr_pages -= pfns; /* * EEXIST is finally dealt with by ioresource collision @@ -309,7 +342,6 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, cond_resched(); } vmemmap_populate_print_last(); -out: return err; } @@ -487,10 +519,10 @@ static void shrink_pgdat_span(struct pglist_data *pgdat, pgdat->node_spanned_pages = 0; } -static void __remove_zone(struct zone *zone, unsigned long start_pfn) +static void __remove_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; unsigned long flags; pgdat_resize_lock(zone->zone_pgdat, &flags); @@ -499,27 +531,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static void __remove_section(struct zone *zone, struct mem_section *ms, - unsigned long map_offset, - struct vmem_altmap *altmap) +static void __remove_section(struct zone *zone, unsigned long pfn, + unsigned long nr_pages, unsigned long map_offset, + struct vmem_altmap *altmap) { - unsigned long start_pfn; - int scn_nr; + struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); if (WARN_ON_ONCE(!valid_section(ms))) return; - scn_nr = __section_nr(ms); - start_pfn = section_nr_to_pfn((unsigned long)scn_nr); - __remove_zone(zone, start_pfn); - - sparse_remove_one_section(ms, map_offset, altmap); + __remove_zone(zone, pfn, nr_pages); + sparse_remove_one_section(ms, pfn, nr_pages, map_offset, altmap); } /** * __remove_pages() - remove sections of pages from a zone * @zone: zone from which pages need to be removed - * @phys_start_pfn: starting pageframe (must be aligned to start of a section) + * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used * @@ -528,30 +556,30 @@ static void __remove_section(struct zone *zone, struct mem_section *ms, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, +void __remove_pages(struct zone *zone, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - unsigned long i; unsigned long map_offset = 0; - int sections_to_remove; + int i, start_sec, end_sec; map_offset = vmem_altmap_offset(altmap); clear_zone_contiguous(zone); - /* - * We can only remove entire sections - */ - BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); - BUG_ON(nr_pages % PAGES_PER_SECTION); + if (check_pfn_span(pfn, nr_pages, "remove")) + return; - sections_to_remove = nr_pages / PAGES_PER_SECTION; - for (i = 0; i < sections_to_remove; i++) { - unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + start_sec = pfn_to_section_nr(pfn); + end_sec = pfn_to_section_nr(pfn + nr_pages - 1); + for (i = start_sec; i <= end_sec; i++) { + unsigned long pfns; cond_resched(); - __remove_section(zone, __pfn_to_section(pfn), map_offset, - altmap); + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + __remove_section(zone, pfn, pfns, map_offset, altmap); + pfn += pfns; + nr_pages -= pfns; map_offset = 0; } diff --git a/mm/sparse.c b/mm/sparse.c index 6b01022e23a9..41579b66fff1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -728,8 +728,8 @@ static void free_map_bootmem(struct page *memmap) * * -EEXIST - Section has been present. * * -ENOMEM - Out of memory. */ -int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, - struct vmem_altmap *altmap) +int __meminit sparse_add_section(int nid, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct mem_section_usage *usage; @@ -835,8 +835,9 @@ static void free_section_usage(struct mem_section *ms, struct page *memmap, free_map_bootmem(memmap); } -void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, - struct vmem_altmap *altmap) +void sparse_remove_one_section(struct mem_section *ms, unsigned long pfn, + unsigned long nr_pages, unsigned long map_offset, + struct vmem_altmap *altmap) { struct page *memmap = NULL; struct mem_section_usage *usage = NULL; @@ -849,10 +850,7 @@ void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset, ms->usage = NULL; } - clear_hwpoisoned_pages(memmap + map_offset, - PAGES_PER_SECTION - map_offset); - free_section_usage(ms, memmap, usage, - section_nr_to_pfn(__section_nr(ms)), - PAGES_PER_SECTION, altmap); + clear_hwpoisoned_pages(memmap + map_offset, nr_pages - map_offset); + free_section_usage(ms, memmap, usage, pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit From ba72b4c8cf60e452cf6f0258ed9ee697957b7dfd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:26 -0700 Subject: mm/sparsemem: support sub-section hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The libnvdimm sub-system has suffered a series of hacks and broken workarounds for the memory-hotplug implementation's awkward section-aligned (128MB) granularity. For example the following backtrace is emitted when attempting arch_add_memory() with physical address ranges that intersect 'System RAM' (RAM) with 'Persistent Memory' (PMEM) within a given section: # cat /proc/iomem | grep -A1 -B1 Persistent\ Memory 100000000-1ffffffff : System RAM 200000000-303ffffff : Persistent Memory (legacy) 304000000-43fffffff : System RAM 440000000-23ffffffff : Persistent Memory 2400000000-43bfffffff : Persistent Memory 2400000000-43bfffffff : namespace2.0 WARNING: CPU: 38 PID: 928 at arch/x86/mm/init_64.c:850 add_pages+0x5c/0x60 [..] RIP: 0010:add_pages+0x5c/0x60 [..] Call Trace: devm_memremap_pages+0x460/0x6e0 pmem_attach_disk+0x29e/0x680 [nd_pmem] ? nd_dax_probe+0xfc/0x120 [libnvdimm] nvdimm_bus_probe+0x66/0x160 [libnvdimm] It was discovered that the problem goes beyond RAM vs PMEM collisions as some platform produce PMEM vs PMEM collisions within a given section. The libnvdimm workaround for that case revealed that the libnvdimm section-alignment-padding implementation has been broken for a long while. A fix for that long-standing breakage introduces as many problems as it solves as it would require a backward-incompatible change to the namespace metadata interpretation. Instead of that dubious route [1], address the root problem in the memory-hotplug implementation. Note that EEXIST is no longer treated as success as that is how sparse_add_section() reports subsection collisions, it was also obviated by recent changes to perform the request_region() for 'System RAM' before arch_add_memory() in the add_memory() sequence. [1] https://lore.kernel.org/r/155000671719.348031.2347363160141119237.stgit@dwillia2-desk3.amr.corp.intel.com [osalvador@suse.de: fix deactivate_section for early sections] Link: http://lkml.kernel.org/r/20190715081549.32577-2-osalvador@suse.de Link: http://lkml.kernel.org/r/156092354368.979959.6232443923440952359.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Signed-off-by: Oscar Salvador Tested-by: Aneesh Kumar K.V [ppc64] Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Vlastimil Babka Cc: Logan Gunthorpe Cc: Pavel Tatashin Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Mike Rapoport Cc: Toshi Kani Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 27 +----- mm/page_alloc.c | 2 +- mm/sparse.c | 206 +++++++++++++++++++++++++++-------------- 4 files changed, 141 insertions(+), 96 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2d636a7491a4..f46ea71b4ffd 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -348,7 +348,7 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -extern void sparse_remove_one_section(struct mem_section *ms, +extern void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3fbb2cfab126..aafb71594ee3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -252,18 +252,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static int __meminit __add_section(int nid, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) -{ - int ret; - - if (pfn_valid(pfn)) - return -EEXIST; - - ret = sparse_add_section(nid, pfn, nr_pages, altmap); - return ret < 0 ? ret : 0; -} - static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, const char *reason) { @@ -327,18 +315,11 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, pfns = min(nr_pages, PAGES_PER_SECTION - (pfn & ~PAGE_SECTION_MASK)); - err = __add_section(nid, pfn, pfns, altmap); + err = sparse_add_section(nid, pfn, pfns, altmap); + if (err) + break; pfn += pfns; nr_pages -= pfns; - - /* - * EEXIST is finally dealt with by ioresource collision - * check. see add_memory() => register_memory_resource() - * Warning will be printed if there is collision. - */ - if (err && (err != -EEXIST)) - break; - err = 0; cond_resched(); } vmemmap_populate_print_last(); @@ -541,7 +522,7 @@ static void __remove_section(struct zone *zone, unsigned long pfn, return; __remove_zone(zone, pfn, nr_pages); - sparse_remove_one_section(ms, pfn, nr_pages, map_offset, altmap); + sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c74367a8eba..272c6de1bf4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5974,7 +5974,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * pfn out of zone. * * Please note that MEMMAP_HOTPLUG path doesn't clear memmap - * because this is done early in sparse_add_one_section + * because this is done early in section_activate() */ if (!(pfn & (pageblock_nr_pages - 1))) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); diff --git a/mm/sparse.c b/mm/sparse.c index 41579b66fff1..a205a2ac66a4 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -83,8 +83,15 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; + /* + * An existing section is possible in the sub-section hotplug + * case. First hot-add instantiates, follow-on hot-add reuses + * the existing section. + * + * The mem_hotplug_lock resolves the apparent race below. + */ if (mem_section[root]) - return -EEXIST; + return 0; section = sparse_index_alloc(nid); if (!section) @@ -715,10 +722,120 @@ static void free_map_bootmem(struct page *memmap) } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +static void section_deactivate(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + bool section_is_early = early_section(ms); + struct page *memmap = NULL; + unsigned long *subsection_map = ms->usage + ? &ms->usage->subsection_map[0] : NULL; + + subsection_mask_set(map, pfn, nr_pages); + if (subsection_map) + bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); + + if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), + "section already deactivated (%#lx + %ld)\n", + pfn, nr_pages)) + return; + + /* + * There are 3 cases to handle across two configurations + * (SPARSEMEM_VMEMMAP={y,n}): + * + * 1/ deactivation of a partial hot-added section (only possible + * in the SPARSEMEM_VMEMMAP=y case). + * a/ section was present at memory init + * b/ section was hot-added post memory init + * 2/ deactivation of a complete hot-added section + * 3/ deactivation of a complete section from memory init + * + * For 1/, when subsection_map does not empty we will not be + * freeing the usage map, but still need to free the vmemmap + * range. + * + * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified + */ + bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); + if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { + unsigned long section_nr = pfn_to_section_nr(pfn); + + if (!section_is_early) { + kfree(ms->usage); + ms->usage = NULL; + } + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr); + } + + if (section_is_early && memmap) + free_map_bootmem(memmap); + else + depopulate_section_memmap(pfn, nr_pages, altmap); +} + +static struct page * __meminit section_activate(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + struct mem_section_usage *usage = NULL; + unsigned long *subsection_map; + struct page *memmap; + int rc = 0; + + subsection_mask_set(map, pfn, nr_pages); + + if (!ms->usage) { + usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); + if (!usage) + return ERR_PTR(-ENOMEM); + ms->usage = usage; + } + subsection_map = &ms->usage->subsection_map[0]; + + if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) + rc = -EINVAL; + else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) + rc = -EEXIST; + else + bitmap_or(subsection_map, map, subsection_map, + SUBSECTIONS_PER_SECTION); + + if (rc) { + if (usage) + ms->usage = NULL; + kfree(usage); + return ERR_PTR(rc); + } + + /* + * The early init code does not consider partially populated + * initial sections, it simply assumes that memory will never be + * referenced. If we hot-add memory into such a section then we + * do not need to populate the memmap and can simply reuse what + * is already there. + */ + if (nr_pages < PAGES_PER_SECTION && early_section(ms)) + return pfn_to_page(pfn); + + memmap = populate_section_memmap(pfn, nr_pages, nid, altmap); + if (!memmap) { + section_deactivate(pfn, nr_pages, altmap); + return ERR_PTR(-ENOMEM); + } + + return memmap; +} + /** - * sparse_add_one_section - add a memory section + * sparse_add_section - add a memory section, or populate an existing one * @nid: The node to add section on * @start_pfn: start pfn of the memory range + * @nr_pages: number of pfns to add in the section * @altmap: device page map * * This is only intended for hotplug. @@ -732,51 +849,34 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct mem_section_usage *usage; struct mem_section *ms; struct page *memmap; int ret; - /* - * no locking for this, because it does its own - * plus, it does a kmalloc - */ ret = sparse_index_init(section_nr, nid); - if (ret < 0 && ret != -EEXIST) + if (ret < 0) return ret; - ret = 0; - memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid, - altmap); - if (!memmap) - return -ENOMEM; - usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); - if (!usage) { - depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap); - return -ENOMEM; - } - ms = __pfn_to_section(start_pfn); - if (ms->section_mem_map & SECTION_MARKED_PRESENT) { - ret = -EEXIST; - goto out; - } + memmap = section_activate(nid, start_pfn, nr_pages, altmap); + if (IS_ERR(memmap)) + return PTR_ERR(memmap); /* * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION); + page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); + ms = __pfn_to_section(start_pfn); set_section_nid(section_nr, nid); section_mark_present(ms); - sparse_init_one_section(ms, section_nr, memmap, usage, 0); -out: - if (ret < 0) { - kfree(usage); - depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap); - } - return ret; + /* Align memmap to section boundary in the subsection case */ + if (section_nr_to_pfn(section_nr) != start_pfn) + memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr)); + sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); + + return 0; } #ifdef CONFIG_MEMORY_FAILURE @@ -809,48 +909,12 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif -static void free_section_usage(struct mem_section *ms, struct page *memmap, - struct mem_section_usage *usage, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) -{ - if (!usage) - return; - - /* - * Check to see if allocation came from hot-plug-add - */ - if (!early_section(ms)) { - kfree(usage); - if (memmap) - depopulate_section_memmap(pfn, nr_pages, altmap); - return; - } - - /* - * The usemap came from bootmem. This is packed with other usemaps - * on the section which has pgdat at boot time. Just keep it as is now. - */ - - if (memmap) - free_map_bootmem(memmap); -} - -void sparse_remove_one_section(struct mem_section *ms, unsigned long pfn, +void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) { - struct page *memmap = NULL; - struct mem_section_usage *usage = NULL; - - if (ms->section_mem_map) { - usage = ms->usage; - memmap = sparse_decode_mem_map(ms->section_mem_map, - __section_nr(ms)); - ms->section_mem_map = 0; - ms->usage = NULL; - } - - clear_hwpoisoned_pages(memmap + map_offset, nr_pages - map_offset); - free_section_usage(ms, memmap, usage, pfn, nr_pages, altmap); + clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, + nr_pages - map_offset); + section_deactivate(pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit