From b9ada4281c48076bdb252813be24ddb57e17cade Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Tue, 20 May 2008 11:01:08 +0100
Subject: x86: reinstate numa remap for SPARSEMEM on x86 NUMA systems

Recent kernels have been panic'ing trying to allocate memory early in boot, in __alloc_pages:

BUG: unable to handle kernel paging request at 00001568
IP: [] __alloc_pages+0x33/0x2cc
*pdpt = 00000000013a5001 *pde = 0000000000000000
Oops: 0000 [#1] SMP
Modules linked in:
Pid: 1, comm: swapper Not tainted (2.6.25 #78)
EIP: 0060:[] EFLAGS: 00010246 CPU: 0
EIP is at __alloc_pages+0x33/0x2cc
EAX: 00001564 EBX: 000412d0 ECX: 00001564 EDX: 000005c3
ESI: f78012a0 EDI: 00000001 EBP: 00001564 ESP: f7871e50
DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
Process swapper (pid: 1, ti=f7870000 task=f786f670 task.ti=f7870000)
Stack: 00000000 f786f670 00000010 00000000 0000b700 000412d0 f78012a0 00000001
       00000000 c105b64d 00000000 000412d0 f78012a0 f7803120 00000000 c105c1c5
       00000010 f7803144 000412d0 00000001 f7803130 f7803120 f78012a0 00000001
Call Trace:
[] kmem_getpages+0x94/0x129
[] cache_grow+0x8f/0x123
[] ____cache_alloc_node+0xb9/0xe4
[] kmem_cache_alloc_node+0x92/0xd2
[] build_sched_domains+0x536/0x70d
[] do_flush_tlb_all+0x0/0x3f
[] do_flush_tlb_all+0x0/0x3f
[] interleave_nodes+0x23/0x5a
[] alternate_node_alloc+0x43/0x5b
[] arch_init_sched_domains+0x46/0x51
[] kernel_init+0x0/0x82
[] sched_init_smp+0x10/0xbb
[] kernel_init+0x43/0x82
[] kernel_thread_helper+0x7/0x10

Debugging this showed that the NODE_DATA() for nodes other than node 0 were all NULL.  Tracing this back showed that the NODE_DATA() pointers were being initialised to each node's remap space.  However, under SPARSEMEM remap is disabled, which leads to the pgdats being placed incorrectly at kernel virtual address 0, and hence to the panic when attempting to allocate memory from these nodes.

Numa remap was disabled in the commit below.  This occurred while fixing problems triggered when attempting to boot x86_32 NUMA SPARSEMEM kernels on non-numa hardware.

    x86: make NUMA work on 32-bit
    commit 1b000a5dbeb2f34bc03d45ebdf3f6d24a60c3aed

The real problem is believed to be related to other alignment issues in the regions blocked out from the bootmem allocator for small memory systems, and has been fixed separately.  Therefore re-enable remap for SPARSEMEM, which fixes pgdat allocation issues.  Testing confirms that SPARSEMEM NUMA kernels will boot correctly with this part of the change reverted.

Signed-off-by: Andy Whitcroft
Acked-by: Mel Gorman
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 34 ++++++----------------------------
 1 file changed, 6 insertions(+), 28 deletions(-)

(limited to 'arch/x86/mm/discontig_32.c')

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..026201f143a8 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -164,16 +164,13 @@ static void __init allocate_pgdat(int nid)
 	}
 }

-#ifdef CONFIG_DISCONTIGMEM
 /*
- * In the discontig memory model, a portion of the kernel virtual area (KVA)
- * is reserved and portions of nodes are mapped using it. This is to allow
- * node-local memory to be allocated for structures that would normally require
- * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
- * should be prepared to allocate from the bootmem allocator instead. This KVA
- * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
- * layout of memory that are broken if alloc_remap() succeeds for some of the
- * map and fails for others
+ * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
+ * virtual address space (KVA) is reserved and portions of nodes are mapped
+ * using it. This is to allow node-local memory to be allocated for
+ * structures that would normally require ZONE_NORMAL. The memory is
+ * allocated with alloc_remap() and callers should be prepared to allocate
+ * from the bootmem allocator instead.
  */
 static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -290,25 +287,6 @@ static void init_remap_allocator(int nid)
 		(ulong) pfn_to_kaddr(highstart_pfn
 		   + node_remap_offset[nid] + node_remap_size[nid]));
 }
-#else
-void *alloc_remap(int nid, unsigned long size)
-{
-	return NULL;
-}
-
-static unsigned long calculate_numa_remap_pages(void)
-{
-	return 0;
-}
-
-static void init_remap_allocator(int nid)
-{
-}
-
-void __init remap_numa_kva(void)
-{
-}
-#endif /* CONFIG_DISCONTIGMEM */

 extern void setup_bootmem_allocator(void);

 unsigned long __init setup_memory(void)
-- cgit

From 84d6bd0e272e6eace1e7e4c501f32bddfb665bb2 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Tue, 20 May 2008 11:01:19 +0100
Subject: x86: cope with no remap space being allocated for a numa node

When allocating the pgdats for numa nodes on x86_32, we attempt to place them in the numa remap space for that node.  However, should the node not have any remap space allocated (for example because it has non-ram pages in the remap location), we will incorrectly place the pgdat at zero.  Check that we have remap available, falling back to node 0 memory where we do not.
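As an aside for readers following the series: once this check is applied, the allocation policy reads as sketched below.  This simply restates the one-line diff that follows in context; it is not itself part of the patch.

    /* Sketch of allocate_pgdat() with the fix applied (mirrors the diff below). */
    static void __init allocate_pgdat(int nid)
    {
            /* only use the remap window if this node actually has one */
            if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid])
                    NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
            else {
                    /* otherwise fall back to low memory on node 0 */
                    NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
                    min_low_pfn += PFN_UP(sizeof(pg_data_t));
            }
    }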
Signed-off-by: Andy Whitcroft
Acked-by: Mel Gorman
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm/discontig_32.c')

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 026201f143a8..435c343c14bc 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -156,7 +156,7 @@ static void __init propagate_e820_map_node(int nid)
  */
 static void __init allocate_pgdat(int nid)
 {
-	if (nid && node_has_online_mem(nid))
+	if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid])
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
 		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-- cgit

From 163872950dc856fd23849c27f60049feaac49ae6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 29 May 2008 12:57:22 -0700
Subject: x86: extend e820 early_res support 32bit -fix #4

reserve_early pgdata for 32bit numa

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86/mm/discontig_32.c')

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..47749727907e 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -159,8 +159,13 @@ static void __init allocate_pgdat(int nid)
 	if (nid && node_has_online_mem(nid))
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
-		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-		min_low_pfn += PFN_UP(sizeof(pg_data_t));
+		unsigned long pgdat_phys;
+		pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 max_low_pfn<<PAGE_SHIFT, sizeof(pg_data_t),
+				 PAGE_SIZE);
+		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
+		reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
+				 "NODE_DATA");
 	}
 }
-- cgit

From a5481280b29b6a3db912ec100498bd31eaa6d2db Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Thu, 29 May 2008 12:58:37 -0700
Subject: x86: extend e820 early_res support 32bit -fix #5

reserve early numa kva, so it will not clash with new RAMDISK

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/setup_32.c  |  1 -
 arch/x86/mm/discontig_32.c  | 12 +++++-------
 include/asm-x86/mmzone_32.h |  4 ----
 3 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'arch/x86/mm/discontig_32.c')

diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index de9c5ee77d07..6f40cb560ea9 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -614,7 +614,6 @@ void __init setup_bootmem_allocator(void)
 	 */
 	find_smp_config();
 #endif
-	numa_kva_reserve();
 	reserve_crashkernel();

 	reserve_ibft_region();
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 47749727907e..55fdbab6b014 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -357,6 +357,11 @@ unsigned long __init setup_memory(void)
 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
 		kva_start_pfn, max_low_pfn);
 	printk("max_pfn = %ld\n", max_pfn);
+
+	/* avoid clash with initrd */
+	reserve_early(kva_start_pfn<<PAGE_SHIFT,
+		      (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+		     "KVA PG");
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
 	if (max_pfn > system_max_low_pfn)
@@ -392,13 +397,6 @@ unsigned long __init setup_memory(void)
 	return max_low_pfn;
 }

-void __init numa_kva_reserve(void)
-{
-	if (kva_pages)
-		reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
-				BOOTMEM_DEFAULT);
-}
-
 void __init zone_sizes_init(void)
 {
 	int nid;
diff --git a/include/asm-x86/mmzone_32.h b/include/asm-x86/mmzone_32.h
index cb2cad0b65a7..faef751181b7 100644
--- a/include/asm-x86/mmzone_32.h
+++ b/include/asm-x86/mmzone_32.h
@@ -38,16 +38,12 @@ static inline void
get_memcfg_numa(void) } extern int early_pfn_to_nid(unsigned long pfn); -extern void numa_kva_reserve(void); #else /* !CONFIG_NUMA */ #define get_memcfg_numa get_memcfg_numa_flat #define get_zholes_size(n) (0) -static inline void numa_kva_reserve(void) -{ -} #endif /* CONFIG_NUMA */ #ifdef CONFIG_DISCONTIGMEM -- cgit From 2772f54bf37b033263abe4a2314c40a308a1a5cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Jun 2008 10:09:45 +0200 Subject: x86: add acpi_numa_slit_init() dummy implementation on 32-bit allow CONFIG_ACPI_NUMA builds to succeed on 32-bit. --- arch/x86/mm/discontig_32.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 435c343c14bc..d28987a2f2bb 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -454,3 +454,12 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif + +#ifdef CONFIG_ACPI_NUMA +/* + * Dummy on 32-bit, for now: + */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ +} +#endif -- cgit From b28852d6703e4b72ce363c5168ea8d3fb28b9c57 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Jun 2008 10:11:07 +0200 Subject: x86: add dummy acpi_numa_processor_affinity_init() implementation on 32-bit allow CONFIG_ACPI_NUMA builds to succeed. --- arch/x86/mm/discontig_32.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index d28987a2f2bb..bb8544a7cb9d 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -462,4 +462,9 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { } + +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ +} #endif -- cgit From cf3d0cb8a10a4fd19439bb34995d9ad28854739a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Jun 2008 11:36:56 +0200 Subject: x86: 32-bit numa, build fix on Summit it's possible to have: CONFIG_ACPI_SRAT=y CONFIG_HAVE_ARCH_PARSE_SRAT=y in which case acpi.h defines the acpi_numa_slit_init() and acpi_numa_processor_affinity_init() methods as a macro. --- arch/x86/mm/discontig_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index bb8544a7cb9d..0d28b4bff817 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -455,7 +455,7 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#ifdef CONFIG_ACPI_NUMA +#if defined(CONFIG_ACPI_NUMA) && !defined(CONFIG_HAVE_ARCH_PARSE_SRAT) /* * Dummy on 32-bit, for now: */ -- cgit From 471b3c1b011f807d16f1e19d1d4ecf703f1e7d1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Jun 2008 12:39:43 +0200 Subject: x86, numaq 32-bit: build fix fix: drivers/built-in.o: In function `acpi_numa_init': : undefined reference to `acpi_numa_arch_fixup' which can happen with ACPI && NUMAQ. 
--- arch/x86/mm/discontig_32.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 0d28b4bff817..8b4eac0ca07d 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -467,4 +467,8 @@ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { } + +void __init acpi_numa_arch_fixup(void) +{ +} #endif -- cgit From ba924c81dd5a7a7fb5ded025af7fdd3b61f8ca67 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 31 May 2008 22:51:51 -0700 Subject: x86, numa, 32-bit: increase max_elements to 1024 so every element will represent 64M instead of 256M. AMD opteron could have HW memory hole remapping, so could have [0, 8g + 64M) on node0. Reduce element size to 64M to keep that on node 0 Later we need to use find_e820_area() to allocate memory_node_map like on 64-bit. But need to move memory_present out of populate_mem_map... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 14 +++++++------- include/asm-x86/mmzone_32.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 55fdbab6b014..922af20d1d29 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -59,14 +59,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; /* * 4) physnode_map - the mapping between a pfn and owning node * physnode_map keeps track of the physical memory layout of a generic - * numa node on a 256Mb break (each element of the array will - * represent 256Mb of memory and will be marked by the node id. so, + * numa node on a 64Mb break (each element of the array will + * represent 64Mb of memory and will be marked by the node id. so, * if the first gig is on node 0, and the second gig is on node 1 * physnode_map will contain: * - * physnode_map[0-3] = 0; - * physnode_map[4-7] = 1; - * physnode_map[8- ] = -1; + * physnode_map[0-15] = 0; + * physnode_map[16-31] = 1; + * physnode_map[32- ] = -1; */ s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... 
(MAX_ELEMENTS - 1)] = -1}; EXPORT_SYMBOL(physnode_map); @@ -81,9 +81,9 @@ void memory_present(int nid, unsigned long start, unsigned long end) printk(KERN_DEBUG " "); for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { physnode_map[pfn / PAGES_PER_ELEMENT] = nid; - printk("%ld ", pfn); + printk(KERN_CONT "%ld ", pfn); } - printk("\n"); + printk(KERN_CONT "\n"); } unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, diff --git a/include/asm-x86/mmzone_32.h b/include/asm-x86/mmzone_32.h index faef751181b7..ab0012888c44 100644 --- a/include/asm-x86/mmzone_32.h +++ b/include/asm-x86/mmzone_32.h @@ -51,14 +51,14 @@ extern int early_pfn_to_nid(unsigned long pfn); /* * generic node memory support, the following assumptions apply: * - * 1) memory comes in 256Mb contigious chunks which are either present or not + * 1) memory comes in 64Mb contigious chunks which are either present or not * 2) we will not have more than 64Gb in total * * for now assume that 64Gb is max amount of RAM for whole system * 64Gb / 4096bytes/page = 16777216 pages */ #define MAX_NR_PAGES 16777216 -#define MAX_ELEMENTS 256 +#define MAX_ELEMENTS 1024 #define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) extern s8 physnode_map[]; -- cgit From b66cd7207387b9b428aaf1988e21dd263c6a4928 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 31 May 2008 22:53:47 -0700 Subject: x86: set node_remap_size[0] in fallback path ... otherwise alloc_remap will not get node_mem_map from kva area, and alloc_node_mem_map has to alloc_bootmem_node to get mem_map. It will use two low address copies ... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 922af20d1d29..98b099eeab3a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -124,6 +124,7 @@ int __init get_memcfg_numa_flat(void) node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memory_present(0, 0, max_pfn); + node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); /* Indicate there is one node available. */ nodes_clear(node_online_map); -- cgit From 0596152388e234efebce464355186ad9e16c8cb6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 31 May 2008 22:52:47 -0700 Subject: x86, 32-bit: change propagate_e820_map() back to find_max_pfn() we don't need to call memory_present that early. numa and sparse will call memory_present later and might even fail, it will call memory_present for the full range. also for sparse it will call alloc_bootmem ... before we set up bootmem. 
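A side note on the physnode_map granularity change two patches up: with 64Mb elements, the pfn-to-node lookup is a single array index.  The stand-alone sketch below mirrors the constants from include/asm-x86/mmzone_32.h (the range-designator initializer is the same GNU C extension the kernel uses); the node layout in main() is a made-up example, not real hardware.

    #include <stdio.h>

    #define MAX_NR_PAGES      16777216              /* 64Gb / 4096 bytes per page */
    #define MAX_ELEMENTS      1024
    #define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS) /* 16384 pages = 64Mb */

    static signed char physnode_map[MAX_ELEMENTS] = { [0 ... MAX_ELEMENTS-1] = -1 };

    int main(void)
    {
            int i;
            unsigned long pfn = 0x48000;            /* physical address 1.125Gb */

            /* first gig on node 0, second gig on node 1, as in the commit text */
            for (i = 0; i < 16; i++)
                    physnode_map[i] = 0;
            for (i = 16; i < 32; i++)
                    physnode_map[i] = 1;

            /* element 0x48000 / 16384 = 18, so this resolves to node 1 */
            printf("pfn %#lx -> node %d\n", pfn,
                   physnode_map[pfn / PAGES_PER_ELEMENT]);
            return 0;
    }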
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820_32.c | 5 ++--- arch/x86/kernel/setup_32.c | 4 ++-- arch/x86/mm/discontig_32.c | 2 +- include/asm-x86/e820_32.h | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index db760d4706af..0c025d04fc2e 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -210,7 +210,7 @@ void __init init_iomem_resources(struct resource *code_resource, /* * Find the highest page frame number we have available */ -void __init propagate_e820_map(void) +void __init find_max_pfn(void) { int i; @@ -227,7 +227,6 @@ void __init propagate_e820_map(void) continue; if (end > max_pfn) max_pfn = end; - memory_present(0, start, end); } } @@ -361,7 +360,7 @@ static int __init parse_memmap(char *arg) * size before original memory map is * reset. */ - propagate_e820_map(); + find_max_pfn(); saved_max_pfn = max_pfn; #endif e820.nr_map = 0; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 6f40cb560ea9..29010420458d 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -730,10 +730,10 @@ void __init setup_arch(char **cmdline_p) efi_init(); /* update e820 for memory not covered by WB MTRRs */ - propagate_e820_map(); + find_max_pfn(); mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - propagate_e820_map(); + find_max_pfn(); max_low_pfn = setup_memory(); diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 98b099eeab3a..ebbbba338150 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -120,7 +120,7 @@ int __init get_memcfg_numa_flat(void) printk("NUMA - single node, flat memory mode\n"); /* Run the memory configuration and find the top of memory. 
*/ - propagate_e820_map(); + find_max_pfn(); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memory_present(0, 0, max_pfn); diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index 7ace82570a36..00fbc60b9d30 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -21,7 +21,7 @@ extern void setup_memory_map(void); extern void finish_e820_parsing(void); -extern void propagate_e820_map(void); +extern void find_max_pfn(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); extern void limit_regions(unsigned long long size); extern void init_iomem_resources(struct resource *code_resource, -- cgit From e8c27ac9191ab9e6506ae5cbe70d87ac50f8e960 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 1 Jun 2008 13:15:22 -0700 Subject: x86, numa, 32-bit: print out debug info on all kvas also fix the print out of node_remap_end_vaddr Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 9 +++++++-- mm/page_alloc.c | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index ebbbba338150..3150ad385672 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -168,6 +168,8 @@ static void __init allocate_pgdat(int nid) reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), "NODE_DATA"); } + printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", + nid, (unsigned long)NODE_DATA(nid)); } #ifdef CONFIG_DISCONTIGMEM @@ -208,8 +210,12 @@ void __init remap_numa_kva(void) int node; for_each_online_node(node) { + printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { vaddr = node_remap_start_vaddr[node]+(pfn<node_mem_map); +#endif free_area_init_core(pgdat, zones_size, zholes_size); } -- cgit From 287572cb38de7f270b59191a0fecfa5c5de7765d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 1 Jun 2008 21:06:31 -0700 Subject: x86, numa, 32-bit: avoid clash between ramdisk and kva use find_e820_area to get address space... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 3150ad385672..73a983489c60 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -38,6 +38,7 @@ #include #include #include +#include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -326,7 +327,6 @@ unsigned long __init setup_memory(void) { int nid; unsigned long system_start_pfn, system_max_low_pfn; - unsigned long wasted_pages; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -337,29 +337,18 @@ unsigned long __init setup_memory(void) */ get_memcfg_numa(); - kva_pages = calculate_numa_remap_pages(); + kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); /* partially used pages are not usable - thus round upwards */ system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - kva_start_pfn = find_max_low_pfn() - kva_pages; - -#ifdef CONFIG_BLK_DEV_INITRD - /* Numa kva area is below the initrd */ - if (initrd_start) - kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET) - - kva_pages; -#endif - - /* - * We waste pages past at the end of the KVA for no good reason other - * than how it is located. This is bad. 
- */ - wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1); - kva_start_pfn -= wasted_pages; - kva_pages += wasted_pages; - system_max_low_pfn = max_low_pfn = find_max_low_pfn(); + kva_start_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); + kva_start_pfn = find_e820_area(kva_start_pfn<> PAGE_SHIFT; + printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", kva_start_pfn, max_low_pfn); printk("max_pfn = %ld\n", max_pfn); -- cgit From 6af61a7614a306fe882a0c2b4ddc63b65aa66efc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 1 Jun 2008 23:53:50 -0700 Subject: x86: clean up max_pfn_mapped usage - 32-bit on 32-bit in head_32.S after initial page table is done, we get initial max_pfn_mapped, and then kernel_physical_mapping_init will give us a final one. We need to use that to make sure find_e820_area will get valid addresses for boot_map and for NODE_DATA(0) on numa32. XEN PV and lguest may need to assign max_pfn_mapped too. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 4 ++++ arch/x86/kernel/setup_32.c | 4 +++- arch/x86/mm/discontig_32.c | 3 ++- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index bef4618feadb..ac7002fdd637 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -220,6 +220,8 @@ default_entry: jb 10b 1: movl %edi,pa(init_pg_tables_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax @@ -251,6 +253,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20); cmpl %ebp,%eax jb 10b movl %edi,pa(init_pg_tables_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 29010420458d..c985a8c305e0 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -586,7 +586,7 @@ void __init setup_bootmem_allocator(void) */ bootmap_size = bootmem_bootmap_pages(max_low_pfn)<> PAGE_SHIFT, max_low_pfn); + printk(KERN_INFO " mapped low ram: 0 - %08lx\n", + max_pfn_mapped<>PAGE_SHIFT)); reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), -- cgit From 84b56fa46b36c2df508e7d421feab514fad30f81 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 3 Jun 2008 19:32:30 -0700 Subject: x86, numa, 32-bit: make sure get we kva space when 1/3 user/kernel split is used, and less memory is installed, or if we have a big hole below 4g, max_low_pfn is still using 3g-128m try to go down from max_low_pfn until we get it. otherwise will panic. need to make 32-bit code to use register_e820_active_regions ... later. 
Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'arch/x86/mm/discontig_32.c')

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914a81ee7855..7ced26ab9aec 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -328,6 +328,7 @@ unsigned long __init setup_memory(void)
 {
 	int nid;
 	unsigned long system_start_pfn, system_max_low_pfn;
+	long kva_target_pfn;

 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -344,11 +345,17 @@ unsigned long __init setup_memory(void)
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);

 	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
-	kva_start_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
-	kva_start_pfn = find_e820_area(kva_start_pfn<<PAGE_SHIFT,
-				max_low_pfn<<PAGE_SHIFT,
-				kva_pages<<PAGE_SHIFT,
-				PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+	do {
+		kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+					max_low_pfn<<PAGE_SHIFT,
+					kva_pages<<PAGE_SHIFT,
+					PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+		kva_target_pfn -= PTRS_PER_PTE;
+	} while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
+
+	if (kva_start_pfn == -1UL)
+		panic("Can not get kva space\n");

 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
 		kva_start_pfn, max_low_pfn);
-- cgit

From 7b2a0a6c4866cac146dcb0433e6984eb19a81335 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Tue, 3 Jun 2008 19:35:04 -0700
Subject: x86: make 32-bit use e820_register_active_regions()

This way 32-bit is more similar to 64-bit, with smarter e820 and numa handling.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/e820_32.c  | 68 ++--------------------------------------------
 arch/x86/kernel/numaq_32.c |  3 ++
 arch/x86/kernel/setup_32.c | 24 ++++++++++++----
 arch/x86/kernel/srat_32.c  |  4 ++-
 arch/x86/mm/discontig_32.c | 19 ++++---------
 include/asm-x86/e820_32.h  |  2 --
 6 files changed, 33 insertions(+), 87 deletions(-)

(limited to 'arch/x86/mm/discontig_32.c')

diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 0c025d04fc2e..e8a3b968c9fa 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -207,69 +207,6 @@ void __init init_iomem_resources(struct resource *code_resource,
 	}
 }

-/*
- * Find the highest page frame number we have available
- */
-void __init find_max_pfn(void)
-{
-	int i;
-
-	max_pfn = 0;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long start, end;
-		/* RAM? */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		start = PFN_UP(e820.map[i].addr);
-		end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-		if (start >= end)
-			continue;
-		if (end > max_pfn)
-			max_pfn = end;
-	}
-}
-
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long curr_pfn, last_pfn, size;
-		/*
-		 * Reserve usable low memory
-		 */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		/*
-		 * We are rounding up the start address of usable memory:
-		 */
-		curr_pfn = PFN_UP(e820.map[i].addr);
-		if (curr_pfn >= max_low_pfn)
-			continue;
-		/*
-		 * ... and at the end of the usable range downwards:
-		 */
-		last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
-		if (last_pfn > max_low_pfn)
-			last_pfn = max_low_pfn;
-
-		/*
-		 * .. finally, did all the rounding and playing
-		 * around just make the area go away?
- */ - if (last_pfn <= curr_pfn) - continue; - - size = last_pfn - curr_pfn; - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); - } -} - void __init limit_regions(unsigned long long size) { unsigned long long current_addr; @@ -360,8 +297,9 @@ static int __init parse_memmap(char *arg) * size before original memory map is * reset. */ - find_max_pfn(); - saved_max_pfn = max_pfn; + e820_register_active_regions(0, 0, -1UL); + saved_max_pfn = e820_end_of_ram(); + remove_all_active_ranges(); #endif e820.nr_map = 0; user_defined_memmap = 1; diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 922be66668b8..27b908254fb0 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -32,6 +32,7 @@ #include #include #include +#include #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) @@ -61,6 +62,8 @@ static void __init smp_dump_qct(void) node_end_pfn[node] = MB_TO_PAGES( eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); + e820_register_active_regions(node, node_start_pfn[node], + node_end_pfn[node]); memory_present(node, node_start_pfn[node], node_end_pfn[node]); node_remap_size[node] = node_memmap_size_bytes(node, diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index ccf3595202fa..0ec6480aaa27 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -405,11 +405,12 @@ static void __init zone_sizes_init(void) max_zone_pfns[ZONE_DMA] = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + remove_all_active_ranges(); #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; - add_active_range(0, 0, highend_pfn); + e820_register_active_regions(0, 0, highend_pfn); #else - add_active_range(0, 0, max_low_pfn); + e820_register_active_regions(0, 0, max_low_pfn); #endif free_area_init_nodes(max_zone_pfns); @@ -582,6 +583,7 @@ static void __init relocate_initrd(void) void __init setup_bootmem_allocator(void) { + int i; unsigned long bootmap_size, bootmap; /* * Initialize the boot-time allocator (with low memory only): @@ -603,7 +605,8 @@ void __init setup_bootmem_allocator(void) min_low_pfn< #include #include +#include /* * proximity macros and definitions @@ -244,7 +245,8 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, chunk->nid, chunk->start_pfn, chunk->end_pfn); node_read_chunk(chunk->nid, chunk); - add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn); + e820_register_active_regions(chunk->nid, chunk->start_pfn, + min(chunk->end_pfn, max_pfn)); } for_each_online_node(nid) { diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 7ced26ab9aec..a89ccf3d4c14 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -120,10 +120,9 @@ int __init get_memcfg_numa_flat(void) { printk("NUMA - single node, flat memory mode\n"); - /* Run the memory configuration and find the top of memory. */ - find_max_pfn(); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; + e820_register_active_regions(0, 0, max_pfn); memory_present(0, 0, max_pfn); node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); @@ -337,6 +336,11 @@ unsigned long __init setup_memory(void) * this space and use it to adjust the boundary between ZONE_NORMAL * and ZONE_HIGHMEM. 
*/ + + /* call find_max_low_pfn at first, it could update max_pfn */ + system_max_low_pfn = max_low_pfn = find_max_low_pfn(); + + remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); @@ -344,7 +348,6 @@ unsigned long __init setup_memory(void) /* partially used pages are not usable - thus round upwards */ system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - system_max_low_pfn = max_low_pfn = find_max_low_pfn(); kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); do { kva_start_pfn = find_e820_area(kva_target_pfn< Date: Sun, 8 Jun 2008 19:39:16 -0700 Subject: mm, x86: shrink_active_range() should check all Now we are using register_e820_active_regions() instead of add_active_range() directly. So end_pfn could be different between the value in early_node_map to node_end_pfn. So we need to make shrink_active_range() smarter. shrink_active_range() is a generic MM function in mm/page_alloc.c but it is only used on 32-bit x86. Should we move it back to some file in arch/x86? Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 2 +- include/linux/mm.h | 3 +-- mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 13 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index a89ccf3d4c14..489605bab85a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -282,7 +282,7 @@ static unsigned long calculate_numa_remap_pages(void) node_end_pfn[nid] -= size; node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); + shrink_active_range(nid, node_end_pfn[nid]); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index c31a9cd2a30e..7cbd949f2516 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -997,8 +997,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn); +extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 502223c3c2c6..215408684076 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3579,25 +3579,49 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, /** * shrink_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @old_end_pfn: The old end PFN of the range * @new_end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. - * The map is kept at the end physical page range that has already been - * registered with add_active_range(). This function allows an arch to shrink - * an existing registered range. + * The map is kept near the end physical page range that has already been + * registered. This function allows an arch to shrink an existing registered + * range. 
 */
-void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
-						unsigned long new_end_pfn)
+void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn)
 {
-	int i;
+	int i, j;
+	int removed = 0;

 	/* Find the old active region end and shrink */
-	for_each_active_range_index_in_nid(i, nid)
-		if (early_node_map[i].end_pfn == old_end_pfn) {
+	for_each_active_range_index_in_nid(i, nid) {
+		if (early_node_map[i].start_pfn >= new_end_pfn) {
+			/* clear it */
+			early_node_map[i].end_pfn = 0;
+			removed = 1;
+			continue;
+		}
+		if (early_node_map[i].end_pfn > new_end_pfn) {
 			early_node_map[i].end_pfn = new_end_pfn;
-			break;
+			continue;
 		}
+	}
+
+	if (!removed)
+		return;
+
+	/* remove the blank ones */
+	for (i = nr_nodemap_entries - 1; i > 0; i--) {
+		if (early_node_map[i].nid != nid)
+			continue;
+		if (early_node_map[i].end_pfn)
+			continue;
+		/* we found it, get rid of it */
+		for (j = i; j < nr_nodemap_entries - 1; j++)
+			memcpy(&early_node_map[j], &early_node_map[j+1],
+				sizeof(early_node_map[j]));
+		j = nr_nodemap_entries - 1;
+		memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
+		nr_nodemap_entries--;
+	}
 }

 /**
-- cgit

From 9043f007963f4039befa3c31f47173f74a0b1c70 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Fri, 6 Jun 2008 18:53:33 -0700
Subject: x86, numa, 32-bit: use find_e820_area() to find KVA RAM on node

Don't assume we can use the RAM near the end of every node.  Especially on systems with little memory, the kva address range and the kva RAM can both end up below max_low_pfn.

Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 59 ++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 26 deletions(-)

(limited to 'arch/x86/mm/discontig_32.c')

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 489605bab85a..accc7c6c57fc 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -228,17 +228,21 @@ static unsigned long calculate_numa_remap_pages(void)
 {
 	int nid;
 	unsigned long size, reserve_pages = 0;
-	unsigned long pfn;

 	for_each_online_node(nid) {
-		unsigned old_end_pfn = node_end_pfn[nid];
+		u64 node_end_target;
+		u64 node_end_final;

 		/*
 		 * The acpi/srat node info can show hot-add memroy zones
 		 * where memory could be added but not currently present.
 		 */
+		printk("node %d pfn: [%lx - %lx]\n",
+			nid, node_start_pfn[nid], node_end_pfn[nid]);
 		if (node_start_pfn[nid] > max_pfn)
 			continue;
+		if (!node_end_pfn[nid])
+			continue;
 		if (node_end_pfn[nid] > max_pfn)
 			node_end_pfn[nid] = max_pfn;
@@ -250,37 +254,40 @@ static unsigned long calculate_numa_remap_pages(void)
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
 		size = size * PTRS_PER_PTE;

-		/*
-		 * Validate the region we are allocating only contains valid
-		 * pages.
-		 */
-		for (pfn = node_end_pfn[nid] - size;
-		     pfn < node_end_pfn[nid]; pfn++)
-			if (!page_is_ram(pfn))
-				break;
-
-		if (pfn != node_end_pfn[nid])
-			size = 0;
+		node_end_target = round_down(node_end_pfn[nid] - size,
+						PTRS_PER_PTE);
+		node_end_target <<= PAGE_SHIFT;
+		do {
+			node_end_final = find_e820_area(node_end_target,
+					((u64)node_end_pfn[nid])<<PAGE_SHIFT,
+					((u64)size)<<PAGE_SHIFT,
+					LARGE_PAGE_BYTES);
+			node_end_target -= LARGE_PAGE_BYTES;
+		} while (node_end_final == -1ULL &&
+			 (node_end_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+
+		if (node_end_final == -1ULL)
+			panic("Can not get kva ram\n");

 		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
 				size, nid);
 		node_remap_size[nid] = size;
 		node_remap_offset[nid] = reserve_pages;
 		reserve_pages += size;
-		printk("Shrinking node %d from %ld pages to %ld pages\n",
-			nid, node_end_pfn[nid], node_end_pfn[nid] - size);
-
-		if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
-			/*
-			 * Align node_end_pfn[] and node_remap_start_pfn[] to
-			 * pmd boundary. remap_numa_kva will barf otherwise.
-			 */
-			printk("Shrinking node %d further by %ld pages for proper alignment\n",
-				nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
-			size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
-		}
+		printk("Shrinking node %d from %ld pages to %lld pages\n",
+			nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT);
+
+		/*
+		 * prevent kva address below max_low_pfn want it on system
+		 * with less memory later.
+		 * layout will be: KVA address , KVA RAM
+		 */
+		if ((node_end_final>>PAGE_SHIFT) < max_low_pfn)
+			reserve_early(node_end_final,
+				      node_end_final+(((u64)size)<<PAGE_SHIFT),
+				      "KVA RAM");
+
+		node_end_pfn[nid] = node_end_final>>PAGE_SHIFT;
 		node_remap_start_pfn[nid] = node_end_pfn[nid];
 		shrink_active_range(nid, node_end_pfn[nid]);
 	}
-- cgit

From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Fri, 13 Jun 2008 19:08:52 -0700
Subject: x86: replace shrink_active_range() with remove_active_range()

In case we have kva space before the ramdisk on a node, we still need to use those ranges.

v2: reserve_early the kva RAM area, in case there are holes in highmem, to avoid those areas being treated as free high pages.
Signed-off-by: Yinghai Lu
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/discontig_32.c | 45 ++++++++++++++++++++++++---------------------
 include/linux/mm.h         |  3 ++-
 mm/page_alloc.c            | 29 +++++++++++++++++++++++------
 3 files changed, 49 insertions(+), 28 deletions(-)

(limited to 'arch/x86/mm/discontig_32.c')

diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index accc7c6c57fc..c3f119e99e0d 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -230,8 +230,8 @@ static unsigned long calculate_numa_remap_pages(void)
 	unsigned long size, reserve_pages = 0;

 	for_each_online_node(nid) {
-		u64 node_end_target;
-		u64 node_end_final;
+		u64 node_kva_target;
+		u64 node_kva_final;

 		/*
 		 * The acpi/srat node info can show hot-add memroy zones
@@ -254,42 +254,45 @@ static unsigned long calculate_numa_remap_pages(void)
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
 		size = size * PTRS_PER_PTE;

-		node_end_target = round_down(node_end_pfn[nid] - size,
+		node_kva_target = round_down(node_end_pfn[nid] - size,
 						PTRS_PER_PTE);
-		node_end_target <<= PAGE_SHIFT;
+		node_kva_target <<= PAGE_SHIFT;
 		do {
-			node_end_final = find_e820_area(node_end_target,
+			node_kva_final = find_e820_area(node_kva_target,
 					((u64)node_end_pfn[nid])<<PAGE_SHIFT,
 					((u64)size)<<PAGE_SHIFT,
 					LARGE_PAGE_BYTES);
-			node_end_target -= LARGE_PAGE_BYTES;
-		} while (node_end_final == -1ULL &&
-			 (node_end_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+			node_kva_target -= LARGE_PAGE_BYTES;
+		} while (node_kva_final == -1ULL &&
+			 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));

-		if (node_end_final == -1ULL)
+		if (node_kva_final == -1ULL)
 			panic("Can not get kva ram\n");

-		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
-				size, nid);
 		node_remap_size[nid] = size;
 		node_remap_offset[nid] = reserve_pages;
 		reserve_pages += size;
-		printk("Shrinking node %d from %ld pages to %lld pages\n",
-			nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT);
+		printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n",
+				size, nid, node_kva_final>>PAGE_SHIFT);

 		/*
 		 * prevent kva address below max_low_pfn want it on system
 		 * with less memory later.
 		 * layout will be: KVA address , KVA RAM
+		 *
+		 * we are supposed to only record the one less then max_low_pfn
+		 * but we could have some hole in high memory, and it will only
+		 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
+		 * to use it as free.
+		 * So reserve_early here, hope we don't run out of that array
 		 */
-		if ((node_end_final>>PAGE_SHIFT) < max_low_pfn)
-			reserve_early(node_end_final,
-				      node_end_final+(((u64)size)<<PAGE_SHIFT),
-				      "KVA RAM");
-
-		node_end_pfn[nid] = node_end_final>>PAGE_SHIFT;
-		node_remap_start_pfn[nid] = node_end_pfn[nid];
-		shrink_active_range(nid, node_end_pfn[nid]);
+		reserve_early(node_kva_final,
+			      node_kva_final+(((u64)size)<<PAGE_SHIFT),
+			      "KVA RAM");
+
+		node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
+		remove_active_range(nid, node_remap_start_pfn[nid],
+					 node_remap_start_pfn[nid] + size);
 	}
 	printk("Reserving total of %ld pages for numa KVA remap\n",
 			reserve_pages);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ce8e397a61f6..034a3156d2f0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -998,7 +998,8 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat,
 extern void free_area_init_nodes(unsigned long *max_zone_pfn);
 extern void add_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
-extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn);
+extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
+					unsigned long end_pfn);
 extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eee5ba7509c1..d80e1868e570 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3552,30 +3552,47 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 }

 /**
- * shrink_active_range - Shrink an existing registered range of PFNs
+ * remove_active_range - Shrink an existing registered range of PFNs
  * @nid: The node id the range is on that should be shrunk
- * @new_end_pfn: The new PFN of the range
+ * @start_pfn: The new PFN of the range
+ * @end_pfn: The new PFN of the range
  *
  * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
  * The map is kept near the end physical page range that has already been
  * registered. This function allows an arch to shrink an existing registered
  * range.
*/ -void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) { int i, j; int removed = 0; + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + /* Find the old active region end and shrink */ for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= new_end_pfn) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { /* clear it */ + early_node_map[i].start_pfn = 0; early_node_map[i].end_pfn = 0; removed = 1; continue; } - if (early_node_map[i].end_pfn > new_end_pfn) { - early_node_map[i].end_pfn = new_end_pfn; + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; continue; } } -- cgit From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() in the big loop with add_one_highpage also remove page_is_reserved_early(), it is not needed anymore. v2: fix the build of other platforms Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 -------- arch/x86/mm/discontig_32.c | 19 ++++++-------- arch/x86/mm/init_32.c | 62 ++++++++++++++++++++++++++++++++++++++-------- include/asm-x86/e820.h | 1 - include/asm-x86/highmem.h | 3 +++ include/linux/mm.h | 2 ++ mm/page_alloc.c | 8 ++++++ 7 files changed, 71 insertions(+), 35 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5051ce744b4e..ed46b7a6bc13 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -612,17 +612,6 @@ void __init free_early(u64 start, u64 end) early_res[j - 1].end = 0; } -int __init page_is_reserved_early(unsigned long pagenr) -{ - u64 start = (u64)pagenr << PAGE_SHIFT; - int i; - struct early_res *r; - - i = find_overlapped_early(start, start + PAGE_SIZE); - r = &early_res[i]; - return (i < MAX_EARLY_RES && r->end); -} - void __init early_res_to_bootmem(u64 start, u64 end) { int i; diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c3f119e99e0d..7c4d0255f8d8 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, #endif extern unsigned long find_max_low_pfn(void); -extern void add_one_highpage_init(struct page *, int, int); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro) { #ifdef CONFIG_HIGHMEM struct zone *zone; - struct page *page; + int nid; for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; if (!is_highmem(zone)) continue; @@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro) zone_start_pfn = zone->zone_start_pfn; 
zone_end_pfn = zone_start_pfn + zone->spanned_pages; + nid = zone_to_nid(zone); printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone_to_nid(zone), - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); - } + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn, bad_ppro); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index abadb1da70df..ba07a489230e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -287,10 +287,10 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init +add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) && - !page_is_reserved_early(pfn)) { + if (!(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); init_page_count(page); __free_page(page); @@ -299,18 +299,58 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } +struct add_highpages_data { + unsigned long start_pfn; + unsigned long end_pfn; + int bad_ppro; +}; + +static void __init add_highpages_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + int node_pfn; + struct page *page; + unsigned long final_start_pfn, final_end_pfn; + struct add_highpages_data *data; + int bad_ppro; + + data = (struct add_highpages_data *)datax; + bad_ppro = data->bad_ppro; + + final_start_pfn = max(start_pfn, data->start_pfn); + final_end_pfn = min(end_pfn, data->end_pfn); + if (final_start_pfn >= final_end_pfn) + return; + + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page, node_pfn, bad_ppro); + } + +} + +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, + int bad_ppro) +{ + struct add_highpages_data data; + + data.start_pfn = start_pfn; + data.end_pfn = end_pfn; + data.bad_ppro = bad_ppro; + + work_with_active_regions(nid, add_highpages_work_fn, &data); +} + #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { - int pfn; + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, + bad_ppro); - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { - /* - * Holes under sparsemem might not have no mem_map[]: - */ - if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); - } totalram_pages += totalhigh_pages; } #endif /* !CONFIG_NUMA */ diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 6b0ce745a60c..55d310596907 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -86,7 +86,6 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); -extern int page_is_reserved_early(unsigned long pagenr); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); extern unsigned long e820_end_of_ram(void); diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index e153f3b44774..85c4fea41ff6 100644 
--- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, int bad_ppro); + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 034a3156d2f0..e4de460907c1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,6 +1011,8 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); +typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID extern int early_pfn_to_nid(unsigned long pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d80e1868e570..41c6e3aa059f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid, } } +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, + data); +} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit From cc9f7a0ccf000d4db5fbdc7b0ae48eefea102f69 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 16:11:08 -0700 Subject: x86: kill bad_ppro so don't punish all other cpus without that problem when init highmem Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_32.c | 9 +++++++++ arch/x86/mm/discontig_32.c | 4 ++-- arch/x86/mm/init_32.c | 43 ++++++++++++------------------------------- include/asm-x86/highmem.h | 2 +- include/asm-x86/numa_32.h | 2 +- 5 files changed, 25 insertions(+), 35 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index f3ddba5ed9a7..9692aeb8ecae 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -68,6 +68,7 @@ #include #include #include +#include /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. 
It contains a *physical* @@ -764,6 +765,14 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_init(); + if (ppro_with_ram_bug()) { + e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, + E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + printk(KERN_INFO "fixed physical RAM map:\n"); + e820_print_map("bad_ppro"); + } + e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 7c4d0255f8d8..6216e43b6e95 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -427,7 +427,7 @@ void __init zone_sizes_init(void) return; } -void __init set_highmem_pages_init(int bad_ppro) +void __init set_highmem_pages_init(void) { #ifdef CONFIG_HIGHMEM struct zone *zone; @@ -447,7 +447,7 @@ void __init set_highmem_pages_init(int bad_ppro) zone->name, nid, zone_start_pfn, zone_end_pfn); add_highpages_with_active_regions(nid, zone_start_pfn, - zone_end_pfn, bad_ppro); + zone_end_pfn); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ba07a489230e..fb5694d788bf 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -220,13 +220,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) } } -static inline int page_kills_ppro(unsigned long pagenr) -{ - if (pagenr >= 0x70000 && pagenr <= 0x7003F) - return 1; - return 0; -} - /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. @@ -287,22 +280,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __init -add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init add_one_highpage_init(struct page *page, int pfn) { - if (!(bad_ppro && page_kills_ppro(pfn))) { - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalhigh_pages++; - } else - SetPageReserved(page); + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + totalhigh_pages++; } struct add_highpages_data { unsigned long start_pfn; unsigned long end_pfn; - int bad_ppro; }; static void __init add_highpages_work_fn(unsigned long start_pfn, @@ -312,10 +300,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, struct page *page; unsigned long final_start_pfn, final_end_pfn; struct add_highpages_data *data; - int bad_ppro; data = (struct add_highpages_data *)datax; - bad_ppro = data->bad_ppro; final_start_pfn = max(start_pfn, data->start_pfn); final_end_pfn = min(end_pfn, data->end_pfn); @@ -327,29 +313,26 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); + add_one_highpage_init(page, node_pfn); } } void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn, - int bad_ppro) + unsigned long end_pfn) { struct add_highpages_data data; data.start_pfn = start_pfn; data.end_pfn = end_pfn; - data.bad_ppro = bad_ppro; work_with_active_regions(nid, add_highpages_work_fn, &data); } #ifndef CONFIG_NUMA -static void __init set_highmem_pages_init(int bad_ppro) +static void __init set_highmem_pages_init(void) { - add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, - bad_ppro); + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); totalram_pages += totalhigh_pages; } 
@@ -358,7 +341,7 @@ static void __init set_highmem_pages_init(int bad_ppro) #else # define kmap_init() do { } while (0) # define permanent_kmaps_init(pgd_base) do { } while (0) -# define set_highmem_pages_init(bad_ppro) do { } while (0) +# define set_highmem_pages_init() do { } while (0) #endif /* CONFIG_HIGHMEM */ pteval_t __PAGE_KERNEL = _PAGE_KERNEL; @@ -605,13 +588,11 @@ static struct kcore_list kcore_mem, kcore_vmalloc; void __init mem_init(void) { int codesize, reservedpages, datasize, initsize; - int tmp, bad_ppro; + int tmp; #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif - bad_ppro = ppro_with_ram_bug(); - #ifdef CONFIG_HIGHMEM /* check that fixmap and pkmap do not overlap */ if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { @@ -634,7 +615,7 @@ void __init mem_init(void) if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; - set_highmem_pages_init(bad_ppro); + set_highmem_pages_init(); codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index 85c4fea41ff6..4514b16cc723 100644 --- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -75,7 +75,7 @@ struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn, int bad_ppro); + unsigned long end_pfn); #endif /* __KERNEL__ */ diff --git a/include/asm-x86/numa_32.h b/include/asm-x86/numa_32.h index 03d0f7a9bf02..a02674f64869 100644 --- a/include/asm-x86/numa_32.h +++ b/include/asm-x86/numa_32.h @@ -5,7 +5,7 @@ extern int pxm_to_nid(int pxm); #ifdef CONFIG_NUMA extern void __init remap_numa_kva(void); -extern void set_highmem_pages_init(int); +extern void set_highmem_pages_init(void); #else static inline void remap_numa_kva(void) { -- cgit From b2ac82a0909aea0d2620ba4c189f37c567c21fe5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 22 Jun 2008 02:45:39 -0700 Subject: x86: introduce initmem_init for 32 bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_32.c | 94 +--------------------------------------------- arch/x86/mm/discontig_32.c | 4 +- arch/x86/mm/init_32.c | 90 ++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/page_32.h | 5 +++ 4 files changed, 99 insertions(+), 94 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index d24ac268523f..190546bd3bd3 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -300,71 +300,11 @@ unsigned long __init find_max_low_pfn(void) return max_low_pfn; } -#ifndef CONFIG_NEED_MULTIPLE_NODES -static void __init setup_bootmem_allocator(void); -static unsigned long __init setup_memory(void) -{ - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - min_low_pfn = PFN_UP(init_pg_tables_end); - - max_low_pfn = find_max_low_pfn(); - -#ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > max_low_pfn) { - highstart_pfn = max_low_pfn; - } - memory_present(0, 0, highend_pfn); - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); - num_physpages = highend_pfn; - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - memory_present(0, 0, max_low_pfn); - num_physpages = max_low_pfn; - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif 
-#ifdef CONFIG_FLATMEM - max_mapnr = num_physpages; -#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); - - setup_bootmem_allocator(); - - return max_low_pfn; -} - -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - remove_all_active_ranges(); -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; - e820_register_active_regions(0, 0, highend_pfn); -#else - e820_register_active_regions(0, 0, max_low_pfn); -#endif - - free_area_init_nodes(max_zone_pfns); -} -#else -extern unsigned long __init setup_memory(void); -extern void zone_sizes_init(void); -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ - #ifdef CONFIG_BLK_DEV_INITRD static bool do_relocate_initrd = false; -static void __init reserve_initrd(void) +void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; @@ -480,36 +420,6 @@ static void __init relocate_initrd(void) #endif /* CONFIG_BLK_DEV_INITRD */ -void __init setup_bootmem_allocator(void) -{ - int i; - unsigned long bootmap_size, bootmap; - /* - * Initialize the boot-time allocator (with low memory only): - */ - bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; - bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, max_pfn_mapped<<PAGE_SHIFT, - bootmap_size, PAGE_SIZE); - if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n", bootmap_size); - reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); - - reserve_initrd(); - - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); - printk(KERN_INFO " mapped low ram: 0 - %08lx\n", - max_pfn_mapped<<PAGE_SHIFT); [...] /* * The node 0 pgdat is initialized before all of these because * it's needed for bootmem. node>0 pgdats have their virtual @@ -666,7 +576,7 @@ void __init setup_arch(char **cmdline_p) acpi_numa_init(); #endif - max_low_pfn = setup_memory(); + max_low_pfn = initmem_init(0, max_pfn); #ifdef CONFIG_ACPI_SLEEP /* diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index a2f73ba42b8b..3e75be46c4f2 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -309,8 +309,8 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -extern void setup_bootmem_allocator(void); -unsigned long __init setup_memory(void) +unsigned long __init initmem_init(unsigned long start_pfn, + unsigned long end_pfn) { int nid; unsigned long system_start_pfn, system_max_low_pfn; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a0484adbf59d..9bc8607d7980 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -540,6 +540,96 @@ static void __init set_nx(void) } #endif +#ifndef CONFIG_NEED_MULTIPLE_NODES +extern unsigned long find_max_low_pfn(void); +unsigned long __init initmem_init(unsigned long start_pfn, + unsigned long end_pfn) +{ + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + min_low_pfn = PFN_UP(init_pg_tables_end); + + max_low_pfn = find_max_low_pfn(); + +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; + memory_present(0, 0, highend_pfn); + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + memory_present(0, 0, max_low_pfn); + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif +#ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); + + return max_low_pfn; +} + +void __init zone_sizes_init(void) +{ + unsigned long
max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + remove_all_active_ranges(); +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; + e820_register_active_regions(0, 0, highend_pfn); +#else + e820_register_active_regions(0, 0, max_low_pfn); +#endif + + free_area_init_nodes(max_zone_pfns); +} +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ + +extern void reserve_initrd(void); + +void __init setup_bootmem_allocator(void) +{ + int i; + unsigned long bootmap_size, bootmap; + /* + * Initialize the boot-time allocator (with low memory only): + */ + bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; + bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, max_pfn_mapped<<PAGE_SHIFT, + bootmap_size, PAGE_SIZE); + if (bootmap == -1L) + panic("Cannot find bootmem map of size %ld\n", bootmap_size); + reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); + + reserve_initrd(); + + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); + printk(KERN_INFO " mapped low ram: 0 - %08lx\n", + max_pfn_mapped<<PAGE_SHIFT); [...] -- cgit From 2ec65f8b89ea003c27ff7723525a2ee335a2b393 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 23 Jun 2008 03:05:30 -0700 Subject: x86: clean up using max_low_pfn on 32-bit Ensure that max_low_pfn is not changed after it is set, so we can move it early and out of initmem_init: find_low_pfn_range can be called just after max_pfn is set. Also move reserve_initrd out of setup_bootmem_allocator, so 32-bit is more like 64-bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_32.c | 16 ++++++++++------ arch/x86/mm/discontig_32.c | 22 +++++++--------------- arch/x86/mm/init_32.c | 25 +++++++++---------------- include/asm-x86/page_32.h | 3 ++- include/asm-x86/setup.h | 2 -- 5 files changed, 28 insertions(+), 40 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 9a08490a3889..b42f570a5a56 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -188,13 +188,14 @@ static inline void copy_edd(void) static bool do_relocate_initrd = false; -void __init reserve_initrd(void) +static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = ramdisk_image + ramdisk_size; u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; u64 ramdisk_here; + u64 ramdisk_target; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -202,7 +203,7 @@ void __init reserve_initrd(void) initrd_start = 0; - if (ramdisk_size >= end_of_lowmem/2) { + if (ramdisk_size >= (end_of_lowmem>>1)) { free_early(ramdisk_image, ramdisk_end); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); @@ -225,7 +226,8 @@ void __init reserve_initrd(void) } /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(min_low_pfn<<PAGE_SHIFT, + ramdisk_target = max_pfn_mapped<<PAGE_SHIFT; + ramdisk_here = find_e820_area(min(ramdisk_target, end_of_lowmem>>1), end_of_lowmem, ramdisk_size, PAGE_SIZE); @@ -346,8 +348,6 @@ static void set_mca_bus(int x) { } */ void __init setup_arch(char **cmdline_p) { - unsigned long max_low_pfn; - memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); pre_setup_arch_hook(); early_cpu_init(); @@ -450,6 +450,10 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820_end_of_ram(); } + find_low_pfn_range(); + + reserve_initrd(); + dmi_scan_machine(); io_delay_init(); @@ -466,7 +470,7 @@ void __init setup_arch(char **cmdline_p) acpi_numa_init(); #endif - max_low_pfn = initmem_init(0, max_pfn); + initmem_init(0, max_pfn); #ifdef CONFIG_ACPI_SLEEP /* diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 3e75be46c4f2..1dfff700264c 100644 --- a/arch/x86/mm/discontig_32.c +++ 
b/arch/x86/mm/discontig_32.c @@ -309,11 +309,10 @@ static void init_remap_allocator(int nid) (ulong) node_remap_end_vaddr[nid]); } -unsigned long __init initmem_init(unsigned long start_pfn, +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int nid; - unsigned long system_start_pfn, system_max_low_pfn; long kva_target_pfn; /* @@ -324,17 +323,11 @@ unsigned long __init initmem_init(unsigned long start_pfn, * and ZONE_HIGHMEM. */ - /* call find_max_low_pfn at first, it could update max_pfn */ - system_max_low_pfn = max_low_pfn = find_max_low_pfn(); - remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); - /* partially used pages are not usable - thus round upwards */ - system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); do { kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT, kva_pages<<PAGE_SHIFT, PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; kva_target_pfn -= PTRS_PER_PTE; } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); [...] #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > system_max_low_pfn) - highstart_pfn = system_max_low_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - num_physpages = system_max_low_pfn; - high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(system_max_low_pfn)); - printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", + pages_to_mb(max_low_pfn)); + printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", min_low_pfn, max_low_pfn, highstart_pfn); printk("Low memory ends at vaddr %08lx\n", @@ -387,7 +380,6 @@ unsigned long __init initmem_init(unsigned long start_pfn, memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); NODE_DATA(0)->bdata = &node0_bdata; setup_bootmem_allocator(); - return max_low_pfn; } void __init zone_sizes_init(void) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d1017336f1b5..27b829312944 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -561,9 +561,15 @@ early_param("highmem", parse_highmem); /* * Determine low and high memory ranges: */ -unsigned long __init find_max_low_pfn(void) +void __init find_low_pfn_range(void) { - unsigned long max_low_pfn; + /* it could update max_pfn */ + + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + min_low_pfn = PFN_UP(init_pg_tables_end); max_low_pfn = max_pfn; if (max_low_pfn > MAXMEM_PFN) { @@ -625,21 +631,12 @@ unsigned long __init find_max_low_pfn(void) " kernel!\n"); #endif } - return max_low_pfn; } #ifndef CONFIG_NEED_MULTIPLE_NODES -unsigned long __init initmem_init(unsigned long start_pfn, +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - min_low_pfn = PFN_UP(init_pg_tables_end); - - max_low_pfn = find_max_low_pfn(); - #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) @@ -661,8 +658,6 @@ unsigned long __init initmem_init(unsigned long start_pfn, pages_to_mb(max_low_pfn)); setup_bootmem_allocator(); - - return max_low_pfn; } void __init zone_sizes_init(void) @@ -699,8 +694,6 @@ void __init setup_bootmem_allocator(void) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + 
bootmap_size, "BOOTMAP"); - reserve_initrd(); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn); printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); [...] void reserve_standard_io_resources(void); -void reserve_initrd(void); - #ifndef _SETUP -- cgit From c09434571d4b1d8abf530ba4ce28cb868b45f2e5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 23 Jun 2008 16:41:30 -0700 Subject: x86: numa32 pfn print out using hex instead Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/srat_32.c | 31 +++++++++++++++++++------------ arch/x86/mm/discontig_32.c | 29 +++++++++++++++-------------- 2 files changed, 34 insertions(+), 26 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 5978023b799b..f41d67f8f831 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -93,7 +93,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; - printk("CPU 0x%02X in proximity domain 0x%02X\n", + printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); } @@ -134,7 +134,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) if (num_memory_chunks >= MAXCHUNKS) { - printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n", + printk(KERN_WARNING "Too many mem chunks in SRAT." + " Ignoring %lld MBytes at %llx\n", size/(1024*1024), paddr); return; } @@ -155,7 +156,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) num_memory_chunks++; - printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n", + printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)" + " in proximity domain %02x %s\n", start_pfn, end_pfn, memory_affinity->memory_type, pxm, @@ -186,7 +188,7 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c * *possible* memory hotplug areas the same as normal RAM.
*/ if (memory_chunk->start_pfn >= max_pfn) { - printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n", + printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", memory_chunk->start_pfn, memory_chunk->end_pfn); return; } @@ -212,7 +214,8 @@ int __init get_memcfg_from_srat(void) goto out_fail; if (num_memory_chunks == 0) { - printk("could not finy any ACPI SRAT memory areas.\n"); + printk(KERN_WARNING + "could not finy any ACPI SRAT memory areas.\n"); goto out_fail; } @@ -239,20 +242,23 @@ int __init get_memcfg_from_srat(void) for (i = 0; i < num_memory_chunks; i++) node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); - printk("pxm bitmap: "); + printk(KERN_DEBUG "pxm bitmap: "); for (i = 0; i < sizeof(pxm_bitmap); i++) { - printk("%02X ", pxm_bitmap[i]); + printk(KERN_CONT "%02x ", pxm_bitmap[i]); } - printk("\n"); - printk("Number of logical nodes in system = %d\n", num_online_nodes()); - printk("Number of memory chunks in system = %d\n", num_memory_chunks); + printk(KERN_CONT "\n"); + printk(KERN_DEBUG "Number of logical nodes in system = %d\n", + num_online_nodes()); + printk(KERN_DEBUG "Number of memory chunks in system = %d\n", + num_memory_chunks); for (i = 0; i < MAX_APICID; i++) apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); for (j = 0; j < num_memory_chunks; j++){ struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; - printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", + printk(KERN_DEBUG + "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, chunk->nid, chunk->start_pfn, chunk->end_pfn); node_read_chunk(chunk->nid, chunk); e820_register_active_regions(chunk->nid, chunk->start_pfn, @@ -268,6 +274,7 @@ int __init get_memcfg_from_srat(void) } return 1; out_fail: - printk("failed to get NUMA memory information from SRAT table\n"); + printk(KERN_ERR "failed to get NUMA memory information from SRAT" + " table\n"); return 0; } diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 1dfff700264c..f5ae31935ca8 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -76,13 +76,13 @@ void memory_present(int nid, unsigned long start, unsigned long end) { unsigned long pfn; - printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", + printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", nid, start, end); printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); printk(KERN_DEBUG " "); for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { physnode_map[pfn / PAGES_PER_ELEMENT] = nid; - printk(KERN_CONT "%ld ", pfn); + printk(KERN_CONT "%lx ", pfn); } printk(KERN_CONT "\n"); } @@ -117,7 +117,7 @@ static unsigned long kva_pages; */ int __init get_memcfg_numa_flat(void) { - printk("NUMA - single node, flat memory mode\n"); + printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; @@ -233,7 +233,7 @@ static unsigned long calculate_numa_remap_pages(void) * The acpi/srat node info can show hot-add memroy zones * where memory could be added but not currently present. 
*/ - printk("node %d pfn: [%lx - %lx]\n", + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", nid, node_start_pfn[nid], node_end_pfn[nid]); if (node_start_pfn[nid] > max_pfn) continue; @@ -268,7 +268,8 @@ static unsigned long calculate_numa_remap_pages(void) node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" + " node %d at %llx\n", size, nid, node_kva_final>>PAGE_SHIFT); /* @@ -290,7 +291,7 @@ static unsigned long calculate_numa_remap_pages(void) remove_active_range(nid, node_remap_start_pfn[nid], node_remap_start_pfn[nid] + size); } - printk("Reserving total of %ld pages for numa KVA remap\n", + printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", reserve_pages); return reserve_pages; } @@ -304,7 +305,7 @@ static void init_remap_allocator(int nid) node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + ALIGN(sizeof(pg_data_t), PAGE_SIZE); - printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, + printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, (ulong) node_remap_start_vaddr[nid], (ulong) node_remap_end_vaddr[nid]); } @@ -340,9 +341,9 @@ void __init initmem_init(unsigned long start_pfn, if (kva_start_pfn == -1UL) panic("Can not get kva space\n"); - printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", + printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", kva_start_pfn, max_low_pfn); - printk("max_pfn = %ld\n", max_pfn); + printk(KERN_INFO "max_pfn = %lx\n", max_pfn); /* avoid clash with initrd */ reserve_early(kva_start_pfn<<PAGE_SHIFT, (kva_start_pfn + kva_pages)<<PAGE_SHIFT, "KVA PG"); [...] zone_end_pfn = zone_start_pfn + zone->spanned_pages; nid = zone_to_nid(zone); - printk("Initializing %s for node %d (%08lx:%08lx)\n", + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", zone->name, nid, zone_start_pfn, zone_end_pfn); add_highpages_with_active_regions(nid, zone_start_pfn, -- cgit From 3a58a2a6c879b2e47daafd6e641661c50ac9da5a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 24 Jun 2008 12:19:41 -0700 Subject: x86: introduce init_memory_mapping for 32bit #3 Move the KVA-related early setup back into initmem_init for numa32. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 8 ++++++-- arch/x86/mm/init_32.c | 20 -------------------- include/asm-x86/numa_32.h | 5 ----- 3 files changed, 6 insertions(+), 27 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index f5ae31935ca8..42484d446d06 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -200,7 +200,7 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -void __init remap_numa_kva(void) +static void __init remap_numa_kva(void) { void *vaddr; unsigned long pfn; @@ -373,12 +373,16 @@ void __init initmem_init(unsigned long start_pfn, allocate_pgdat(nid); } + remap_numa_kva(); + printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); for_each_online_node(nid) propagate_e820_map_node(nid); - memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); + for_each_online_node(nid) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + NODE_DATA(0)->bdata = &node0_bdata; setup_bootmem_allocator(); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ecccb055b085..c059c460e32d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -449,7 +449,6 @@ static void __init pagetable_init(void)
paravirt_pagetable_setup_start(pgd_base); - remap_numa_kva(); /* * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): @@ -724,24 +723,6 @@ void __init setup_bootmem_allocator(void) after_init_bootmem = 1; } -/* - * The node 0 pgdat is initialized before all of these because - * it's needed for bootmem. node>0 pgdats have their virtual - * space allocated before the pagetables are in place to access - * them, so they can't be cleared then. - * - * This should all compile down to nothing when NUMA is off. - */ -static void __init remapped_pgdat_init(void) -{ - int nid; - - for_each_online_node(nid) { - if (nid != 0) - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - } -} - static void __init find_early_table_space(unsigned long end) { unsigned long puds, pmds, tables, start; @@ -831,7 +812,6 @@ void __init paging_init(void) /* * NOTE: at this point the bootmem allocator is fully available. */ - remapped_pgdat_init(); sparse_init(); zone_sizes_init(); diff --git a/include/asm-x86/numa_32.h b/include/asm-x86/numa_32.h index ed6c88eac848..220d7b7707a0 100644 --- a/include/asm-x86/numa_32.h +++ b/include/asm-x86/numa_32.h @@ -5,12 +5,7 @@ extern int pxm_to_nid(int pxm); extern void numa_remove_cpu(int cpu); #ifdef CONFIG_NUMA -extern void __init remap_numa_kva(void); extern void set_highmem_pages_init(void); -#else -static inline void remap_numa_kva(void) -{ -} #endif #endif /* _ASM_X86_32_NUMA_H */ -- cgit From 996cf4438f9ed711c98a3cb5ab88f842a4102427 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 30 Jun 2008 18:34:58 -0700 Subject: x86: don't reallocate pgt for node0 The KVA RAM is already mapped right away, so there is no need to allocate the pgdat from low RAM again; this avoids wasting one copy of the pgdat. Also add the node id to the early_res name, in case we get it from find_e820_area. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 42484d446d06..e2f0dd13a451 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -156,17 +156,20 @@ static void __init propagate_e820_map_node(int nid) */ static void __init allocate_pgdat(int nid) { - if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid]) + char buf[16]; + + if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { unsigned long pgdat_phys; pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT, sizeof(pg_data_t), PAGE_SIZE); NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); - reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), - "NODE_DATA"); + memset(buf, 0, sizeof(buf)); + sprintf(buf, "NODE_DATA %d", nid); + reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); } printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", nid, (unsigned long)NODE_DATA(nid)); -- cgit From cb95a13a8ace8612ecab042a838e5aab2ec14ef0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 2 Jul 2008 00:31:02 -0700 Subject: x86: merge zones_sizes_init for numa and non numa on 32-bit Move e820_register_active_regions() out of the non-NUMA zones_sizes_init() and remove the NUMA version of zones_sizes_init(),
and let 32-bit call remove_all_active_ranges() in setup_arch() directly, like 64-bit does. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 -- arch/x86/mm/discontig_32.c | 16 ---------------- arch/x86/mm/init_32.c | 10 ++++------ include/asm-x86/page_32.h | 1 - 4 files changed, 4 insertions(+), 25 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4ac01d0ce624..d5de157b02ae 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -749,10 +749,8 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); -#ifdef CONFIG_X86_64 /* Remove active ranges so rediscovery with NUMA-awareness happens */ remove_all_active_ranges(); -#endif #ifdef CONFIG_ACPI_NUMA /* diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index e2f0dd13a451..fa389d20383e 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -327,7 +327,6 @@ void __init initmem_init(unsigned long start_pfn, * and ZONE_HIGHMEM. */ - remove_all_active_ranges(); get_memcfg_numa(); kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); @@ -390,21 +389,6 @@ void __init initmem_init(unsigned long start_pfn, setup_bootmem_allocator(); } -void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; -#endif - - free_area_init_nodes(max_zone_pfns); - return; -} - void __init set_highmem_pages_init(void) { #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index aa5e37c9f4b4..8efe872b9617 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -660,12 +660,14 @@ void __init initmem_init(unsigned long start_pfn, if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; memory_present(0, 0, highend_pfn); + e820_register_active_regions(0, 0, highend_pfn); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else memory_present(0, 0, max_low_pfn); + e820_register_active_regions(0, 0, max_low_pfn); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif @@ -677,25 +679,21 @@ void __init initmem_init(unsigned long start_pfn, pages_to_mb(max_low_pfn)); setup_bootmem_allocator(); } +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ -void __init zone_sizes_init(void) +static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - remove_all_active_ranges(); #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; - e820_register_active_regions(0, 0, highend_pfn); -#else - e820_register_active_regions(0, 0, max_low_pfn); #endif free_area_init_nodes(max_zone_pfns); } -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ void __init setup_bootmem_allocator(void) { diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index 4ae1daba129b..ab8528793f08 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -96,7 +96,6 @@ extern void find_low_pfn_range(void); extern unsigned long init_memory_mapping(unsigned long start, unsigned long end);
extern void initmem_init(unsigned long, unsigned long); -extern void zone_sizes_init(void); extern void setup_bootmem_allocator(void); -- cgit From 698839fe04ae11f228074ad45614343c3921a6c6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 6 Jul 2008 11:42:11 -0700 Subject: x86: remove have_arch_parse_srat -v2 We already have the same SRAT handling interface for 32-bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ---- arch/x86/mm/discontig_32.c | 17 ----------------- 2 files changed, 21 deletions(-) (limited to 'arch/x86/mm/discontig_32.c') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 67e5275f03c8..d8dddbc5e349 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -460,10 +460,6 @@ config ACPI_SRAT depends on X86_32 && ACPI && NUMA && X86_GENERICARCH select ACPI_NUMA -config HAVE_ARCH_PARSE_SRAT - def_bool y - depends on ACPI_SRAT - config X86_SUMMIT_NUMA def_bool y depends on X86_32 && NUMA && X86_GENERICARCH diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index fa389d20383e..5dfef9fa061a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -443,20 +443,3 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#if defined(CONFIG_ACPI_NUMA) && !defined(CONFIG_HAVE_ARCH_PARSE_SRAT) -/* - * Dummy on 32-bit, for now: - */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) -{ -} - -void __init acpi_numa_arch_fixup(void) -{ -} -#endif -- cgit
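Taken together, the allocate_pgdat() changes in this series implement one placement policy for the per-node pg_data_t: use the node's own KVA remap area when one was set up, and otherwise fall back to a low-memory allocation reserved under a per-node "NODE_DATA %d" label. Below is a minimal, self-contained C sketch of that policy, not the kernel code itself: toy_alloc_low() and the static remap_area are hypothetical stand-ins for find_e820_area()/reserve_early() and the real remap mapping, and snprintf stands in for the patch's memset+sprintf pair. Only the decision logic mirrors the patches above.

/* Illustrative sketch only -- NOT the kernel implementation. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_NUMNODES 4

struct pglist_data { int nid; };		/* stand-in for the real pgdat */

static void *node_remap_start_vaddr[MAX_NUMNODES];	/* NULL => no remap space */
static struct pglist_data *node_data[MAX_NUMNODES];

/* toy stand-in for find_e820_area() + reserve_early(): node-0 low memory */
static void *toy_alloc_low(size_t size, const char *label)
{
	void *p = calloc(1, size);

	if (!p)
		exit(1);	/* toy sketch: no real error handling */
	printf("reserve_early: %zu bytes for \"%s\"\n", size, label);
	return p;
}

static void allocate_pgdat(int nid)
{
	char buf[16];

	if (node_remap_start_vaddr[nid]) {
		/* preferred: place the pgdat at the base of the node's remap area */
		node_data[nid] = node_remap_start_vaddr[nid];
	} else {
		/* fallback: carve the pgdat out of node-0 low memory, tagged per node */
		snprintf(buf, sizeof(buf), "NODE_DATA %d", nid);
		node_data[nid] = toy_alloc_low(sizeof(struct pglist_data), buf);
	}
	/* either way, start from a zeroed pgdat before anyone dereferences it */
	memset(node_data[nid], 0, sizeof(struct pglist_data));
	node_data[nid]->nid = nid;
	printf("node %d pgdat at %p (%s)\n", nid, (void *)node_data[nid],
	       node_remap_start_vaddr[nid] ? "remap" : "fallback");
}

int main(void)
{
	static char remap_area[4096];	/* pretend only node 1 got remap space */

	node_remap_start_vaddr[1] = remap_area;
	for (int nid = 0; nid < MAX_NUMNODES; nid++)
		allocate_pgdat(nid);
	return 0;
}

In this toy run, node 1 gets its pgdat from the remap area while the other nodes take the tagged fallback path; that is the invariant the "cope with no remap space" and "don't reallocate pgt for node0" commits converge on, so that a pgdat can never silently end up at kernel virtual address 0.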