Diffstat (limited to 'arch/x86/kernel/alternative.c')
-rw-r--r--	arch/x86/kernel/alternative.c	202
1 file changed, 155 insertions, 47 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9a79c7808f9c..390596b761e3 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #define pr_fmt(fmt) "SMP alternatives: " fmt
 
 #include <linux/module.h>
@@ -12,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/kdebug.h>
 #include <linux/kprobes.h>
+#include <linux/mmu_context.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
@@ -264,7 +266,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
-void *text_poke_early(void *addr, const void *opcode, size_t len);
+void text_poke_early(void *addr, const void *opcode, size_t len);
 
 /*
  * Are we looking at a near JMP with a 1 or 4-byte displacement.
@@ -666,16 +668,136 @@ void __init alternative_instructions(void)
  * instructions. And on the local CPU you need to be protected against NMI or MCE
  * handlers seeing an inconsistent instruction while you patch.
  */
-void *__init_or_module text_poke_early(void *addr, const void *opcode,
-					      size_t len)
+void __init_or_module text_poke_early(void *addr, const void *opcode,
+				      size_t len)
 {
 	unsigned long flags;
+
+	if (boot_cpu_has(X86_FEATURE_NX) &&
+	    is_module_text_address((unsigned long)addr)) {
+		/*
+		 * Module text is marked initially as non-executable, so the
+		 * code cannot be running and speculative code-fetches are
+		 * prevented. Just change the code.
+		 */
+		memcpy(addr, opcode, len);
+	} else {
+		local_irq_save(flags);
+		memcpy(addr, opcode, len);
+		local_irq_restore(flags);
+		sync_core();
+
+		/*
+		 * Could also do a CLFLUSH here to speed up CPU recovery; but
+		 * that causes hangs on some VIA CPUs.
+		 */
+	}
+}
+
+__ro_after_init struct mm_struct *poking_mm;
+__ro_after_init unsigned long poking_addr;
+
+static void *__text_poke(void *addr, const void *opcode, size_t len)
+{
+	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
+	struct page *pages[2] = {NULL};
+	temp_mm_state_t prev;
+	unsigned long flags;
+	pte_t pte, *ptep;
+	spinlock_t *ptl;
+	pgprot_t pgprot;
+
+	/*
+	 * While the boot memory allocator is running we cannot use struct
+	 * pages as they are not yet initialized. There is no way to recover.
+	 */
+	BUG_ON(!after_bootmem);
+
+	if (!core_kernel_text((unsigned long)addr)) {
+		pages[0] = vmalloc_to_page(addr);
+		if (cross_page_boundary)
+			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+	} else {
+		pages[0] = virt_to_page(addr);
+		WARN_ON(!PageReserved(pages[0]));
+		if (cross_page_boundary)
+			pages[1] = virt_to_page(addr + PAGE_SIZE);
+	}
+	/*
+	 * If something went wrong, crash and burn since recovery paths are not
+	 * implemented.
+	 */
+	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
+
 	local_irq_save(flags);
-	memcpy(addr, opcode, len);
+
+	/*
+	 * Map the page without the global bit, as TLB flushing is done with
+	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
+	 */
+	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
+
+	/*
+	 * The lock is not really needed, but this allows us to avoid
+	 * open-coding.
+	 */
+	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
+
+	/*
+	 * This must not fail; it is preallocated in poking_init().
+	 */
+	VM_BUG_ON(!ptep);
+
+	pte = mk_pte(pages[0], pgprot);
+	set_pte_at(poking_mm, poking_addr, ptep, pte);
+
+	if (cross_page_boundary) {
+		pte = mk_pte(pages[1], pgprot);
+		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
+	}
+
+	/*
+	 * Loading the temporary mm behaves as a compiler barrier, which
+	 * guarantees that the PTE will be set at the time memcpy() is done.
+	 */
+	prev = use_temporary_mm(poking_mm);
+
+	kasan_disable_current();
+	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
+	kasan_enable_current();
+
+	/*
+	 * Ensure that the PTE is only cleared after the instructions of memcpy
+	 * were issued by using a compiler barrier.
+	 */
+	barrier();
+
+	pte_clear(poking_mm, poking_addr, ptep);
+	if (cross_page_boundary)
+		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
+
+	/*
+	 * Loading the previous page-table hierarchy requires a serializing
+	 * instruction that already allows the core to see the updated version.
+	 * Xen-PV is assumed to serialize execution in a similar manner.
+	 */
+	unuse_temporary_mm(prev);
+
+	/*
+	 * Flushing the TLB might involve IPIs, which would require enabled
+	 * IRQs, but not if the mm is not used, as it is at this point.
+	 */
+	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
+			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
+			   PAGE_SHIFT, false);
+
+	/*
+	 * If the text does not match what we just wrote then something is
+	 * fundamentally screwy; there's nothing we can really do about that.
+	 */
+	BUG_ON(memcmp(addr, opcode, len));
+
+	pte_unmap_unlock(ptep, ptl);
 	local_irq_restore(flags);
-	sync_core();
-	/* Could also do a CLFLUSH here to speed up CPU recovery; but
-	   that causes hangs on some VIA CPUs. */
 	return addr;
 }
 
@@ -689,48 +811,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
  * It means the size must be writable atomically and the address must be aligned
  * in a way that permits an atomic write. It also makes sure we fit on a single
  * page.
+ *
+ * Note that the caller must ensure that if the modified code is part of a
+ * module, the module would not be removed during poking. This can be achieved
+ * by registering a module notifier, and ordering module removal and patching
+ * through a mutex.
  */
 void *text_poke(void *addr, const void *opcode, size_t len)
 {
-	unsigned long flags;
-	char *vaddr;
-	struct page *pages[2];
-	int i;
-
-	/*
-	 * While boot memory allocator is runnig we cannot use struct
-	 * pages as they are not yet initialized.
-	 */
-	BUG_ON(!after_bootmem);
-
 	lockdep_assert_held(&text_mutex);
 
-	if (!core_kernel_text((unsigned long)addr)) {
-		pages[0] = vmalloc_to_page(addr);
-		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
-	} else {
-		pages[0] = virt_to_page(addr);
-		WARN_ON(!PageReserved(pages[0]));
-		pages[1] = virt_to_page(addr + PAGE_SIZE);
-	}
-	BUG_ON(!pages[0]);
-	local_irq_save(flags);
-	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
-	if (pages[1])
-		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
-	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
-	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
-	clear_fixmap(FIX_TEXT_POKE0);
-	if (pages[1])
-		clear_fixmap(FIX_TEXT_POKE1);
-	local_flush_tlb();
-	sync_core();
-	/* Could also do a CLFLUSH here to speed up CPU recovery; but
-	   that causes hangs on some VIA CPUs. */
-	for (i = 0; i < len; i++)
-		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
-	local_irq_restore(flags);
-	return addr;
+	return __text_poke(addr, opcode, len);
+}
+
+/**
+ * text_poke_kgdb - Update instructions on a live kernel by kgdb
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ *
+ * Context: should only be used by kgdb, which ensures no other core is running,
+ *	    despite the fact it does not hold the text_mutex.
+ */
+void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
+{
+	return __text_poke(addr, opcode, len);
 }
 
 static void do_sync_core(void *info)
@@ -788,7 +898,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
  *	  replacing opcode
  *	- sync cores
  */
-void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 {
 	unsigned char int3 = 0xcc;
 
@@ -830,7 +940,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 	 * the writing of the new instruction.
 	 */
 	bp_patching_in_progress = false;
-
-	return addr;
 }
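After this change, text_poke() writes through the temporary poking_mm mapping and asserts, via lockdep, that the caller holds text_mutex; text_poke_kgdb() performs the same operation for kgdb, which stops all other cores instead of taking the mutex. A minimal sketch of a conforming caller follows, assuming a hypothetical patch_insn_site() helper and a caller-prepared 5-byte instruction buffer (neither is part of this patch):

	#include <linux/memory.h>		/* text_mutex */
	#include <linux/mutex.h>
	#include <asm/text-patching.h>		/* text_poke() */

	/*
	 * Hypothetical helper: replace one 5-byte instruction that is not
	 * concurrently executable on other CPUs. text_poke() itself checks
	 * text_mutex through lockdep_assert_held(), so the mutex must be
	 * taken around the call.
	 */
	static void patch_insn_site(void *site, const unsigned char insn[5])
	{
		mutex_lock(&text_mutex);
		text_poke(site, insn, 5);
		mutex_unlock(&text_mutex);
	}

Patching code that may be executing on another CPU still goes through text_poke_bp(), which uses the INT3 breakpoint protocol seen at the end of the diff rather than a plain memcpy() through the temporary mm.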