From 7852ee068afe97eec3e955de3c4197aeb4793c52 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 13 Sep 2024 15:44:38 +0200 Subject: um: Remove unused os_process_pc The function is not used anywhere in the codebase. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240913134442.967599-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 1 - arch/um/os-Linux/process.c | 33 --------------------------------- 2 files changed, 34 deletions(-) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 9a039d6f1f74..c8c1a93c8d2c 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -199,7 +199,6 @@ extern int create_mem_file(unsigned long long len); extern void report_enomem(void); /* process.c */ -extern unsigned long os_process_pc(int pid); extern int os_process_parent(int pid); extern void os_alarm_process(int pid); extern void os_stop_process(int pid); diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index e52dd37ddadc..6b74a8d91e06 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -18,44 +18,11 @@ #include #include -#define ARBITRARY_ADDR -1 #define FAILURE_PID -1 #define STAT_PATH_LEN sizeof("/proc/#######/stat\0") #define COMM_SCANF "%*[^)])" -unsigned long os_process_pc(int pid) -{ - char proc_stat[STAT_PATH_LEN], buf[256]; - unsigned long pc = ARBITRARY_ADDR; - int fd, err; - - sprintf(proc_stat, "/proc/%d/stat", pid); - fd = open(proc_stat, O_RDONLY, 0); - if (fd < 0) { - printk(UM_KERN_ERR "os_process_pc - couldn't open '%s', " - "errno = %d\n", proc_stat, errno); - goto out; - } - CATCH_EINTR(err = read(fd, buf, sizeof(buf))); - if (err < 0) { - printk(UM_KERN_ERR "os_process_pc - couldn't read '%s', " - "err = %d\n", proc_stat, errno); - goto out_close; - } - os_close_file(fd); - pc = ARBITRARY_ADDR; - if (sscanf(buf, "%*d " COMM_SCANF " %*c %*d %*d %*d %*d %*d %*d %*d " - "%*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d " - "%*d %*d %*d %*d %*d %lu", &pc) != 1) - printk(UM_KERN_ERR "os_process_pc - couldn't find pc in '%s'\n", - buf); - out_close: - close(fd); - out: - return pc; -} - int os_process_parent(int pid) { char stat[STAT_PATH_LEN]; -- cgit From 47e174969cbf9244add188635a9590ff717d796e Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 13 Sep 2024 15:44:39 +0200 Subject: um: Remove unused os_process_parent The function is not used anywhere. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240913134442.967599-3-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 1 - arch/um/os-Linux/process.c | 39 --------------------------------------- 2 files changed, 40 deletions(-) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index c8c1a93c8d2c..4bdd4fb5dd80 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -199,7 +199,6 @@ extern int create_mem_file(unsigned long long len); extern void report_enomem(void); /* process.c */ -extern int os_process_parent(int pid); extern void os_alarm_process(int pid); extern void os_stop_process(int pid); extern void os_kill_process(int pid, int reap_child); diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index 6b74a8d91e06..b25c226c5491 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -18,45 +18,6 @@ #include #include -#define FAILURE_PID -1 - -#define STAT_PATH_LEN sizeof("/proc/#######/stat\0") -#define COMM_SCANF "%*[^)])" - -int os_process_parent(int pid) -{ - char stat[STAT_PATH_LEN]; - char data[256]; - int parent = FAILURE_PID, n, fd; - - if (pid == -1) - return parent; - - snprintf(stat, sizeof(stat), "/proc/%d/stat", pid); - fd = open(stat, O_RDONLY, 0); - if (fd < 0) { - printk(UM_KERN_ERR "Couldn't open '%s', errno = %d\n", stat, - errno); - return parent; - } - - CATCH_EINTR(n = read(fd, data, sizeof(data))); - close(fd); - - if (n < 0) { - printk(UM_KERN_ERR "Couldn't read '%s', errno = %d\n", stat, - errno); - return parent; - } - - parent = FAILURE_PID; - n = sscanf(data, "%*d " COMM_SCANF " %*c %d", &parent); - if (n != 1) - printk(UM_KERN_ERR "Failed to scan '%s'\n", data); - - return parent; -} - void os_alarm_process(int pid) { kill(pid, SIGALRM); -- cgit From 377c23c5588d1f3d572b21d429dd95157bd8b5a9 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 13 Sep 2024 15:44:40 +0200 Subject: um: Remove unused os_stop_process The function is not used anywhere. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240913134442.967599-4-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 1 - arch/um/os-Linux/process.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 4bdd4fb5dd80..a94093bfa5e4 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -200,7 +200,6 @@ extern void report_enomem(void); /* process.c */ extern void os_alarm_process(int pid); -extern void os_stop_process(int pid); extern void os_kill_process(int pid, int reap_child); extern void os_kill_ptraced_process(int pid, int reap_child); diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index b25c226c5491..d1f5e6002805 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -23,11 +23,6 @@ void os_alarm_process(int pid) kill(pid, SIGALRM); } -void os_stop_process(int pid) -{ - kill(pid, SIGSTOP); -} - void os_kill_process(int pid, int reap_child) { kill(pid, SIGKILL); -- cgit From 71fae9dfa7e3a987886c8baf235d5a0b96e5dc15 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 13 Sep 2024 15:44:41 +0200 Subject: um: Remove unused os_getpgrp function The function is not used anywhere. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240913134442.967599-5-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 1 - arch/um/os-Linux/process.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index a94093bfa5e4..e54f64f55bb7 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -204,7 +204,6 @@ extern void os_kill_process(int pid, int reap_child); extern void os_kill_ptraced_process(int pid, int reap_child); extern int os_getpid(void); -extern int os_getpgrp(void); extern void init_new_thread_signals(void); diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index d1f5e6002805..f20602e793d9 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -53,11 +53,6 @@ int os_getpid(void) return syscall(__NR_getpid); } -int os_getpgrp(void) -{ - return getpgrp(); -} - int os_map_memory(void *virt, int fd, unsigned long long off, unsigned long len, int r, int w, int x) { -- cgit From 797d3688f98667006e12dbc657ca6ac29ea4a71b Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 13 Sep 2024 15:44:42 +0200 Subject: um: Set HAVE_EFFICIENT_UNALIGNED_ACCESS for x86 The x86 port of UM has efficient unaligned access. Set the option as it is appropriate and will e.g. cause UBSAN to not enable unaligned memory access checking by default. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240913134442.967599-6-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/x86/um/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 186f13268401..dc56c608ce69 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -10,6 +10,7 @@ config UML_X86 def_bool y select ARCH_BINFMT_ELF_EXTRA_PHDRS if X86_32 select DCACHE_WORD_ACCESS + select HAVE_EFFICIENT_UNALIGNED_ACCESS config 64BIT bool "64-bit kernel" if "$(SUBARCH)" = "x86" -- cgit From 855f6e18dff26fb13968a73998b0d0ff38865b51 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 16 Sep 2024 12:59:47 +0800 Subject: um: Remove the redundant declaration of high_physmem high_physmem has already been declared in as-layout.h, so there is no need to declare it explicitly in the .c file again. While at it, group the declarations of __real_malloc and __real_free together to make the code slightly more readable. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20240916045950.508910-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/os-Linux/main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index f98ff79cdbf7..cf1179ed1aec 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -182,6 +182,7 @@ int __init main(int argc, char **argv, char **envp) } extern void *__real_malloc(int); +extern void __real_free(void *); /* workaround for -Wmissing-prototypes warnings */ void *__wrap_malloc(int size); @@ -219,10 +220,6 @@ void *__wrap_calloc(int n, int size) return ptr; } -extern void __real_free(void *); - -extern unsigned long high_physmem; - void __wrap_free(void *ptr) { unsigned long addr = (unsigned long) ptr; -- cgit From a98b7761f697e590ed5d610d87fa12be66f23419 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 16 Sep 2024 12:59:48 +0800 Subject: um: Fix potential integer overflow during physmem setup This issue happens when the real map size is greater than LONG_MAX, which can be easily triggered on UML/i386. Fixes: fe205bdd1321 ("um: Print minimum physical memory requirement") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20240916045950.508910-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/kernel/physmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index fb2adfb49945..ee693e0b2b58 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -81,10 +81,10 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, unsigned long len, unsigned long long highmem) { unsigned long reserve = reserve_end - start; - long map_size = len - reserve; + unsigned long map_size = len - reserve; int err; - if(map_size <= 0) { + if (len <= reserve) { os_warn("Too few physical memory! Needed=%lu, given=%lu\n", reserve, len); exit(1); @@ -95,7 +95,7 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, err = os_map_memory((void *) reserve_end, physmem_fd, reserve, map_size, 1, 1, 1); if (err < 0) { - os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " + os_warn("setup_physmem - mapping %lu bytes of memory at 0x%p " "failed - errno = %d\n", map_size, (void *) reserve_end, err); exit(1); -- cgit From cd05cbed42b73168b0772b83bc24769fea3871bf Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 16 Sep 2024 12:59:49 +0800 Subject: um: Remove highmem leftovers Highmem was only supported on UML/i386. And the support has been removed by commit a98a6d864d3b ("um: Remove broken highmem support"). Remove the leftovers and stop UML from trying to setup highmem when the sum of physmem_size and iomem_size exceeds max_physmem. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20240916045950.508910-4-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/virtio_uml.c | 9 +-------- arch/um/include/shared/as-layout.h | 1 - arch/um/include/shared/mem_user.h | 5 ++--- arch/um/kernel/mem.c | 3 --- arch/um/kernel/physmem.c | 28 ++++++++++------------------ arch/um/kernel/um_arch.c | 17 +++++++---------- 6 files changed, 20 insertions(+), 43 deletions(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 2b6e701776b6..e7f5556e3c96 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -72,7 +72,7 @@ struct virtio_uml_vq_info { bool suspended; }; -extern unsigned long long physmem_size, highmem; +extern unsigned long long physmem_size; #define vu_err(vu_dev, ...) dev_err(&(vu_dev)->pdev->dev, ##__VA_ARGS__) @@ -673,13 +673,6 @@ static int vhost_user_set_mem_table(struct virtio_uml_device *vu_dev) if (rc < 0) return rc; - if (highmem) { - msg.payload.mem_regions.num++; - rc = vhost_user_init_mem_region(__pa(end_iomem), highmem, - &fds[1], &msg.payload.mem_regions.regions[1]); - if (rc < 0) - return rc; - } return vhost_user_send(vu_dev, false, &msg, fds, msg.payload.mem_regions.num); diff --git a/arch/um/include/shared/as-layout.h b/arch/um/include/shared/as-layout.h index 06292fca5a4d..61965a06c18a 100644 --- a/arch/um/include/shared/as-layout.h +++ b/arch/um/include/shared/as-layout.h @@ -41,7 +41,6 @@ extern unsigned long uml_physmem; extern unsigned long uml_reserved; extern unsigned long end_vm; extern unsigned long start_vm; -extern unsigned long long highmem; extern unsigned long brk_start; diff --git a/arch/um/include/shared/mem_user.h b/arch/um/include/shared/mem_user.h index 11a723a58545..adfa08062f88 100644 --- a/arch/um/include/shared/mem_user.h +++ b/arch/um/include/shared/mem_user.h @@ -47,10 +47,9 @@ extern int iomem_size; #define ROUND_4M(n) ((((unsigned long) (n)) + (1 << 22)) & ~((1 << 22) - 1)) extern unsigned long find_iomem(char *driver, unsigned long *len_out); -extern void mem_total_pages(unsigned long physmem, unsigned long iomem, - unsigned long highmem); +extern void mem_total_pages(unsigned long physmem, unsigned long iomem); extern void setup_physmem(unsigned long start, unsigned long usable, - unsigned long len, unsigned long long highmem); + unsigned long len); extern void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index a5b4fe2ad931..5026668dc054 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -51,8 +50,6 @@ EXPORT_SYMBOL(empty_zero_page); pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* Initialized at boot time, and readonly after that */ -unsigned long long highmem; -EXPORT_SYMBOL(highmem); int kmalloc_ok = 0; /* Used during early boot */ diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index ee693e0b2b58..94aca17993fd 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -24,17 +24,14 @@ EXPORT_SYMBOL(high_physmem); extern unsigned long long physmem_size; -void __init mem_total_pages(unsigned long physmem, unsigned long iomem, - unsigned long highmem) +void __init mem_total_pages(unsigned long physmem, unsigned long iomem) { - unsigned long phys_pages, highmem_pages; - unsigned long iomem_pages, total_pages; + unsigned long phys_pages, iomem_pages, total_pages; - phys_pages = physmem >> PAGE_SHIFT; - iomem_pages = iomem >> PAGE_SHIFT; - highmem_pages = highmem >> PAGE_SHIFT; + phys_pages = physmem >> PAGE_SHIFT; + iomem_pages = iomem >> PAGE_SHIFT; - total_pages = phys_pages + iomem_pages + highmem_pages; + total_pages = phys_pages + iomem_pages; max_mapnr = total_pages; } @@ -64,13 +61,12 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len, * @reserve_end: end address of the physical kernel memory. * @len: Length of total physical memory that should be mapped/made * available, in bytes. - * @highmem: Number of highmem bytes that should be mapped/made available. * - * Creates an unlinked temporary file of size (len + highmem) and memory maps + * Creates an unlinked temporary file of size (len) and memory maps * it on the last executable image address (uml_reserved). * * The offset is needed as the length of the total physical memory - * (len + highmem) includes the size of the memory used be the executable image, + * (len) includes the size of the memory used be the executable image, * but the mapped-to address is the last address of the executable image * (uml_reserved == end address of executable image). * @@ -78,7 +74,7 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len, * of all user space processes/kernel tasks. */ void __init setup_physmem(unsigned long start, unsigned long reserve_end, - unsigned long len, unsigned long long highmem) + unsigned long len) { unsigned long reserve = reserve_end - start; unsigned long map_size = len - reserve; @@ -90,7 +86,7 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, exit(1); } - physmem_fd = create_mem_file(len + highmem); + physmem_fd = create_mem_file(len); err = os_map_memory((void *) reserve_end, physmem_fd, reserve, map_size, 1, 1, 1); @@ -109,7 +105,7 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); os_fsync_file(physmem_fd); - memblock_add(__pa(start), len + highmem); + memblock_add(__pa(start), len); memblock_reserve(__pa(start), reserve); min_low_pfn = PFN_UP(__pa(reserve_end)); @@ -137,10 +133,6 @@ int phys_mapping(unsigned long phys, unsigned long long *offset_out) region = region->next; } } - else if (phys < __pa(end_iomem) + highmem) { - fd = physmem_fd; - *offset_out = phys - iomem_size; - } return fd; } diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 8e594cda6d77..8f86aa468b50 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -366,18 +366,15 @@ int __init linux_main(int argc, char **argv) setup_machinename(init_utsname()->machine); - highmem = 0; + physmem_size = (physmem_size + PAGE_SIZE - 1) & PAGE_MASK; iomem_size = (iomem_size + PAGE_SIZE - 1) & PAGE_MASK; + max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; - /* - * Zones have to begin on a 1 << MAX_PAGE_ORDER page boundary, - * so this makes sure that's true for highmem - */ - max_physmem &= ~((1 << (PAGE_SHIFT + MAX_PAGE_ORDER)) - 1); if (physmem_size + iomem_size > max_physmem) { - highmem = physmem_size + iomem_size - max_physmem; - physmem_size -= highmem; + physmem_size = max_physmem - iomem_size; + os_info("Physical memory size shrunk to %llu bytes\n", + physmem_size); } high_physmem = uml_physmem + physmem_size; @@ -413,8 +410,8 @@ void __init setup_arch(char **cmdline_p) u8 rng_seed[32]; stack_protections((unsigned long) &init_thread_info); - setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); - mem_total_pages(physmem_size, iomem_size, highmem); + setup_physmem(uml_physmem, uml_reserved, physmem_size); + mem_total_pages(physmem_size, iomem_size); uml_dtb_init(); read_initrd(); -- cgit From 242fef3610e3a38c53781d4d81d8e779021c0af5 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 16 Sep 2024 12:59:50 +0800 Subject: um: Fix the definition for physmem_size Currently physmem_size is defined as long long but declared locally as unsigned long long before using it in separate .c files. Make them match by defining physmem_size as unsigned long long and also move the declaration to a common header to allow the compiler to check it. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20240916045950.508910-5-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/virtio_uml.c | 2 -- arch/um/include/shared/as-layout.h | 2 ++ arch/um/kernel/physmem.c | 2 -- arch/um/kernel/um_arch.c | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index e7f5556e3c96..4d3e9b9f5b61 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -72,8 +72,6 @@ struct virtio_uml_vq_info { bool suspended; }; -extern unsigned long long physmem_size; - #define vu_err(vu_dev, ...) dev_err(&(vu_dev)->pdev->dev, ##__VA_ARGS__) /* Vhost-user protocol */ diff --git a/arch/um/include/shared/as-layout.h b/arch/um/include/shared/as-layout.h index 61965a06c18a..283226c34ca4 100644 --- a/arch/um/include/shared/as-layout.h +++ b/arch/um/include/shared/as-layout.h @@ -36,6 +36,8 @@ struct cpu_task { extern struct cpu_task cpu_tasks[]; +extern unsigned long long physmem_size; + extern unsigned long high_physmem; extern unsigned long uml_physmem; extern unsigned long uml_reserved; diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 94aca17993fd..636830fe00f2 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -22,8 +22,6 @@ static int physmem_fd = -1; unsigned long high_physmem; EXPORT_SYMBOL(high_physmem); -extern unsigned long long physmem_size; - void __init mem_total_pages(unsigned long physmem, unsigned long iomem) { unsigned long phys_pages, iomem_pages, total_pages; diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 8f86aa468b50..99cdf4b2d648 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -131,7 +131,7 @@ static int have_root __initdata; static int have_console __initdata; /* Set in uml_mem_setup and modified in linux_main */ -long long physmem_size = 64 * 1024 * 1024; +unsigned long long physmem_size = 64 * 1024 * 1024; EXPORT_SYMBOL(physmem_size); static const char *usage_string = -- cgit From 865e3845eeaa21e9a62abc1361644e67124f1ec0 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 13 Sep 2024 10:33:02 +0800 Subject: um: Fix the return value of elf_core_copy_task_fpregs This function is expected to return a boolean value, which should be true on success and false on failure. Fixes: d1254b12c93e ("uml: fix x86_64 core dump crash") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20240913023302.130300-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index be2856af6d4c..9c6cf03ed02b 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -292,6 +292,6 @@ int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu) { int cpu = current_thread_info()->cpu; - return save_i387_registers(userspace_pid[cpu], (unsigned long *) fpu); + return save_i387_registers(userspace_pid[cpu], (unsigned long *) fpu) == 0; } -- cgit From 5a6951273e0e9dc0f79facf22281a8a731fb90b1 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 13 Sep 2024 15:38:45 +0200 Subject: um: always use the internal copy of the FP registers When switching from userspace to the kernel, all registers including the FP registers are copied into the kernel and restored later on. As such, the true source for the FP register state is actually already in the kernel and they should never be grabbed from the userspace process. Change the various places to simply copy the data from the internal FP register storage area. Note that on i386 the format of PTRACE_GETFPREGS and PTRACE_GETFPXREGS is different enough that conversion would be needed. With this patch, -EINVAL is returned if the non-native format is requested. The upside is, that this patchset fixes setting registers via ptrace (which simply did not work before) as well as fixing setting floating point registers using the mcontext on signal return on i386. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240913133845.964292-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/registers.h | 6 ---- arch/um/kernel/process.c | 11 +++++-- arch/x86/um/os-Linux/registers.c | 12 +++---- arch/x86/um/ptrace_32.c | 50 +++++++++++++++-------------- arch/x86/um/ptrace_64.c | 20 ++++-------- arch/x86/um/signal.c | 65 ++++++++++++++++++-------------------- 6 files changed, 80 insertions(+), 84 deletions(-) diff --git a/arch/um/include/shared/registers.h b/arch/um/include/shared/registers.h index a0450326521c..7d81b2339a48 100644 --- a/arch/um/include/shared/registers.h +++ b/arch/um/include/shared/registers.h @@ -8,12 +8,6 @@ #include -extern int save_i387_registers(int pid, unsigned long *fp_regs); -extern int restore_i387_registers(int pid, unsigned long *fp_regs); -extern int save_fp_registers(int pid, unsigned long *fp_regs); -extern int restore_fp_registers(int pid, unsigned long *fp_regs); -extern int save_fpx_registers(int pid, unsigned long *fp_regs); -extern int restore_fpx_registers(int pid, unsigned long *fp_regs); extern int init_pid_registers(int pid); extern void get_safe_registers(unsigned long *regs, unsigned long *fp_regs); extern int get_fp_registers(int pid, unsigned long *regs); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 9c6cf03ed02b..701b7bb2525e 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -290,8 +290,15 @@ unsigned long __get_wchan(struct task_struct *p) int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu) { - int cpu = current_thread_info()->cpu; +#ifdef CONFIG_X86_32 + extern int have_fpx_regs; - return save_i387_registers(userspace_pid[cpu], (unsigned long *) fpu) == 0; + /* FIXME: A plain copy does not work on i386 with have_fpx_regs */ + if (have_fpx_regs) + return 0; +#endif + memcpy(fpu, &t->thread.regs.regs.fp, sizeof(*fpu)); + + return 1; } diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c index f3638dd09cec..f8e5516d9708 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -19,14 +19,14 @@ static int have_xstate_support; -int save_i387_registers(int pid, unsigned long *fp_regs) +static int save_i387_registers(int pid, unsigned long *fp_regs) { if (ptrace(PTRACE_GETFPREGS, pid, 0, fp_regs) < 0) return -errno; return 0; } -int save_fp_registers(int pid, unsigned long *fp_regs) +static int save_fp_registers(int pid, unsigned long *fp_regs) { #ifdef PTRACE_GETREGSET struct iovec iov; @@ -42,14 +42,14 @@ int save_fp_registers(int pid, unsigned long *fp_regs) return save_i387_registers(pid, fp_regs); } -int restore_i387_registers(int pid, unsigned long *fp_regs) +static int restore_i387_registers(int pid, unsigned long *fp_regs) { if (ptrace(PTRACE_SETFPREGS, pid, 0, fp_regs) < 0) return -errno; return 0; } -int restore_fp_registers(int pid, unsigned long *fp_regs) +static int restore_fp_registers(int pid, unsigned long *fp_regs) { #ifdef PTRACE_SETREGSET struct iovec iov; @@ -66,14 +66,14 @@ int restore_fp_registers(int pid, unsigned long *fp_regs) #ifdef __i386__ int have_fpx_regs = 1; -int save_fpx_registers(int pid, unsigned long *fp_regs) +static int save_fpx_registers(int pid, unsigned long *fp_regs) { if (ptrace(PTRACE_GETFPXREGS, pid, 0, fp_regs) < 0) return -errno; return 0; } -int restore_fpx_registers(int pid, unsigned long *fp_regs) +static int restore_fpx_registers(int pid, unsigned long *fp_regs) { if (ptrace(PTRACE_SETFPXREGS, pid, 0, fp_regs) < 0) return -errno; diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index b0a71c6cdc6e..b8b85a52eb6f 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -168,17 +168,18 @@ int peek_user(struct task_struct *child, long addr, long data) return put_user(tmp, (unsigned long __user *) data); } +/* FIXME: Do the required conversions instead of erroring out */ +extern int have_fpx_regs; + static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int err, n, cpu = task_cpu(child); - struct user_i387_struct fpregs; + int n; - err = save_i387_registers(userspace_pid[cpu], - (unsigned long *) &fpregs); - if (err) - return err; + if (have_fpx_regs) + return -EINVAL; - n = copy_to_user(buf, &fpregs, sizeof(fpregs)); + n = copy_to_user(buf, &child->thread.regs.regs.fp, + sizeof(struct user_i387_struct)); if(n > 0) return -EFAULT; @@ -187,27 +188,28 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int n, cpu = task_cpu(child); - struct user_i387_struct fpregs; + int n; + + if (have_fpx_regs) + return -EINVAL; - n = copy_from_user(&fpregs, buf, sizeof(fpregs)); + n = copy_from_user(&child->thread.regs.regs.fp, buf, + sizeof(struct user_i387_struct)); if (n > 0) return -EFAULT; - return restore_i387_registers(userspace_pid[cpu], - (unsigned long *) &fpregs); + return 0; } static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) { - int err, n, cpu = task_cpu(child); - struct user_fxsr_struct fpregs; + int n; - err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs); - if (err) - return err; + if (!have_fpx_regs) + return -EINVAL; - n = copy_to_user(buf, &fpregs, sizeof(fpregs)); + n = copy_to_user(buf, &child->thread.regs.regs.fp, + sizeof(struct user_fxsr_struct)); if(n > 0) return -EFAULT; @@ -216,15 +218,17 @@ static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct * static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) { - int n, cpu = task_cpu(child); - struct user_fxsr_struct fpregs; + int n; - n = copy_from_user(&fpregs, buf, sizeof(fpregs)); + if (!have_fpx_regs) + return -EINVAL; + + n = copy_from_user(&child->thread.regs.regs.fp, buf, + sizeof(struct user_fxsr_struct)); if (n > 0) return -EFAULT; - return restore_fpx_registers(userspace_pid[cpu], - (unsigned long *) &fpregs); + return 0; } long subarch_ptrace(struct task_struct *child, long request, diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index aa68d83d3f44..f8bbad29cd0b 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -190,15 +190,10 @@ int peek_user(struct task_struct *child, long addr, long data) static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int err, n, cpu = ((struct thread_info *) child->stack)->cpu; - struct user_i387_struct fpregs; + int n; - err = save_i387_registers(userspace_pid[cpu], - (unsigned long *) &fpregs); - if (err) - return err; - - n = copy_to_user(buf, &fpregs, sizeof(fpregs)); + n = copy_to_user(buf, &child->thread.regs.regs.fp, + sizeof(struct user_i387_struct)); if (n > 0) return -EFAULT; @@ -207,15 +202,14 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int n, cpu = ((struct thread_info *) child->stack)->cpu; - struct user_i387_struct fpregs; + int n; - n = copy_from_user(&fpregs, buf, sizeof(fpregs)); + n = copy_from_user(&child->thread.regs.regs.fp, buf, + sizeof(struct user_i387_struct)); if (n > 0) return -EFAULT; - return restore_i387_registers(userspace_pid[cpu], - (unsigned long *) &fpregs); + return 0; } long subarch_ptrace(struct task_struct *child, long request, diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 2cc8c2309022..fa78c74e00cb 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -204,34 +204,30 @@ static int copy_sc_from_user(struct pt_regs *regs, #ifdef CONFIG_X86_32 if (have_fpx_regs) { - struct user_fxsr_struct fpx; - int pid = userspace_pid[current_thread_info()->cpu]; + struct user_fxsr_struct *fpx; + fpx = (void *)®s->regs.fp; - err = copy_from_user(&fpx, - &((struct _fpstate __user *)sc.fpstate)->_fxsr_env[0], - sizeof(struct user_fxsr_struct)); + err = convert_fxsr_from_user(fpx, (void *)sc.fpstate); if (err) return 1; - - err = convert_fxsr_from_user(&fpx, (void *)sc.fpstate); + } else { + BUILD_BUG_ON(sizeof(regs->regs.fp) > sizeof(struct _fpstate)); + err = copy_from_user(regs->regs.fp, (void *)sc.fpstate, + sizeof(regs->regs.fp)); if (err) return 1; - - err = restore_fpx_registers(pid, (unsigned long *) &fpx); - if (err < 0) { - printk(KERN_ERR "copy_sc_from_user - " - "restore_fpx_registers failed, errno = %d\n", - -err); - return 1; - } - } else -#endif + } +#else { + /* FIXME: Save/restore extended state (past the 16 YMM regs) */ + BUILD_BUG_ON(sizeof(regs->regs.fp) < sizeof(struct _xstate)); + err = copy_from_user(regs->regs.fp, (void *)sc.fpstate, sizeof(struct _xstate)); if (err) return 1; } +#endif return 0; } @@ -291,34 +287,35 @@ static int copy_sc_to_user(struct sigcontext __user *to, #ifdef CONFIG_X86_32 if (have_fpx_regs) { - int pid = userspace_pid[current_thread_info()->cpu]; - struct user_fxsr_struct fpx; + struct user_fxsr_struct *fpx; - err = save_fpx_registers(pid, (unsigned long *) &fpx); - if (err < 0){ - printk(KERN_ERR "copy_sc_to_user - save_fpx_registers " - "failed, errno = %d\n", err); - return 1; - } + fpx = (void *)®s->regs.fp; - err = convert_fxsr_to_user(&to_fp->fpstate, &fpx); + err = convert_fxsr_to_user(&to_fp->fpstate, fpx); if (err) return 1; - err |= __put_user(fpx.swd, &to_fp->fpstate.status); + err |= __put_user(fpx->swd, &to_fp->fpstate.status); err |= __put_user(X86_FXSR_MAGIC, &to_fp->fpstate.magic); if (err) return 1; - if (copy_to_user(&to_fp->fpstate._fxsr_env[0], &fpx, - sizeof(struct user_fxsr_struct))) - return 1; - } else -#endif - { - if (copy_to_user(to_fp, regs->regs.fp, sizeof(struct _xstate))) + } else { + if (copy_to_user(to_fp, regs->regs.fp, sizeof(regs->regs.fp))) return 1; + + /* Need to fill in the sw_reserved bits ... */ + BUILD_BUG_ON(offsetof(typeof(*to_fp), + fpstate.sw_reserved.magic1) >= + sizeof(struct _fpstate)); + __put_user(0, &to_fp->fpstate.sw_reserved.magic1); + __put_user(sizeof(struct _fpstate), + &to_fp->fpstate.sw_reserved.extended_size); } +#else + if (copy_to_user(to_fp, regs->regs.fp, sizeof(struct _xstate))) + return 1; +#endif return 0; } -- cgit From ed236fe4daf770ae10d6146bde0b00c39618a557 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Wed, 18 Sep 2024 14:17:02 +0800 Subject: um: Remove 3-level page table support on i386 The highmem support has been removed by commit a98a6d864d3b ("um: Remove broken highmem support"). The 2-level page table is sufficient on UML/i386 now. Remove the 3-level page table support on UML/i386 which is still marked as experimental. Suggested-by: Benjamin Berg Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20240918061702.614837-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/configs/i386_defconfig | 1 - arch/um/include/asm/page.h | 24 ------------------------ arch/um/include/asm/pgtable-3level.h | 9 --------- arch/x86/um/Kconfig | 10 +--------- 4 files changed, 1 insertion(+), 43 deletions(-) diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index e543cbac8792..0cc717d80be6 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -1,4 +1,3 @@ -CONFIG_3_LEVEL_PGTABLES=y # CONFIG_COMPACTION is not set CONFIG_BINFMT_MISC=m CONFIG_HOSTFS=y diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index 9ef9a8aedfa6..8d2ac5e86cf5 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -32,28 +32,6 @@ struct page; #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT) - -typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned long pmd; } pmd_t; -typedef struct { unsigned long pgd; } pgd_t; -#define pte_val(p) ((p).pte) - -#define pte_get_bits(p, bits) ((p).pte & (bits)) -#define pte_set_bits(p, bits) ((p).pte |= (bits)) -#define pte_clear_bits(p, bits) ((p).pte &= ~(bits)) -#define pte_copy(to, from) ({ (to).pte = (from).pte; }) -#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE)) -#define pte_set_val(p, phys, prot) \ - ({ (p).pte = (phys) | pgprot_val(prot); }) - -#define pmd_val(x) ((x).pmd) -#define __pmd(x) ((pmd_t) { (x) } ) - -typedef unsigned long long phys_t; - -#else - typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long pgd; } pgd_t; @@ -75,8 +53,6 @@ typedef struct { unsigned long pmd; } pmd_t; typedef unsigned long phys_t; -#endif - typedef struct { unsigned long pgprot; } pgprot_t; typedef struct page *pgtable_t; diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h index 8a5032ec231f..3504a92dc485 100644 --- a/arch/um/include/asm/pgtable-3level.h +++ b/arch/um/include/asm/pgtable-3level.h @@ -11,11 +11,7 @@ /* PGDIR_SHIFT determines what a third-level page table entry can map */ -#ifdef CONFIG_64BIT #define PGDIR_SHIFT 30 -#else -#define PGDIR_SHIFT 31 -#endif #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) @@ -32,13 +28,8 @@ */ #define PTRS_PER_PTE 512 -#ifdef CONFIG_64BIT #define PTRS_PER_PMD 512 #define PTRS_PER_PGD 512 -#else -#define PTRS_PER_PMD 1024 -#define PTRS_PER_PGD 1024 -#endif #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index dc56c608ce69..05cb7bb291e0 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -30,15 +30,7 @@ config X86_64 select MODULES_USE_ELF_RELA config 3_LEVEL_PGTABLES - bool "Three-level pagetables" if !64BIT - default 64BIT - help - Three-level pagetables will let UML have more than 4G of physical - memory. All the memory that can't be mapped directly will be treated - as high memory. - - However, this it experimental on 32-bit architectures, so if unsure say - N (on x86-64 it's automatically enabled, instead, as it's safe there). + def_bool 64BIT config ARCH_HAS_SC_SIGNALS def_bool !64BIT -- cgit From 48a858e0819ab9aad37d30cd2efadff928504021 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 24 Sep 2024 20:33:40 +0900 Subject: um: remove dependency on undefined CC_CAN_LINK_STATIC_NO_RUNTIME_DEPS CC_CAN_LINK_STATIC_NO_RUNTIME_DEPS is not defined anywhere. In the submitted patch set [1], the first patch "um/kconfig: introduce CC_CAN_LINK_STATIC_NO_RUNTIME_DEPS" was not applied. Only 2/3 and 3/3 were applied, which are now: - 730586ff7fad ("um: Allow static linking for non-glibc implementations") - 5e1121cd43d4 ("um: Some fixes to build UML with musl") Given that nobody has noticed the missing first patch for many years, it seems it was unnecessary. [1]: https://lore.kernel.org/lkml/20200719210222.2811-1-ignat@cloudflare.com/ Signed-off-by: Masahiro Yamada Link: https://patch.msgid.link/20240924113342.32530-1-masahiroy@kernel.org Signed-off-by: Johannes Berg --- arch/um/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index c89575d05021..de735d59b6c8 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -94,7 +94,7 @@ config MAY_HAVE_RUNTIME_DEPS config STATIC_LINK bool "Force a static link" - depends on CC_CAN_LINK_STATIC_NO_RUNTIME_DEPS || !MAY_HAVE_RUNTIME_DEPS + depends on !MAY_HAVE_RUNTIME_DEPS help This option gives you the ability to force a static link of UML. Normally, UML is linked as a shared binary. This is inconvenient for -- cgit From c6ce72005d1af98b983cc27aaa770afa66a1ca90 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sat, 5 Oct 2024 01:38:21 +0200 Subject: um: remove auxiliary FP registers We do not need the extra save/restore of the FP registers when getting the fault information. This was originally added in commit 2f56debd77a8 ("uml: fix FP register corruption") but at that time the code was not saving/restoring the FP registers when switching to userspace. This was fixed in commit fbfe9c847edf ("um: Save FPU registers between task switches") and since then the auxiliary registers have not been useful. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241004233821.2130874-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/asm/thread_info.h | 2 -- arch/um/include/shared/os.h | 2 +- arch/um/kernel/process.c | 4 ++-- arch/um/os-Linux/skas/process.c | 25 ++++++------------------- 4 files changed, 9 insertions(+), 24 deletions(-) diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index c7b4b49826a2..4d2a768246bc 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -23,8 +23,6 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ struct thread_info *real_thread; /* Points to non-IRQ stack */ - unsigned long aux_fp_regs[FP_SIZE]; /* auxiliary fp_regs to save/restore - them out-of-band */ }; #define INIT_THREAD_INFO(tsk) \ diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index e54f64f55bb7..b511637e1f76 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -285,7 +285,7 @@ int protect(struct mm_id *mm_idp, unsigned long addr, /* skas/process.c */ extern int is_skas_winch(int pid, int fd, void *data); extern int start_userspace(unsigned long stub_stack); -extern void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs); +extern void userspace(struct uml_pt_regs *regs); extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)); extern void switch_threads(jmp_buf *me, jmp_buf *you); extern int start_idle_thread(void *stack, jmp_buf *switch_buf); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 701b7bb2525e..d45c79f82d7c 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -116,7 +116,7 @@ void new_thread_handler(void) * callback returns only if the kernel thread execs a process */ fn(arg); - userspace(¤t->thread.regs.regs, current_thread_info()->aux_fp_regs); + userspace(¤t->thread.regs.regs); } /* Called magically, see new_thread_handler above */ @@ -133,7 +133,7 @@ static void fork_handler(void) current->thread.prev_sched = NULL; - userspace(¤t->thread.regs.regs, current_thread_info()->aux_fp_regs); + userspace(¤t->thread.regs.regs); } int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index b6f656bcffb1..be666e02c44f 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -141,16 +141,10 @@ bad_wait: extern unsigned long current_stub_stack(void); -static void get_skas_faultinfo(int pid, struct faultinfo *fi, unsigned long *aux_fp_regs) +static void get_skas_faultinfo(int pid, struct faultinfo *fi) { int err; - err = get_fp_registers(pid, aux_fp_regs); - if (err < 0) { - printk(UM_KERN_ERR "save_fp_registers returned %d\n", - err); - fatal_sigsegv(); - } err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); if (err) { printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " @@ -164,18 +158,11 @@ static void get_skas_faultinfo(int pid, struct faultinfo *fi, unsigned long *aux * the stub stack page. We just have to copy it. */ memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); - - err = put_fp_registers(pid, aux_fp_regs); - if (err < 0) { - printk(UM_KERN_ERR "put_fp_registers returned %d\n", - err); - fatal_sigsegv(); - } } -static void handle_segv(int pid, struct uml_pt_regs *regs, unsigned long *aux_fp_regs) +static void handle_segv(int pid, struct uml_pt_regs *regs) { - get_skas_faultinfo(pid, ®s->faultinfo, aux_fp_regs); + get_skas_faultinfo(pid, ®s->faultinfo); segv(regs->faultinfo, 0, 1, NULL); } @@ -336,7 +323,7 @@ int start_userspace(unsigned long stub_stack) return err; } -void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) +void userspace(struct uml_pt_regs *regs) { int err, status, op, pid = userspace_pid[0]; siginfo_t si; @@ -435,11 +422,11 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) case SIGSEGV: if (PTRACE_FULL_FAULTINFO) { get_skas_faultinfo(pid, - ®s->faultinfo, aux_fp_regs); + ®s->faultinfo); (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, regs); } - else handle_segv(pid, regs, aux_fp_regs); + else handle_segv(pid, regs); break; case SIGTRAP + 0x80: handle_trap(pid, regs); -- cgit From cbb8e65e234e0139c0c516bb6b9110d210eecd3f Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 19 Sep 2024 14:45:02 +0200 Subject: um: Add generic stub_syscall1 function The 64bit version did not have a stub_syscall1 function yet. Add it as it will be useful to implement a static binary for stub loading. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240919124511.282088-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/x86/um/shared/sysdep/stub_64.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index 67f44284f1aa..8e4ff39dcade 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -28,6 +28,17 @@ static __always_inline long stub_syscall0(long syscall) return ret; } +static __always_inline long stub_syscall1(long syscall, long arg1) +{ + long ret; + + __asm__ volatile (__syscall + : "=a" (ret) + : "0" (syscall), "D" (arg1) : __syscall_clobber ); + + return ret; +} + static __always_inline long stub_syscall2(long syscall, long arg1, long arg2) { long ret; -- cgit From 32e8eaf263d9be014ba1970444f745682fa9c6c0 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 19 Sep 2024 14:45:03 +0200 Subject: um: use execveat to create userspace MMs Using clone will not undo features that have been enabled by libc. An example of this already happening is rseq, which could cause the kernel to read/write memory of the userspace process. In the future the standard library might also use mseal by default to protect itself, which would also thwart our attempts at unmapping everything. Solve all this by taking a step back and doing an execve into a tiny static binary that sets up the minimal environment required for the stub without using any standard library. That way we have a clean execution environment that is fully under the control of UML. Note that this changes things a bit as the FDs are not anymore shared with the kernel. Instead, we explicitly share the FDs for the physical memory and all existing iomem regions. Doing this is fine, as iomem regions cannot be added at runtime. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240919124511.282088-3-benjamin@sipsolutions.net [use pipe() instead of pipe2(), remove unneeded close() calls] Signed-off-by: Johannes Berg --- arch/um/Makefile | 3 +- arch/um/include/shared/skas/stub-data.h | 11 ++ arch/um/kernel/skas/.gitignore | 2 + arch/um/kernel/skas/Makefile | 33 +++++- arch/um/kernel/skas/stub_exe.c | 88 ++++++++++++++++ arch/um/kernel/skas/stub_exe_embed.S | 11 ++ arch/um/os-Linux/mem.c | 2 +- arch/um/os-Linux/skas/process.c | 177 ++++++++++++++++++++++---------- 8 files changed, 267 insertions(+), 60 deletions(-) create mode 100644 arch/um/kernel/skas/.gitignore create mode 100644 arch/um/kernel/skas/stub_exe.c create mode 100644 arch/um/kernel/skas/stub_exe_embed.S diff --git a/arch/um/Makefile b/arch/um/Makefile index 00b63bac5eff..31e367e8ab4d 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -61,7 +61,8 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ \ $(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap \ -Dlongjmp=kernel_longjmp -Dsetjmp=kernel_setjmp \ -Din6addr_loopback=kernel_in6addr_loopback \ - -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr + -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr \ + -D__close_range=kernel__close_range KBUILD_RUSTFLAGS += -Crelocation-model=pie diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 2b6b44759dfa..3fbdda727373 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -12,6 +12,17 @@ #include #include +struct stub_init_data { + unsigned long stub_start; + + int stub_code_fd; + unsigned long stub_code_offset; + int stub_data_fd; + unsigned long stub_data_offset; + + unsigned long segv_handler; +}; + #define STUB_NEXT_SYSCALL(s) \ ((struct stub_syscall *) (((unsigned long) s) + (s)->cmd_len)) diff --git a/arch/um/kernel/skas/.gitignore b/arch/um/kernel/skas/.gitignore new file mode 100644 index 000000000000..c3409ced0f38 --- /dev/null +++ b/arch/um/kernel/skas/.gitignore @@ -0,0 +1,2 @@ +stub_exe +stub_exe.dbg diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index 6f86d53e3d69..fbb61968055f 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -3,14 +3,43 @@ # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) # -obj-y := stub.o mmu.o process.o syscall.o uaccess.o +obj-y := stub.o mmu.o process.o syscall.o uaccess.o \ + stub_exe_embed.o + +# Stub executable + +stub_exe_objs-y := stub_exe.o + +stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F) + +# Object file containing the ELF executable +$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe + +$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE + $(call if_changed,stub_exe) + +$(obj)/stub_exe: OBJCOPYFLAGS := -S +$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE + $(call if_changed,objcopy) + +quiet_cmd_stub_exe = STUB_EXE $@ + cmd_stub_exe = $(CC) -nostdlib -o $@ \ + $(KBUILD_CFLAGS) $(STUB_EXE_LDFLAGS) \ + $(filter %.o,$^) + +STUB_EXE_LDFLAGS = -n -static + +targets += stub_exe.dbg stub_exe $(stub_exe_objs-y) + +# end # stub.o is in the stub, so it can't be built with profiling # GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> # disable it CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) -UNPROFILE_OBJS := stub.o +CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING) +UNPROFILE_OBJS := stub.o stub_exe.o KCOV_INSTRUMENT := n include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c new file mode 100644 index 000000000000..bc6ba2e4d805 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe.c @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include + +void _start(void); + +noinline static void real_init(void) +{ + struct stub_init_data init_data; + unsigned long res; + struct { + void *ss_sp; + int ss_flags; + size_t ss_size; + } stack = { + .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + }; + struct { + void *sa_handler_; + unsigned long sa_flags; + void *sa_restorer; + unsigned long long sa_mask; + } sa = { + /* Need to set SA_RESTORER (but the handler never returns) */ + .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000, + /* no need to mask any signals */ + .sa_mask = 0, + }; + + /* set a nice name */ + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace"); + + /* read information from STDIN and close it */ + res = stub_syscall3(__NR_read, 0, + (unsigned long)&init_data, sizeof(init_data)); + if (res != sizeof(init_data)) + stub_syscall1(__NR_exit, 10); + + stub_syscall1(__NR_close, 0); + + /* map stub code + data */ + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start, UM_KERN_PAGE_SIZE, + PROT_READ | PROT_EXEC, MAP_FIXED | MAP_SHARED, + init_data.stub_code_fd, init_data.stub_code_offset); + if (res != init_data.stub_start) + stub_syscall1(__NR_exit, 11); + + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start + UM_KERN_PAGE_SIZE, + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, + init_data.stub_data_fd, init_data.stub_data_offset); + if (res != init_data.stub_start + UM_KERN_PAGE_SIZE) + stub_syscall1(__NR_exit, 12); + + /* setup signal stack inside stub data */ + stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; + stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); + + /* register SIGSEGV handler */ + sa.sa_handler_ = (void *) init_data.segv_handler; + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0, + sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 13); + + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); + + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); + + stub_syscall1(__NR_exit, 14); + + __builtin_unreachable(); +} + +void _start(void) +{ + char *alloc; + + /* Make enough space for the stub (including space for alignment) */ + alloc = __builtin_alloca((1 + 2 * STUB_DATA_PAGES - 1) * UM_KERN_PAGE_SIZE); + asm volatile("" : "+r,m"(alloc) : : "memory"); + + real_init(); +} diff --git a/arch/um/kernel/skas/stub_exe_embed.S b/arch/um/kernel/skas/stub_exe_embed.S new file mode 100644 index 000000000000..6d8914fbe8f1 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe_embed.S @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include + +__INITDATA + +SYM_DATA_START(stub_exe_start) + .incbin "arch/um/kernel/skas/stub_exe" +SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end) + +__FINIT diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c index cf44d386f23c..857e3deab293 100644 --- a/arch/um/os-Linux/mem.c +++ b/arch/um/os-Linux/mem.c @@ -42,7 +42,7 @@ void kasan_map_memory(void *start, size_t len) } /* Set by make_tempfile() during early boot. */ -static char *tempdir = NULL; +char *tempdir = NULL; /* Check if dir is on tmpfs. Return 0 if yes, -1 if no or error. */ static int __init check_tmpfs(const char *dir) diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index be666e02c44f..2286486bb0f3 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -10,8 +10,11 @@ #include #include #include +#include +#include #include #include +#include #include #include #include @@ -176,69 +179,131 @@ static void handle_trap(int pid, struct uml_pt_regs *regs) extern char __syscall_stub_start[]; -/** - * userspace_tramp() - userspace trampoline - * @stack: pointer to the new userspace stack page - * - * The userspace trampoline is used to setup a new userspace process in start_userspace() after it was clone()'ed. - * This function will run on a temporary stack page. - * It ptrace()'es itself, then - * Two pages are mapped into the userspace address space: - * - STUB_CODE (with EXEC), which contains the skas stub code - * - STUB_DATA (with R/W), which contains a data page that is used to transfer certain data between the UML userspace process and the UML kernel. - * Also for the userspace process a SIGSEGV handler is installed to catch pagefaults in the userspace process. - * And last the process stops itself to give control to the UML kernel for this userspace process. - * - * Return: Always zero, otherwise the current userspace process is ended with non null exit() call - */ +static int stub_exe_fd; + static int userspace_tramp(void *stack) { - struct sigaction sa; - void *addr; - int fd; + char *const argv[] = { "uml-userspace", NULL }; + int pipe_fds[2]; unsigned long long offset; - unsigned long segv_handler = STUB_CODE + - (unsigned long) stub_segv_handler - - (unsigned long) __syscall_stub_start; - - ptrace(PTRACE_TRACEME, 0, 0, 0); - - signal(SIGTERM, SIG_DFL); - signal(SIGWINCH, SIG_IGN); - - fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset); - addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, - PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); - if (addr == MAP_FAILED) { - os_info("mapping mmap stub at 0x%lx failed, errno = %d\n", - STUB_CODE, errno); - exit(1); + struct stub_init_data init_data = { + .stub_start = STUB_START, + .segv_handler = STUB_CODE + + (unsigned long) stub_segv_handler - + (unsigned long) __syscall_stub_start, + }; + struct iomem_region *iomem; + int ret; + + init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), + &offset); + init_data.stub_code_offset = MMAP_OFFSET(offset); + + init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); + init_data.stub_data_offset = MMAP_OFFSET(offset); + + /* Set CLOEXEC on all FDs and then unset on all memory related FDs */ + close_range(0, ~0U, CLOSE_RANGE_CLOEXEC); + + fcntl(init_data.stub_data_fd, F_SETFD, 0); + for (iomem = iomem_regions; iomem; iomem = iomem->next) + fcntl(iomem->fd, F_SETFD, 0); + + /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */ + if (pipe(pipe_fds)) + exit(2); + + if (dup2(pipe_fds[0], 0) < 0) + exit(3); + close(pipe_fds[0]); + + /* Write init_data and close write side */ + ret = write(pipe_fds[1], &init_data, sizeof(init_data)); + close(pipe_fds[1]); + + if (ret != sizeof(init_data)) + exit(4); + + execveat(stub_exe_fd, "", argv, NULL, AT_EMPTY_PATH); + + exit(5); +} + +extern char stub_exe_start[]; +extern char stub_exe_end[]; + +extern char *tempdir; + +#define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX" + +#ifndef MFD_EXEC +#define MFD_EXEC 0x0010U +#endif + +static int __init init_stub_exe_fd(void) +{ + size_t written = 0; + char *tmpfile = NULL; + + stub_exe_fd = memfd_create("uml-userspace", + MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); + + if (stub_exe_fd < 0) { + printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!"); + + tmpfile = malloc(strlen(tempdir) + + strlen(STUB_EXE_NAME_TEMPLATE) + 1); + if (tmpfile == NULL) + panic("Failed to allocate memory for stub binary name"); + + strcpy(tmpfile, tempdir); + strcat(tmpfile, STUB_EXE_NAME_TEMPLATE); + + stub_exe_fd = mkstemp(tmpfile); + if (stub_exe_fd < 0) + panic("Could not create temporary file for stub binary: %d", + -errno); } - fd = phys_mapping(uml_to_phys(stack), &offset); - addr = mmap((void *) STUB_DATA, - STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_SHARED, fd, offset); - if (addr == MAP_FAILED) { - os_info("mapping segfault stack at 0x%lx failed, errno = %d\n", - STUB_DATA, errno); - exit(1); + while (written < stub_exe_end - stub_exe_start) { + ssize_t res = write(stub_exe_fd, stub_exe_start + written, + stub_exe_end - stub_exe_start - written); + if (res < 0) { + if (errno == EINTR) + continue; + + if (tmpfile) + unlink(tmpfile); + panic("Failed write stub binary: %d", -errno); + } + + written += res; } - set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; - sa.sa_sigaction = (void *) segv_handler; - sa.sa_restorer = NULL; - if (sigaction(SIGSEGV, &sa, NULL) < 0) { - os_info("%s - setting SIGSEGV handler failed - errno = %d\n", - __func__, errno); - exit(1); + if (!tmpfile) { + fcntl(stub_exe_fd, F_ADD_SEALS, + F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL); + } else { + if (fchmod(stub_exe_fd, 00500) < 0) { + unlink(tmpfile); + panic("Could not make stub binary executable: %d", + -errno); + } + + close(stub_exe_fd); + stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); + if (stub_exe_fd < 0) { + unlink(tmpfile); + panic("Could not reopen stub binary: %d", -errno); + } + + unlink(tmpfile); + free(tmpfile); } - kill(os_getpid(), SIGSTOP); return 0; } +__initcall(init_stub_exe_fd); int userspace_pid[NR_CPUS]; @@ -257,7 +322,7 @@ int start_userspace(unsigned long stub_stack) { void *stack; unsigned long sp; - int pid, status, n, flags, err; + int pid, status, n, err; /* setup a temporary stack page */ stack = mmap(NULL, UM_KERN_PAGE_SIZE, @@ -273,10 +338,10 @@ int start_userspace(unsigned long stub_stack) /* set stack pointer to the end of the stack page, so it can grow downwards */ sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; - flags = CLONE_FILES | SIGCHLD; - /* clone into new userspace process */ - pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack); + pid = clone(userspace_tramp, (void *) sp, + CLONE_VFORK | CLONE_VM | SIGCHLD, + (void *)stub_stack); if (pid < 0) { err = -errno; printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", -- cgit From 801e00d3a1b78b7f71675fae79946ff4aa3ee070 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 19 Sep 2024 14:45:04 +0200 Subject: um: Set parent death signal for userspace process Enable PR_SET_PDEATHSIG so that the UML userspace process will be killed when the kernel exits unexpectedly. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240919124511.282088-4-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/kernel/skas/stub_exe.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c index bc6ba2e4d805..04f75c577f1a 100644 --- a/arch/um/kernel/skas/stub_exe.c +++ b/arch/um/kernel/skas/stub_exe.c @@ -32,6 +32,9 @@ noinline static void real_init(void) /* set a nice name */ stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace"); + /* Make sure this process dies if the kernel dies */ + stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL); + /* read information from STDIN and close it */ res = stub_syscall3(__NR_read, 0, (unsigned long)&init_data, sizeof(init_data)); -- cgit From fdb2ecd35d327a1fc6bba69b97f85b494e1f4b6b Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 19 Sep 2024 14:45:05 +0200 Subject: um: Set parent death signal for winch thread/process The winch "thread" is really a separate process. Using prctl to set PR_SET_PDEATHSIG ensures that this separate thread will be killed if the UML kernel itself dies unexpectedly and does not perform proper cleanup. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240919124511.282088-5-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/drivers/chan_user.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c index a66e556012c4..1434114b2f34 100644 --- a/arch/um/drivers/chan_user.c +++ b/arch/um/drivers/chan_user.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "chan_user.h" #include #include @@ -161,6 +162,8 @@ static __noreturn int winch_thread(void *arg) int count; char c = 1; + prctl(PR_SET_PDEATHSIG, SIGKILL); + pty_fd = data->pty_fd; pipe_fd = data->pipe_fd; count = write(pipe_fd, &c, sizeof(c)); -- cgit From 77eb31b6003a158cce534fa9ca9df21fc82672ea Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 19 Sep 2024 14:45:06 +0200 Subject: um: Add compile time assert that stub fits on a page The code assumes that the stub code can fit into a single page. This is unlikely to ever change, but add a link time assert instead so that there will be no hard to debug error. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240919124511.282088-6-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/kernel/dyn.lds.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 3385d653ebd0..dc9d9a68af55 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -178,3 +178,6 @@ SECTIONS DISCARDS } + +ASSERT(__syscall_stub_end - __syscall_stub_start <= PAGE_SIZE, + "STUB code must not be larger than one page"); -- cgit From 91f0a0c5cc5bc863888a936fbd05394c6e284466 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 19 Sep 2024 14:45:07 +0200 Subject: um: Calculate stub data address relative to stub code Instead of using the current stack pointer, we can also use the current instruction to calculate where the stub data is. With this the stub data only needs to be aligned to a full page boundary. Changing this has the advantage that we do not have a hole in the memory space above the stub data (which would need to be explicitly cleared). Another motivation to do this is that with the planned addition of a SECCOMP based userspace the stack pointer may not be fully trustworthy. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240919124511.282088-7-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/kernel/um_arch.c | 6 ++---- arch/x86/um/shared/sysdep/stub_32.h | 10 +++++++--- arch/x86/um/shared/sysdep/stub_64.h | 8 +++++--- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 99cdf4b2d648..427024f32b9f 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -325,10 +325,8 @@ int __init linux_main(int argc, char **argv) add_arg(DEFAULT_COMMAND_LINE_CONSOLE); host_task_size = os_get_top_address(); - /* reserve a few pages for the stubs (taking care of data alignment) */ - /* align the data portion */ - BUILD_BUG_ON(!is_power_of_2(STUB_DATA_PAGES)); - stub_start = (host_task_size - 1) & ~(STUB_DATA_PAGES * PAGE_SIZE - 1); + /* reserve a few pages for the stubs */ + stub_start = host_task_size - STUB_DATA_PAGES * PAGE_SIZE; /* another page for the code portion */ stub_start -= PAGE_SIZE; host_task_size = stub_start; diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index 0b44a86dd346..631a18d0ff44 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -112,10 +112,14 @@ static __always_inline void *get_stub_data(void) unsigned long ret; asm volatile ( - "movl %%esp,%0 ;" - "andl %1,%0" + "call _here_%=;" + "_here_%=:" + "popl %0;" + "andl %1, %0 ;" + "addl %2, %0 ;" : "=a" (ret) - : "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1))); + : "g" (~(UM_KERN_PAGE_SIZE - 1)), + "g" (UM_KERN_PAGE_SIZE)); return (void *)ret; } diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index 8e4ff39dcade..17153dfd780a 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -117,10 +117,12 @@ static __always_inline void *get_stub_data(void) unsigned long ret; asm volatile ( - "movq %%rsp,%0 ;" - "andq %1,%0" + "lea 0(%%rip), %0;" + "andq %1, %0 ;" + "addq %2, %0 ;" : "=a" (ret) - : "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1))); + : "g" (~(UM_KERN_PAGE_SIZE - 1)), + "g" (UM_KERN_PAGE_SIZE)); return (void *)ret; } -- cgit From 830003c73d190259e45d0a99a0e3d14cb73e0af0 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 19 Sep 2024 14:45:08 +0200 Subject: um: Limit TASK_SIZE to the addressable range We may have a TASK_SIZE from the host that is bigger than UML is able to address with a three-level pagetable on 64-bit. Guard against that by clipping the maximum TASK_SIZE to the maximum addressable area. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240919124511.282088-8-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/kernel/um_arch.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 427024f32b9f..4452df4f2c4b 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -331,11 +331,16 @@ int __init linux_main(int argc, char **argv) stub_start -= PAGE_SIZE; host_task_size = stub_start; + /* Limit TASK_SIZE to what is addressable by the page table */ + task_size = host_task_size; + if (task_size > (unsigned long long) PTRS_PER_PGD * PGDIR_SIZE) + task_size = PTRS_PER_PGD * PGDIR_SIZE; + /* * TASK_SIZE needs to be PGDIR_SIZE aligned or else exit_mmap craps * out */ - task_size = host_task_size & PGDIR_MASK; + task_size = task_size & PGDIR_MASK; /* OS sanity checks that need to happen before the kernel runs */ os_early_checks(); -- cgit From 68b9883cc16ec2ce699d832ef60241b1a4e47d33 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 19 Sep 2024 14:45:09 +0200 Subject: um: Discover host_task_size from envp When loading the UML binary, the host kernel will place the stack at the highest possible address. It will then map the program name and environment variables onto the start of the stack. As such, an easy way to figure out the host_task_size is to use the highest pointer to an environment variable as a reference. Ensure that this works by disabling address layout randomization and re-executing UML in case it was enabled. This increases the available TASK_SIZE for 64 bit UML considerably. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240919124511.282088-9-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/as-layout.h | 2 +- arch/um/include/shared/os.h | 3 - arch/um/kernel/um_arch.c | 21 +++++- arch/um/os-Linux/main.c | 9 ++- arch/x86/um/os-Linux/Makefile | 2 +- arch/x86/um/os-Linux/task_size.c | 151 ------------------------------------- 6 files changed, 29 insertions(+), 159 deletions(-) delete mode 100644 arch/x86/um/os-Linux/task_size.c diff --git a/arch/um/include/shared/as-layout.h b/arch/um/include/shared/as-layout.h index 283226c34ca4..d9679c911e54 100644 --- a/arch/um/include/shared/as-layout.h +++ b/arch/um/include/shared/as-layout.h @@ -49,7 +49,7 @@ extern unsigned long brk_start; extern unsigned long host_task_size; extern unsigned long stub_start; -extern int linux_main(int argc, char **argv); +extern int linux_main(int argc, char **argv, char **envp); extern void uml_finishsetup(void); struct siginfo; diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index b511637e1f76..bf539fee7831 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -325,9 +325,6 @@ extern int __ignore_sigio_fd(int fd); /* tty.c */ extern int get_pty(void); -/* sys-$ARCH/task_size.c */ -extern unsigned long os_get_top_address(void); - long syscall(long number, ...); /* irqflags tracing */ diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 4452df4f2c4b..38cbb41a64bc 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -302,7 +302,24 @@ static void parse_cache_line(char *line) } } -int __init linux_main(int argc, char **argv) +static unsigned long get_top_address(char **envp) +{ + unsigned long top_addr = (unsigned long) &top_addr; + int i; + + /* The earliest variable should be after the program name in ELF */ + for (i = 0; envp[i]; i++) { + if ((unsigned long) envp[i] > top_addr) + top_addr = (unsigned long) envp[i]; + } + + top_addr &= ~(UM_KERN_PAGE_SIZE - 1); + top_addr += UM_KERN_PAGE_SIZE; + + return top_addr; +} + +int __init linux_main(int argc, char **argv, char **envp) { unsigned long avail, diff; unsigned long virtmem_size, max_physmem; @@ -324,7 +341,7 @@ int __init linux_main(int argc, char **argv) if (have_console == 0) add_arg(DEFAULT_COMMAND_LINE_CONSOLE); - host_task_size = os_get_top_address(); + host_task_size = get_top_address(envp); /* reserve a few pages for the stubs */ stub_start = host_task_size - STUB_DATA_PAGES * PAGE_SIZE; /* another page for the code portion */ diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index cf1179ed1aec..8a52c49c5361 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,12 @@ int __init main(int argc, char **argv, char **envp) char **new_argv; int ret, i, err; + /* Disable randomization and re-exec if it was changed successfully */ + ret = personality(PER_LINUX | ADDR_NO_RANDOMIZE); + if (ret >= 0 && (ret & (PER_LINUX | ADDR_NO_RANDOMIZE)) != + (PER_LINUX | ADDR_NO_RANDOMIZE)) + execve("/proc/self/exe", argv, envp); + set_stklim(); setup_env_path(); @@ -140,7 +147,7 @@ int __init main(int argc, char **argv, char **envp) #endif change_sig(SIGPIPE, 0); - ret = linux_main(argc, argv); + ret = linux_main(argc, argv, envp); /* * Disable SIGPROF - I have no idea why libc doesn't do this or turn diff --git a/arch/x86/um/os-Linux/Makefile b/arch/x86/um/os-Linux/Makefile index 5249bbc30dcd..77a308aaa5ec 100644 --- a/arch/x86/um/os-Linux/Makefile +++ b/arch/x86/um/os-Linux/Makefile @@ -3,7 +3,7 @@ # Licensed under the GPL # -obj-y = registers.o task_size.o mcontext.o +obj-y = registers.o mcontext.o obj-$(CONFIG_X86_32) += tls.o diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c deleted file mode 100644 index 1dc9adc20b1c..000000000000 --- a/arch/x86/um/os-Linux/task_size.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - -#ifdef __i386__ - -static jmp_buf buf; - -static void segfault(int sig) -{ - longjmp(buf, 1); -} - -static int page_ok(unsigned long page) -{ - unsigned long *address = (unsigned long *) (page << UM_KERN_PAGE_SHIFT); - unsigned long n = ~0UL; - void *mapped = NULL; - int ok = 0; - - /* - * First see if the page is readable. If it is, it may still - * be a VDSO, so we go on to see if it's writable. If not - * then try mapping memory there. If that fails, then we're - * still in the kernel area. As a sanity check, we'll fail if - * the mmap succeeds, but gives us an address different from - * what we wanted. - */ - if (setjmp(buf) == 0) - n = *address; - else { - mapped = mmap(address, UM_KERN_PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mapped == MAP_FAILED) - return 0; - if (mapped != address) - goto out; - } - - /* - * Now, is it writeable? If so, then we're in user address - * space. If not, then try mprotecting it and try the write - * again. - */ - if (setjmp(buf) == 0) { - *address = n; - ok = 1; - goto out; - } else if (mprotect(address, UM_KERN_PAGE_SIZE, - PROT_READ | PROT_WRITE) != 0) - goto out; - - if (setjmp(buf) == 0) { - *address = n; - ok = 1; - } - - out: - if (mapped != NULL) - munmap(mapped, UM_KERN_PAGE_SIZE); - return ok; -} - -unsigned long os_get_top_address(void) -{ - struct sigaction sa, old; - unsigned long bottom = 0; - /* - * A 32-bit UML on a 64-bit host gets confused about the VDSO at - * 0xffffe000. It is mapped, is readable, can be reprotected writeable - * and written. However, exec discovers later that it can't be - * unmapped. So, just set the highest address to be checked to just - * below it. This might waste some address space on 4G/4G 32-bit - * hosts, but shouldn't hurt otherwise. - */ - unsigned long top = 0xffffd000 >> UM_KERN_PAGE_SHIFT; - unsigned long test, original; - - printf("Locating the bottom of the address space ... "); - fflush(stdout); - - /* - * We're going to be longjmping out of the signal handler, so - * SA_DEFER needs to be set. - */ - sa.sa_handler = segfault; - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_NODEFER; - if (sigaction(SIGSEGV, &sa, &old)) { - perror("os_get_top_address"); - exit(1); - } - - /* Manually scan the address space, bottom-up, until we find - * the first valid page (or run out of them). - */ - for (bottom = 0; bottom < top; bottom++) { - if (page_ok(bottom)) - break; - } - - /* If we've got this far, we ran out of pages. */ - if (bottom == top) { - fprintf(stderr, "Unable to determine bottom of address " - "space.\n"); - exit(1); - } - - printf("0x%lx\n", bottom << UM_KERN_PAGE_SHIFT); - printf("Locating the top of the address space ... "); - fflush(stdout); - - original = bottom; - - /* This could happen with a 4G/4G split */ - if (page_ok(top)) - goto out; - - do { - test = bottom + (top - bottom) / 2; - if (page_ok(test)) - bottom = test; - else - top = test; - } while (top - bottom > 1); - -out: - /* Restore the old SIGSEGV handling */ - if (sigaction(SIGSEGV, &old, NULL)) { - perror("os_get_top_address"); - exit(1); - } - top <<= UM_KERN_PAGE_SHIFT; - printf("0x%lx\n", top); - - return top; -} - -#else - -unsigned long os_get_top_address(void) -{ - /* The old value of CONFIG_TOP_ADDR */ - return 0x7fc0002000; -} - -#endif -- cgit From e167cc7a95fe01e228e403ac90090f8613e7d8bc Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 19 Sep 2024 14:45:10 +0200 Subject: um: clear all memory in new userspace processes With the change to use execve() we can now safely clear the memory up to STUB_START as rseq will not be trying to use memory in that region. Also, on 64 bit the previous changes should mean that there is no usable memory range above the stub. Make the change and remove the comment as it is not needed anymore. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240919124511.282088-10-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/kernel/skas/mmu.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 886ed5e65674..d3fb506d5bd6 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -40,29 +40,8 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) goto out_free; } - /* - * Ensure the new MM is clean and nothing unwanted is mapped. - * - * TODO: We should clear the memory up to STUB_START to ensure there is - * nothing mapped there, i.e. we (currently) have: - * - * |- user memory -|- unused -|- stub -|- unused -| - * ^ TASK_SIZE ^ STUB_START - * - * Meaning we have two unused areas where we may still have valid - * mappings from our internal clone(). That isn't really a problem as - * userspace is not going to access them, but it is definitely not - * correct. - * - * However, we are "lucky" and if rseq is configured, then on 32 bit - * it will fall into the first empty range while on 64 bit it is going - * to use an anonymous mapping in the second range. As such, things - * continue to work for now as long as we don't start unmapping these - * areas. - * - * Change this to STUB_START once we have a clean userspace. - */ - unmap(new_id, 0, TASK_SIZE); + /* Ensure the new MM is clean and nothing unwanted is mapped */ + unmap(new_id, 0, STUB_START); return 0; -- cgit From 41ab5fe7471ff38d2909d1c93b88197a89c6a00f Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 19 Sep 2024 14:45:11 +0200 Subject: um: Switch to 4 level page tables on 64 bit The larger memory space is useful to support more applications inside UML. One example for this is ASAN instrumentation of userspace applications which requires addresses that would otherwise not be available. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240919124511.282088-11-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/Kconfig | 4 +- arch/um/include/asm/page.h | 14 ++++- arch/um/include/asm/pgalloc.h | 11 +++- arch/um/include/asm/pgtable-3level.h | 91 --------------------------- arch/um/include/asm/pgtable-4level.h | 119 +++++++++++++++++++++++++++++++++++ arch/um/include/asm/pgtable.h | 8 ++- arch/um/kernel/mem.c | 17 ++++- arch/x86/um/Kconfig | 3 - 8 files changed, 163 insertions(+), 104 deletions(-) delete mode 100644 arch/um/include/asm/pgtable-3level.h create mode 100644 arch/um/include/asm/pgtable-4level.h diff --git a/arch/um/Kconfig b/arch/um/Kconfig index de735d59b6c8..448454a3d8b5 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -209,8 +209,8 @@ config MMAPPER config PGTABLE_LEVELS int - default 3 if 3_LEVEL_PGTABLES - default 2 + default 4 if 64BIT + default 2 if !64BIT config UML_TIME_TRAVEL_SUPPORT bool diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index 8d2ac5e86cf5..f0ad80fc8c10 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -35,14 +35,22 @@ struct page; typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long pgd; } pgd_t; -#ifdef CONFIG_3_LEVEL_PGTABLES +#if CONFIG_PGTABLE_LEVELS > 2 + typedef struct { unsigned long pmd; } pmd_t; #define pmd_val(x) ((x).pmd) #define __pmd(x) ((pmd_t) { (x) } ) -#endif -#define pte_val(x) ((x).pte) +#if CONFIG_PGTABLE_LEVELS > 3 +typedef struct { unsigned long pud; } pud_t; +#define pud_val(x) ((x).pud) +#define __pud(x) ((pud_t) { (x) } ) + +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ + +#define pte_val(x) ((x).pte) #define pte_get_bits(p, bits) ((p).pte & (bits)) #define pte_set_bits(p, bits) ((p).pte |= (bits)) diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index de5e31c64793..04fb4e6969a4 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -31,7 +31,7 @@ do { \ tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) -#ifdef CONFIG_3_LEVEL_PGTABLES +#if CONFIG_PGTABLE_LEVELS > 2 #define __pmd_free_tlb(tlb, pmd, address) \ do { \ @@ -39,6 +39,15 @@ do { \ tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ } while (0) +#if CONFIG_PGTABLE_LEVELS > 3 + +#define __pud_free_tlb(tlb, pud, address) \ +do { \ + pagetable_pud_dtor(virt_to_ptdesc(pud)); \ + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pud)); \ +} while (0) + +#endif #endif #endif diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h deleted file mode 100644 index 3504a92dc485..000000000000 --- a/arch/um/include/asm/pgtable-3level.h +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright 2003 PathScale Inc - * Derived from include/asm-i386/pgtable.h - */ - -#ifndef __UM_PGTABLE_3LEVEL_H -#define __UM_PGTABLE_3LEVEL_H - -#include - -/* PGDIR_SHIFT determines what a third-level page table entry can map */ - -#define PGDIR_SHIFT 30 -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* PMD_SHIFT determines the size of the area a second-level page table can - * map - */ - -#define PMD_SHIFT 21 -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) - -/* - * entries per page directory level - */ - -#define PTRS_PER_PTE 512 -#define PTRS_PER_PMD 512 -#define PTRS_PER_PGD 512 - -#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) - -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \ - pte_val(e)) -#define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ - pmd_val(e)) -#define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ - pgd_val(e)) - -#define pud_none(x) (!(pud_val(x) & ~_PAGE_NEWPAGE)) -#define pud_bad(x) ((pud_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) -#define pud_present(x) (pud_val(x) & _PAGE_PRESENT) -#define pud_populate(mm, pud, pmd) \ - set_pud(pud, __pud(_PAGE_TABLE + __pa(pmd))) - -#define set_pud(pudptr, pudval) (*(pudptr) = (pudval)) - -static inline int pgd_newpage(pgd_t pgd) -{ - return(pgd_val(pgd) & _PAGE_NEWPAGE); -} - -static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; } - -#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) - -static inline void pud_clear (pud_t *pud) -{ - set_pud(pud, __pud(_PAGE_NEWPAGE)); -} - -#define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK) -#define pud_pgtable(pud) ((pmd_t *) __va(pud_val(pud) & PAGE_MASK)) - -static inline unsigned long pte_pfn(pte_t pte) -{ - return phys_to_pfn(pte_val(pte)); -} - -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) -{ - pte_t pte; - phys_t phys = pfn_to_phys(page_nr); - - pte_set_val(pte, phys, pgprot); - return pte; -} - -static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) -{ - return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); -} - -#endif - diff --git a/arch/um/include/asm/pgtable-4level.h b/arch/um/include/asm/pgtable-4level.h new file mode 100644 index 000000000000..f912fcc16b7a --- /dev/null +++ b/arch/um/include/asm/pgtable-4level.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2003 PathScale Inc + * Derived from include/asm-i386/pgtable.h + */ + +#ifndef __UM_PGTABLE_4LEVEL_H +#define __UM_PGTABLE_4LEVEL_H + +#include + +/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ + +#define PGDIR_SHIFT 39 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +/* PUD_SHIFT determines the size of the area a third-level page table can + * map + */ + +#define PUD_SHIFT 30 +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) + +/* PMD_SHIFT determines the size of the area a second-level page table can + * map + */ + +#define PMD_SHIFT 21 +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) + +/* + * entries per page directory level + */ + +#define PTRS_PER_PTE 512 +#define PTRS_PER_PMD 512 +#define PTRS_PER_PUD 512 +#define PTRS_PER_PGD 512 + +#define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \ + pte_val(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ + pmd_val(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), \ + pud_val(e)) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ + pgd_val(e)) + +#define pud_none(x) (!(pud_val(x) & ~_PAGE_NEWPAGE)) +#define pud_bad(x) ((pud_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define pud_present(x) (pud_val(x) & _PAGE_PRESENT) +#define pud_populate(mm, pud, pmd) \ + set_pud(pud, __pud(_PAGE_TABLE + __pa(pmd))) + +#define set_pud(pudptr, pudval) (*(pudptr) = (pudval)) + +#define p4d_none(x) (!(p4d_val(x) & ~_PAGE_NEWPAGE)) +#define p4d_bad(x) ((p4d_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define p4d_present(x) (p4d_val(x) & _PAGE_PRESENT) +#define p4d_populate(mm, p4d, pud) \ + set_p4d(p4d, __p4d(_PAGE_TABLE + __pa(pud))) + +#define set_p4d(p4dptr, p4dval) (*(p4dptr) = (p4dval)) + + +static inline int pgd_newpage(pgd_t pgd) +{ + return(pgd_val(pgd) & _PAGE_NEWPAGE); +} + +static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; } + +#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) + +static inline void pud_clear (pud_t *pud) +{ + set_pud(pud, __pud(_PAGE_NEWPAGE)); +} + +static inline void p4d_clear (p4d_t *p4d) +{ + set_p4d(p4d, __p4d(_PAGE_NEWPAGE)); +} + +#define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK) +#define pud_pgtable(pud) ((pmd_t *) __va(pud_val(pud) & PAGE_MASK)) + +#define p4d_page(p4d) phys_to_page(p4d_val(p4d) & PAGE_MASK) +#define p4d_pgtable(p4d) ((pud_t *) __va(p4d_val(p4d) & PAGE_MASK)) + +static inline unsigned long pte_pfn(pte_t pte) +{ + return phys_to_pfn(pte_val(pte)); +} + +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + pte_t pte; + phys_t phys = pfn_to_phys(page_nr); + + pte_set_val(pte, phys, pgprot); + return pte; +} + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); +} + +#endif diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 83373c9963e7..bd7a9593705f 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -24,10 +24,12 @@ /* We borrow bit 10 to store the exclusive marker in swap PTEs. */ #define _PAGE_SWP_EXCLUSIVE 0x400 -#ifdef CONFIG_3_LEVEL_PGTABLES -#include -#else +#if CONFIG_PGTABLE_LEVELS == 4 +#include +#elif CONFIG_PGTABLE_LEVELS == 2 #include +#else +#error "Unsupported number of page table levels" #endif extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 5026668dc054..53248ed04771 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -95,7 +95,7 @@ static void __init one_page_table_init(pmd_t *pmd) static void __init one_md_table_init(pud_t *pud) { -#ifdef CONFIG_3_LEVEL_PGTABLES +#if CONFIG_PGTABLE_LEVELS > 2 pmd_t *pmd_table = (pmd_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!pmd_table) panic("%s: Failed to allocate %lu bytes align=%lx\n", @@ -106,6 +106,19 @@ static void __init one_md_table_init(pud_t *pud) #endif } +static void __init one_ud_table_init(p4d_t *p4d) +{ +#if CONFIG_PGTABLE_LEVELS > 3 + pud_t *pud_table = (pud_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); + if (!pud_table) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); + + set_p4d(p4d, __p4d(_KERNPG_TABLE + (unsigned long) __pa(pud_table))); + BUG_ON(pud_table != pud_offset(p4d, 0)); +#endif +} + static void __init fixrange_init(unsigned long start, unsigned long end, pgd_t *pgd_base) { @@ -123,6 +136,8 @@ static void __init fixrange_init(unsigned long start, unsigned long end, for ( ; (i < PTRS_PER_PGD) && (vaddr < end); pgd++, i++) { p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) + one_ud_table_init(p4d); pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) one_md_table_init(pud); diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 05cb7bb291e0..986045d5e638 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -29,9 +29,6 @@ config X86_64 def_bool 64BIT select MODULES_USE_ELF_RELA -config 3_LEVEL_PGTABLES - def_bool 64BIT - config ARCH_HAS_SC_SIGNALS def_bool !64BIT -- cgit From 89350defd1f0eb5a58a7e1155d9e322080f0bf15 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 16 Oct 2024 14:12:37 -0700 Subject: um: Fix passing '-n' to linker for stub_exe When building stub_exe with clang, there is an error because '-n' is not a recognized flag by the clang driver (which is being used to invoke the linker): clang: error: unknown argument: '-n' '-n' should be passed along to the linker, as it is the short flag for '--nmagic', so prefix it with '-Wl,'. Fixes: 32e8eaf263d9 ("um: use execveat to create userspace MMs") Signed-off-by: Nathan Chancellor Link: https://patch.msgid.link/20241016-uml-fix-stub_exe-clang-v1-1-3d6381dc5a78@kernel.org Signed-off-by: Johannes Berg --- arch/um/kernel/skas/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index fbb61968055f..f93db893b823 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -27,7 +27,7 @@ quiet_cmd_stub_exe = STUB_EXE $@ $(KBUILD_CFLAGS) $(STUB_EXE_LDFLAGS) \ $(filter %.o,$^) -STUB_EXE_LDFLAGS = -n -static +STUB_EXE_LDFLAGS = -Wl,-n -static targets += stub_exe.dbg stub_exe $(stub_exe_objs-y) -- cgit From 1e3071d629b2e2cd7faeb8de2f88ba31cfd7231a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 16 Oct 2024 14:12:38 -0700 Subject: um: Disable auto variable initialization for stub_exe.c When automatic variable initialization is enabled via CONFIG_INIT_STACK_ALL_{PATTERN,ZERO}, clang will insert a call to memset() to initialize an object created with __builtin_alloca(). This ultimately breaks the build when linking stub_exe because it is a standalone executable that does not include or link against memset(). ld: arch/um/kernel/skas/stub_exe.o: in function `_start': arch/um/kernel/skas/stub_exe.c:83:(.ltext+0x15): undefined reference to `memset' Disable automatic variable initialization for stub_exe.c by passing the default value of 'uninitialized' to '-ftrivial-auto-var-init', which avoids generating the call to memset(). This code is small and runs quickly as it is just designed to set up an environment, so stack variable initialization is unnecessary overhead for little gain. Fixes: 32e8eaf263d9 ("um: use execveat to create userspace MMs") Signed-off-by: Nathan Chancellor Link: https://patch.msgid.link/20241016-uml-fix-stub_exe-clang-v1-2-3d6381dc5a78@kernel.org Signed-off-by: Johannes Berg --- arch/um/kernel/skas/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index f93db893b823..f6a219074772 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -39,6 +39,11 @@ targets += stub_exe.dbg stub_exe $(stub_exe_objs-y) CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING) + +# Clang will call memset() from __builtin_alloca() when stack variable +# initialization is enabled, which is used in stub_exe.c. +CFLAGS_stub_exe.o += $(call cc-option, -ftrivial-auto-var-init=uninitialized) + UNPROFILE_OBJS := stub.o stub_exe.o KCOV_INSTRUMENT := n -- cgit From 8508a5e0e9db3932ca43651f86ba1042a1e9f4ca Mon Sep 17 00:00:00 2001 From: David Gow Date: Fri, 18 Oct 2024 07:10:08 +0800 Subject: um: Fix misaligned stack in stub_exe The stub_exe could segfault when built with some compilers (e.g. gcc 13.2.0), as SSE instructions which relied on stack alignment could be generated, but the stack was misaligned. This seems to be due to the __start entry point being run with a 16-byte aligned stack, but the x86_64 SYSV ABI wanting the stack to be so aligned _before_ a function call (so it is misaligned when the function is entered due to the return address being pushed). The function prologue then realigns it. Because the entry point is never _called_, and hence there is no return address, the prologue is therefore actually misaligning it, and causing the generated movaps instructions to SIGSEGV. This results in the following error: start_userspace : expected SIGSTOP, got status = 139 Don't generate this prologue for __start by using __attribute__((naked)), which resolves the issue. Fixes: 32e8eaf263d9 ("um: use execveat to create userspace MMs") Signed-off-by: David Gow Link: https://lore.kernel.org/linux-um/CABVgOS=boUoG6=LHFFhxEd8H8jDP1zOaPKFEjH+iy2n2Q5S2aQ@mail.gmail.com/ Link: https://patch.msgid.link/20241017231007.1500497-2-davidgow@google.com Signed-off-by: Johannes Berg --- arch/um/kernel/skas/stub_exe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c index 04f75c577f1a..722ce6267476 100644 --- a/arch/um/kernel/skas/stub_exe.c +++ b/arch/um/kernel/skas/stub_exe.c @@ -79,7 +79,7 @@ noinline static void real_init(void) __builtin_unreachable(); } -void _start(void) +__attribute__((naked)) void _start(void) { char *alloc; -- cgit From 14d4a7b516e993cf3926758a7ede569d8e119855 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 22 Oct 2024 14:02:38 +0200 Subject: um: make stub_exe _start() pure inline asm Since __attribute__((naked)) cannot be used with functions containing C statements, just generate the few instructions it needs in assembly directly. While at it, fix the stack usage ("1 + 2*x - 1" is odd) and document what it must do, and why it must adjust the stack. Fixes: 8508a5e0e9db ("um: Fix misaligned stack in stub_exe") Link: https://lore.kernel.org/linux-um/CABVgOSntH-uoOFMP5HwMXjx_f1osMnVdhgKRKm4uz6DFm2Lb8Q@mail.gmail.com/ Reviewed-by: David Gow Signed-off-by: Johannes Berg --- arch/um/kernel/skas/stub_exe.c | 18 +++++++++++------- arch/x86/um/shared/sysdep/stub_32.h | 8 ++++++++ arch/x86/um/shared/sysdep/stub_64.h | 8 ++++++++ 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c index 722ce6267476..23c99b285e82 100644 --- a/arch/um/kernel/skas/stub_exe.c +++ b/arch/um/kernel/skas/stub_exe.c @@ -81,11 +81,15 @@ noinline static void real_init(void) __attribute__((naked)) void _start(void) { - char *alloc; - - /* Make enough space for the stub (including space for alignment) */ - alloc = __builtin_alloca((1 + 2 * STUB_DATA_PAGES - 1) * UM_KERN_PAGE_SIZE); - asm volatile("" : "+r,m"(alloc) : : "memory"); - - real_init(); + /* + * Since the stack after exec() starts at the top-most address, + * but that's exactly where we also want to map the stub data + * and code, this must: + * - push the stack by 1 code and STUB_DATA_PAGES data pages + * - call real_init() + * This way, real_init() can use the stack normally, while the + * original stack further down (higher address) will become + * inaccessible after the mmap() calls above. + */ + stub_start(real_init); } diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index 631a18d0ff44..390988132c0a 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -123,4 +123,12 @@ static __always_inline void *get_stub_data(void) return (void *)ret; } + +#define stub_start(fn) \ + asm volatile ( \ + "subl %0,%%esp ;" \ + "movl %1, %%eax ; " \ + "call *%%eax ;" \ + :: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE), \ + "i" (&fn)) #endif diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index 17153dfd780a..294affbec742 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -126,4 +126,12 @@ static __always_inline void *get_stub_data(void) return (void *)ret; } + +#define stub_start(fn) \ + asm volatile ( \ + "subq %0,%%rsp ;" \ + "movq %1,%%rax ;" \ + "call *%%rax ;" \ + :: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE), \ + "i" (&fn)) #endif -- cgit From 031acdcfb566ba18ffb57d51abf357a5e350424b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 10 Oct 2024 16:14:12 +0200 Subject: um: restore process name After the execve() to disable ASLR, comm is now "exe", which is a bit confusing. Use readlink() to get this to the right name again. Disable stack frame size warnings on main.o since it's part of the initial userspace and can use larger stack. Fixes: 68b9883cc16e ("um: Discover host_task_size from envp") Link: https://patch.msgid.link/20241010161411.c576e2aeb3e5.I244d4f34b8a8555ee5bec0e1cf5027bce4cc491b@changeid Signed-off-by: Johannes Berg --- arch/um/os-Linux/Makefile | 2 ++ arch/um/os-Linux/main.c | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 544e0b344c75..049dfa5bc9c6 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -12,6 +12,8 @@ obj-y = execvp.o file.o helper.o irq.o main.o mem.o process.o \ CFLAGS_signal.o += -Wframe-larger-than=4096 +CFLAGS_main.o += -Wno-frame-larger-than + obj-$(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) += elf_aux.o USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \ diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index 8a52c49c5361..5e0cba5aee93 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -112,8 +113,17 @@ int __init main(int argc, char **argv, char **envp) /* Disable randomization and re-exec if it was changed successfully */ ret = personality(PER_LINUX | ADDR_NO_RANDOMIZE); if (ret >= 0 && (ret & (PER_LINUX | ADDR_NO_RANDOMIZE)) != - (PER_LINUX | ADDR_NO_RANDOMIZE)) - execve("/proc/self/exe", argv, envp); + (PER_LINUX | ADDR_NO_RANDOMIZE)) { + char buf[PATH_MAX] = {}; + ssize_t ret; + + ret = readlink("/proc/self/exe", buf, sizeof(buf)); + if (ret < 0 || ret >= sizeof(buf)) { + perror("readlink failure"); + exit(1); + } + execve(buf, argv, envp); + } set_stklim(); -- cgit From 188b64f288a434bed3ef21ec59f00c996ecb0346 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 10 Oct 2024 22:45:14 +0200 Subject: um: remove fault_catcher infrastructure This was perhaps intended to do _nofault copies, but the real reason is lost to history. Remove this, it's not needed, and using longjmp() out of the middle of the signal handler with all the state it has modified is not going to be a good idea anyway. Link: https://patch.msgid.link/20241010224513.901c4d390b3e.Ia74742668b44603c1ca23dd36f90e964e6e7ee55@changeid Signed-off-by: Johannes Berg --- arch/um/include/asm/processor-generic.h | 3 --- arch/um/include/shared/kern_util.h | 1 - arch/um/kernel/trap.c | 16 ---------------- arch/um/os-Linux/signal.c | 2 +- 4 files changed, 1 insertion(+), 21 deletions(-) diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index bce4595798da..02e759a4a435 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -22,8 +22,6 @@ struct mm_struct; struct thread_struct { struct pt_regs regs; struct pt_regs *segv_regs; - void *fault_addr; - jmp_buf *fault_catcher; struct task_struct *prev_sched; struct arch_thread arch; jmp_buf switch_buf; @@ -38,7 +36,6 @@ struct thread_struct { #define INIT_THREAD \ { \ .regs = EMPTY_REGS, \ - .fault_addr = NULL, \ .prev_sched = NULL, \ .arch = INIT_ARCH_THREAD, \ .request = { } \ diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index d8ffd2db168e..f21dc8517538 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -60,7 +60,6 @@ extern unsigned long from_irq_stack(int nested); extern int singlestepping(void); extern void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); -extern void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs); extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); extern void fatal_sigsegv(void) __attribute__ ((noreturn)); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 97c8df9c4401..cdaee3e94273 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -201,7 +201,6 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, struct uml_pt_regs *regs) { - jmp_buf *catcher; int si_code; int err; int is_write = FAULT_WRITE(fi); @@ -246,15 +245,8 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, address = 0; } - catcher = current->thread.fault_catcher; if (!err) goto out; - else if (catcher != NULL) { - current->thread.fault_addr = (void *) address; - UML_LONGJMP(catcher, 1); - } - else if (current->thread.fault_addr != NULL) - panic("fault_addr set but no fault catcher"); else if (!is_user && arch_fixup(ip, regs)) goto out; @@ -310,14 +302,6 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) } } -void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs) -{ - if (current->thread.fault_catcher != NULL) - UML_LONGJMP(current->thread.fault_catcher, 1); - else - relay_signal(sig, si, regs); -} - void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) { do_IRQ(WINCH_IRQ, regs); diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index b11ed66c8bb0..1978eaa557e9 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -26,7 +26,7 @@ void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { [SIGFPE] = relay_signal, [SIGILL] = relay_signal, [SIGWINCH] = winch, - [SIGBUS] = bus_handler, + [SIGBUS] = relay_signal, [SIGSEGV] = segv_handler, [SIGIO] = sigio_handler, }; -- cgit From a34d105350b2d4b28cc2429414a16cbeeab7d31a Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 12:04:33 +0800 Subject: um: Remove UML specific debug parameter It does nothing but emit a warning when 'debug' is provided in the kernel command line. It can be a bit annoying, as 'debug' is also a valid kernel parameter to enable kernel debugging. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011040441.1586345-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/kernel/um_arch.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 38cbb41a64bc..6d755a37d5c4 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -167,19 +167,6 @@ __uml_setup("root=", uml_root_setup, " root=/dev/ubd5\n\n" ); -static int __init no_skas_debug_setup(char *line, int *add) -{ - os_warn("'debug' is not necessary to gdb UML in skas mode - run\n"); - os_warn("'gdb linux'\n"); - - return 0; -} - -__uml_setup("debug", no_skas_debug_setup, -"debug\n" -" this flag is not needed to run gdb on UML in skas mode\n\n" -); - static int __init uml_console_setup(char *line, int *add) { have_console = 1; -- cgit From cb055b2135d82db48ca0a3ff744b158fd5400eb2 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 12:04:34 +0800 Subject: um: Do not propagate mem parameter to kernel This parameter is UML specific and is unknown to kernel. It should not be propagated to kernel, otherwise it will be passed to user space as an environment option by kernel with a warning like: Unknown kernel command line parameters "mem=2G", will be passed to user space. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011040441.1586345-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/kernel/physmem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 636830fe00f2..d60df3626727 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -139,6 +139,8 @@ EXPORT_SYMBOL(phys_mapping); static int __init uml_mem_setup(char *line, int *add) { char *retptr; + + *add = 0; physmem_size = memparse(line,&retptr); return 0; } -- cgit From 5c78a58388e732974986832e6f70c7f58ee3415e Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 12:04:35 +0800 Subject: um: Do not propagate uml_dir parameter to kernel This parameter is UML specific and is unknown to kernel. It should not be propagated to kernel, otherwise it will be passed to user space as an environment option by kernel with a warning like: Unknown kernel command line parameters "uml_dir=/foo", will be passed to user space. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011040441.1586345-4-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/os-Linux/umid.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index e09d65b05d1c..eb523ab1e218 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -358,6 +358,8 @@ char *get_umid(void) static int __init set_uml_dir(char *name, int *add) { + *add = 0; + if (*name == '\0') { os_warn("uml_dir can't be an empty string\n"); return 0; -- cgit From 7da0c611579b722ac649cd5bc4a0e8151f70e9ee Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 12:04:36 +0800 Subject: um: Do not propagate dtb parameter to kernel This parameter is UML specific and is unknown to kernel. It should not be propagated to kernel, otherwise it will be passed to user space as an environment option by kernel with a warning like: Unknown kernel command line parameters "dtb=/foo", will be passed to user space. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011040441.1586345-5-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/kernel/dtb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/kernel/dtb.c b/arch/um/kernel/dtb.c index 4954188a6a09..73369446ed1e 100644 --- a/arch/um/kernel/dtb.c +++ b/arch/um/kernel/dtb.c @@ -31,6 +31,7 @@ void uml_dtb_init(void) static int __init uml_dtb_setup(char *line, int *add) { + *add = 0; dtb = line; return 0; } -- cgit From 45aa6026d16792bedf5be5aaf70b2779cc608a28 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 12:04:37 +0800 Subject: um: Do not propagate noreboot parameter to kernel This parameter is UML specific and is unknown to kernel. It should not be propagated to kernel, otherwise it could be passed to user space as a command line option by kernel with a warning like: Unknown kernel command line parameters "noreboot", will be passed to user space. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011040441.1586345-6-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/os-Linux/skas/process.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 2286486bb0f3..8b328eb9d1f7 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -622,6 +622,7 @@ static bool noreboot; static int __init noreboot_cmd_param(char *str, int *add) { + *add = 0; noreboot = true; return 0; } -- cgit From d26627b2c7b50c0b8a5ee3cafa7a3e3913c25a92 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 12:04:38 +0800 Subject: hostfs: Do not propagate hostfs parameter to kernel This parameter is UML specific and is unknown to kernel. It should not be propagated to kernel, otherwise it will be passed to user space as an environment option by kernel with a warning like: Unknown kernel command line parameters "hostfs=/foo", will be passed to user space. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011040441.1586345-7-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- fs/hostfs/hostfs_kern.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 6d1cf2436ead..8d47c6b70c9f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -57,6 +57,7 @@ static int __init hostfs_args(char *options, int *add) { char *ptr; + *add = 0; ptr = strchr(options, ','); if (ptr != NULL) *ptr++ = '\0'; -- cgit From 4e2e4ea0d8029874ef7d8f326e91c6ec6993a1bf Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 12:04:39 +0800 Subject: um: hostaudio: Do not propagate dsp parameter to kernel This parameter is UML specific and is unknown to kernel. It should not be propagated to kernel, otherwise it will trigger a warning and be passed to user space as an environment option. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011040441.1586345-8-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/hostaudio_kern.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c index 9d228878cea2..09af903b75ae 100644 --- a/arch/um/drivers/hostaudio_kern.c +++ b/arch/um/drivers/hostaudio_kern.c @@ -48,6 +48,7 @@ MODULE_PARM_DESC(mixer, MIXER_HELP); #ifndef MODULE static int set_dsp(char *name, int *add) { + *add = 0; dsp = name; return 0; } -- cgit From 3f48113df3495540fb492760dfb092deaccc9f7d Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 12:04:40 +0800 Subject: um: hostaudio: Do not propagate mixer parameter to kernel This parameter is UML specific and is unknown to kernel. It should not be propagated to kernel, otherwise it will trigger a warning and be passed to user space as an environment option. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011040441.1586345-9-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/hostaudio_kern.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c index 09af903b75ae..0ac149de1ac0 100644 --- a/arch/um/drivers/hostaudio_kern.c +++ b/arch/um/drivers/hostaudio_kern.c @@ -57,6 +57,7 @@ __uml_setup("dsp=", set_dsp, "dsp=\n" DSP_HELP); static int set_mixer(char *name, int *add) { + *add = 0; mixer = name; return 0; } -- cgit From b9ee5fc8f4aa8c6547f9853e0480949c155163cb Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 12:04:41 +0800 Subject: um: Do not propagate initrd parameter to kernel This parameter is UML specific. It specifies the name of the file containing the initrd image, which is unknown to kernel. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011040441.1586345-10-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/kernel/initrd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c index 47b8cb1a1156..99dba827461c 100644 --- a/arch/um/kernel/initrd.c +++ b/arch/um/kernel/initrd.c @@ -34,6 +34,7 @@ int __init read_initrd(void) static int __init uml_initrd_setup(char *line, int *add) { + *add = 0; initrd = line; return 0; } -- cgit From 90daca7c8f6f33e9482591446d2e76b18a21be49 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 11 Oct 2024 11:18:26 +0200 Subject: um: vdso: Always reject undefined references in during linking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using a custom script to detect and fail on undefined references, use --no-undefined for all VDSO linker invocations. Drop the now unused checkundef.sh script. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20241011-vdso-checkundef-v1-2-1a46e0352d20@linutronix.de Signed-off-by: Johannes Berg --- arch/x86/um/vdso/Makefile | 5 ++--- arch/x86/um/vdso/checkundef.sh | 11 ----------- 2 files changed, 2 insertions(+), 14 deletions(-) delete mode 100644 arch/x86/um/vdso/checkundef.sh diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 6a77ea6434ff..7478d11dacb7 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -56,7 +56,6 @@ CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage quiet_cmd_vdso = VDSO $@ cmd_vdso = $(CC) -nostdlib -o $@ \ $(CC_FLAGS_LTO) $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ - sh $(src)/checkundef.sh '$(NM)' '$@' + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) -VDSO_LDFLAGS = -fPIC -shared -Wl,--hash-style=sysv -z noexecstack +VDSO_LDFLAGS = -fPIC -shared -Wl,--hash-style=sysv -z noexecstack -Wl,--no-undefined diff --git a/arch/x86/um/vdso/checkundef.sh b/arch/x86/um/vdso/checkundef.sh deleted file mode 100644 index 8e3ea6bb956f..000000000000 --- a/arch/x86/um/vdso/checkundef.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -nm="$1" -file="$2" -$nm "$file" | grep '^ *U' > /dev/null 2>&1 -if [ $? -eq 1 ]; then - exit 0 -else - echo "$file: undefined symbols found" >&2 - exit 1 -fi -- cgit From 2717c6b649e1840328c2758a478bf4034a22ac3e Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 18:23:53 +0800 Subject: um: Abandon the _PAGE_NEWPROT bit When a PTE is updated in the page table, the _PAGE_NEWPAGE bit will always be set. And the corresponding page will always be mapped or unmapped depending on whether the PTE is present or not. The check on the _PAGE_NEWPROT bit is not really reachable. Abandoning it will allow us to simplify the code and remove the unreachable code. Reviewed-by: Benjamin Berg Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011102354.1682626-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/include/asm/pgtable.h | 37 +++--------------- arch/um/include/asm/tlbflush.h | 4 +- arch/um/include/shared/os.h | 2 - arch/um/include/shared/skas/stub-data.h | 1 - arch/um/kernel/skas/stub.c | 10 ----- arch/um/kernel/tlb.c | 66 ++++++++++++++------------------- arch/um/os-Linux/skas/mem.c | 21 ----------- 7 files changed, 36 insertions(+), 105 deletions(-) diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index bd7a9593705f..6b1317400190 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -12,7 +12,6 @@ #define _PAGE_PRESENT 0x001 #define _PAGE_NEWPAGE 0x002 -#define _PAGE_NEWPROT 0x004 #define _PAGE_RW 0x020 #define _PAGE_USER 0x040 #define _PAGE_ACCESSED 0x080 @@ -151,23 +150,12 @@ static inline int pte_newpage(pte_t pte) return pte_get_bits(pte, _PAGE_NEWPAGE); } -static inline int pte_newprot(pte_t pte) -{ - return(pte_present(pte) && (pte_get_bits(pte, _PAGE_NEWPROT))); -} - /* * ================================= * Flags setting section. * ================================= */ -static inline pte_t pte_mknewprot(pte_t pte) -{ - pte_set_bits(pte, _PAGE_NEWPROT); - return(pte); -} - static inline pte_t pte_mkclean(pte_t pte) { pte_clear_bits(pte, _PAGE_DIRTY); @@ -182,19 +170,14 @@ static inline pte_t pte_mkold(pte_t pte) static inline pte_t pte_wrprotect(pte_t pte) { - if (likely(pte_get_bits(pte, _PAGE_RW))) - pte_clear_bits(pte, _PAGE_RW); - else - return pte; - return(pte_mknewprot(pte)); + pte_clear_bits(pte, _PAGE_RW); + return pte; } static inline pte_t pte_mkread(pte_t pte) { - if (unlikely(pte_get_bits(pte, _PAGE_USER))) - return pte; pte_set_bits(pte, _PAGE_USER); - return(pte_mknewprot(pte)); + return pte; } static inline pte_t pte_mkdirty(pte_t pte) @@ -211,18 +194,14 @@ static inline pte_t pte_mkyoung(pte_t pte) static inline pte_t pte_mkwrite_novma(pte_t pte) { - if (unlikely(pte_get_bits(pte, _PAGE_RW))) - return pte; pte_set_bits(pte, _PAGE_RW); - return(pte_mknewprot(pte)); + return pte; } static inline pte_t pte_mkuptodate(pte_t pte) { pte_clear_bits(pte, _PAGE_NEWPAGE); - if(pte_present(pte)) - pte_clear_bits(pte, _PAGE_NEWPROT); - return(pte); + return pte; } static inline pte_t pte_mknewpage(pte_t pte) @@ -236,12 +215,10 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval) pte_copy(*pteptr, pteval); /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so - * fix_range knows to unmap it. _PAGE_NEWPROT is specific to - * mapped pages. + * update_pte_range knows to unmap it. */ *pteptr = pte_mknewpage(*pteptr); - if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); } #define PFN_PTE_SHIFT PAGE_SHIFT @@ -298,8 +275,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) ({ pte_t pte; \ \ pte_set_val(pte, page_to_phys(page), (pgprot)); \ - if (pte_present(pte)) \ - pte_mknewprot(pte_mknewpage(pte)); \ pte;}) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) diff --git a/arch/um/include/asm/tlbflush.h b/arch/um/include/asm/tlbflush.h index db997976b6ea..13a3009942be 100644 --- a/arch/um/include/asm/tlbflush.h +++ b/arch/um/include/asm/tlbflush.h @@ -9,8 +9,8 @@ #include /* - * In UML, we need to sync the TLB over by using mmap/munmap/mprotect syscalls - * from the process handling the MM (which can be the kernel itself). + * In UML, we need to sync the TLB over by using mmap/munmap syscalls from + * the process handling the MM (which can be the kernel itself). * * To track updates, we can hook into set_ptes and flush_tlb_*. With set_ptes * we catch all PTE transitions where memory that was unusable becomes usable. diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index bf539fee7831..09f8201de5db 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -279,8 +279,6 @@ int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, int phys_fd, unsigned long long offset); int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len); -int protect(struct mm_id *mm_idp, unsigned long addr, - unsigned long len, unsigned int prot); /* skas/process.c */ extern int is_skas_winch(int pid, int fd, void *data); diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 3fbdda727373..81a4cace032c 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -30,7 +30,6 @@ enum stub_syscall_type { STUB_SYSCALL_UNSET = 0, STUB_SYSCALL_MMAP, STUB_SYSCALL_MUNMAP, - STUB_SYSCALL_MPROTECT, }; struct stub_syscall { diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c index 5d52ffa682dc..796fc266d3bb 100644 --- a/arch/um/kernel/skas/stub.c +++ b/arch/um/kernel/skas/stub.c @@ -35,16 +35,6 @@ static __always_inline int syscall_handler(struct stub_data *d) return -1; } break; - case STUB_SYSCALL_MPROTECT: - res = stub_syscall3(__NR_mprotect, - sc->mem.addr, sc->mem.length, - sc->mem.prot); - if (res) { - d->err = res; - d->syscall_data_len = i; - return -1; - } - break; default: d->err = -95; /* EOPNOTSUPP */ d->syscall_data_len = i; diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 548af31d4111..23c1f550cd7c 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -23,9 +23,6 @@ struct vm_ops { int phys_fd, unsigned long long offset); int (*unmap)(struct mm_id *mm_idp, unsigned long virt, unsigned long len); - int (*mprotect)(struct mm_id *mm_idp, - unsigned long virt, unsigned long len, - unsigned int prot); }; static int kern_map(struct mm_id *mm_idp, @@ -44,15 +41,6 @@ static int kern_unmap(struct mm_id *mm_idp, return os_unmap_memory((void *)virt, len); } -static int kern_mprotect(struct mm_id *mm_idp, - unsigned long virt, unsigned long len, - unsigned int prot) -{ - return os_protect_memory((void *)virt, len, - prot & UM_PROT_READ, prot & UM_PROT_WRITE, - 1); -} - void report_enomem(void) { printk(KERN_ERR "UML ran out of memory on the host side! " @@ -65,33 +53,37 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, struct vm_ops *ops) { pte_t *pte; - int r, w, x, prot, ret = 0; + int ret = 0; pte = pte_offset_kernel(pmd, addr); do { - r = pte_read(*pte); - w = pte_write(*pte); - x = pte_exec(*pte); - if (!pte_young(*pte)) { - r = 0; - w = 0; - } else if (!pte_dirty(*pte)) - w = 0; - - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | - (x ? UM_PROT_EXEC : 0)); - if (pte_newpage(*pte)) { - if (pte_present(*pte)) { - __u64 offset; - unsigned long phys = pte_val(*pte) & PAGE_MASK; - int fd = phys_mapping(phys, &offset); - - ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE, - prot, fd, offset); - } else - ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE); - } else if (pte_newprot(*pte)) - ret = ops->mprotect(ops->mm_idp, addr, PAGE_SIZE, prot); + if (!pte_newpage(*pte)) + continue; + + if (pte_present(*pte)) { + __u64 offset; + unsigned long phys = pte_val(*pte) & PAGE_MASK; + int fd = phys_mapping(phys, &offset); + int r, w, x, prot; + + r = pte_read(*pte); + w = pte_write(*pte); + x = pte_exec(*pte); + if (!pte_young(*pte)) { + r = 0; + w = 0; + } else if (!pte_dirty(*pte)) + w = 0; + + prot = (r ? UM_PROT_READ : 0) | + (w ? UM_PROT_WRITE : 0) | + (x ? UM_PROT_EXEC : 0); + + ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE, + prot, fd, offset); + } else + ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE); + *pte = pte_mkuptodate(*pte); } while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret)); return ret; @@ -180,11 +172,9 @@ int um_tlb_sync(struct mm_struct *mm) if (mm == &init_mm) { ops.mmap = kern_map; ops.unmap = kern_unmap; - ops.mprotect = kern_mprotect; } else { ops.mmap = map; ops.unmap = unmap; - ops.mprotect = protect; } pgd = pgd_offset(mm, addr); diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index 9a13ac23c606..d7f1814b0e5a 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -217,24 +217,3 @@ int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len) return 0; } - -int protect(struct mm_id *mm_idp, unsigned long addr, unsigned long len, - unsigned int prot) -{ - struct stub_syscall *sc; - - /* Compress with previous syscall if that is possible */ - sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MPROTECT, addr); - if (sc && sc->mem.prot == prot) { - sc->mem.length += len; - return 0; - } - - sc = syscall_stub_alloc(mm_idp); - sc->syscall = STUB_SYSCALL_MPROTECT; - sc->mem.addr = addr; - sc->mem.length = len; - sc->mem.prot = prot; - - return 0; -} -- cgit From 9b0881858c74ae6a1a66de7350d123cf3f83169f Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 11 Oct 2024 18:23:54 +0800 Subject: um: Rename _PAGE_NEWPAGE to _PAGE_NEEDSYNC The _PAGE_NEWPAGE bit does not really indicate that this is a new page, but rather whether this entry needs to be synced or not. Renaming it to _PAGE_NEEDSYNC will make it more clear how everything ties together. Suggested-by: Benjamin Berg Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241011102354.1682626-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/include/asm/page.h | 2 +- arch/um/include/asm/pgtable-2level.h | 2 +- arch/um/include/asm/pgtable-4level.h | 14 ++++++------- arch/um/include/asm/pgtable.h | 38 ++++++++++++++++++------------------ arch/um/kernel/tlb.c | 10 +++++----- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index f0ad80fc8c10..99aa459c7c6c 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -56,7 +56,7 @@ typedef struct { unsigned long pud; } pud_t; #define pte_set_bits(p, bits) ((p).pte |= (bits)) #define pte_clear_bits(p, bits) ((p).pte &= ~(bits)) #define pte_copy(to, from) ((to).pte = (from).pte) -#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE)) +#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEEDSYNC)) #define pte_set_val(p, phys, prot) (p).pte = (phys | pgprot_val(prot)) typedef unsigned long phys_t; diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h index 8256ecc5b919..ab0c8dd86564 100644 --- a/arch/um/include/asm/pgtable-2level.h +++ b/arch/um/include/asm/pgtable-2level.h @@ -31,7 +31,7 @@ printk("%s:%d: bad pgd %p(%08lx).\n", __FILE__, __LINE__, &(e), \ pgd_val(e)) -static inline int pgd_newpage(pgd_t pgd) { return 0; } +static inline int pgd_needsync(pgd_t pgd) { return 0; } static inline void pgd_mkuptodate(pgd_t pgd) { } #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) diff --git a/arch/um/include/asm/pgtable-4level.h b/arch/um/include/asm/pgtable-4level.h index f912fcc16b7a..0d279caee93c 100644 --- a/arch/um/include/asm/pgtable-4level.h +++ b/arch/um/include/asm/pgtable-4level.h @@ -55,7 +55,7 @@ printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ pgd_val(e)) -#define pud_none(x) (!(pud_val(x) & ~_PAGE_NEWPAGE)) +#define pud_none(x) (!(pud_val(x) & ~_PAGE_NEEDSYNC)) #define pud_bad(x) ((pud_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pud_present(x) (pud_val(x) & _PAGE_PRESENT) #define pud_populate(mm, pud, pmd) \ @@ -63,7 +63,7 @@ #define set_pud(pudptr, pudval) (*(pudptr) = (pudval)) -#define p4d_none(x) (!(p4d_val(x) & ~_PAGE_NEWPAGE)) +#define p4d_none(x) (!(p4d_val(x) & ~_PAGE_NEEDSYNC)) #define p4d_bad(x) ((p4d_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define p4d_present(x) (p4d_val(x) & _PAGE_PRESENT) #define p4d_populate(mm, p4d, pud) \ @@ -72,23 +72,23 @@ #define set_p4d(p4dptr, p4dval) (*(p4dptr) = (p4dval)) -static inline int pgd_newpage(pgd_t pgd) +static inline int pgd_needsync(pgd_t pgd) { - return(pgd_val(pgd) & _PAGE_NEWPAGE); + return pgd_val(pgd) & _PAGE_NEEDSYNC; } -static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; } +static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEEDSYNC; } #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) static inline void pud_clear (pud_t *pud) { - set_pud(pud, __pud(_PAGE_NEWPAGE)); + set_pud(pud, __pud(_PAGE_NEEDSYNC)); } static inline void p4d_clear (p4d_t *p4d) { - set_p4d(p4d, __p4d(_PAGE_NEWPAGE)); + set_p4d(p4d, __p4d(_PAGE_NEEDSYNC)); } #define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK) diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 6b1317400190..12bfc58a8580 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -11,7 +11,7 @@ #include #define _PAGE_PRESENT 0x001 -#define _PAGE_NEWPAGE 0x002 +#define _PAGE_NEEDSYNC 0x002 #define _PAGE_RW 0x020 #define _PAGE_USER 0x040 #define _PAGE_ACCESSED 0x080 @@ -79,22 +79,22 @@ extern unsigned long end_iomem; */ #define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) -#define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE)) +#define pte_clear(mm, addr, xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEEDSYNC)) -#define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE)) +#define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEEDSYNC)) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) -#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) +#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEEDSYNC; } while (0) -#define pmd_newpage(x) (pmd_val(x) & _PAGE_NEWPAGE) -#define pmd_mkuptodate(x) (pmd_val(x) &= ~_PAGE_NEWPAGE) +#define pmd_needsync(x) (pmd_val(x) & _PAGE_NEEDSYNC) +#define pmd_mkuptodate(x) (pmd_val(x) &= ~_PAGE_NEEDSYNC) -#define pud_newpage(x) (pud_val(x) & _PAGE_NEWPAGE) -#define pud_mkuptodate(x) (pud_val(x) &= ~_PAGE_NEWPAGE) +#define pud_needsync(x) (pud_val(x) & _PAGE_NEEDSYNC) +#define pud_mkuptodate(x) (pud_val(x) &= ~_PAGE_NEEDSYNC) -#define p4d_newpage(x) (p4d_val(x) & _PAGE_NEWPAGE) -#define p4d_mkuptodate(x) (p4d_val(x) &= ~_PAGE_NEWPAGE) +#define p4d_needsync(x) (p4d_val(x) & _PAGE_NEEDSYNC) +#define p4d_mkuptodate(x) (p4d_val(x) &= ~_PAGE_NEEDSYNC) #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #define pmd_page(pmd) phys_to_page(pmd_val(pmd) & PAGE_MASK) @@ -145,9 +145,9 @@ static inline int pte_young(pte_t pte) return pte_get_bits(pte, _PAGE_ACCESSED); } -static inline int pte_newpage(pte_t pte) +static inline int pte_needsync(pte_t pte) { - return pte_get_bits(pte, _PAGE_NEWPAGE); + return pte_get_bits(pte, _PAGE_NEEDSYNC); } /* @@ -200,13 +200,13 @@ static inline pte_t pte_mkwrite_novma(pte_t pte) static inline pte_t pte_mkuptodate(pte_t pte) { - pte_clear_bits(pte, _PAGE_NEWPAGE); + pte_clear_bits(pte, _PAGE_NEEDSYNC); return pte; } -static inline pte_t pte_mknewpage(pte_t pte) +static inline pte_t pte_mkneedsync(pte_t pte) { - pte_set_bits(pte, _PAGE_NEWPAGE); + pte_set_bits(pte, _PAGE_NEEDSYNC); return(pte); } @@ -214,11 +214,11 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval) { pte_copy(*pteptr, pteval); - /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so + /* If it's a swap entry, it needs to be marked _PAGE_NEEDSYNC so * update_pte_range knows to unmap it. */ - *pteptr = pte_mknewpage(*pteptr); + *pteptr = pte_mkneedsync(*pteptr); } #define PFN_PTE_SHIFT PAGE_SHIFT @@ -258,7 +258,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { - return !((pte_val(pte_a) ^ pte_val(pte_b)) & ~_PAGE_NEWPAGE); + return !((pte_val(pte_a) ^ pte_val(pte_b)) & ~_PAGE_NEEDSYNC); } /* @@ -308,7 +308,7 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); * <--------------- offset ----------------> E < type -> 0 0 0 1 0 * * E is the exclusive marker that is not stored in swap entries. - * _PAGE_NEWPAGE (bit 1) is always set to 1 in set_pte(). + * _PAGE_NEEDSYNC (bit 1) is always set to 1 in set_pte(). */ #define __swp_type(x) (((x).val >> 5) & 0x1f) #define __swp_offset(x) ((x).val >> 11) diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 23c1f550cd7c..cf7e0d4407f2 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -57,7 +57,7 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_offset_kernel(pmd, addr); do { - if (!pte_newpage(*pte)) + if (!pte_needsync(*pte)) continue; if (pte_present(*pte)) { @@ -101,7 +101,7 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end); if (!pmd_present(*pmd)) { - if (pmd_newpage(*pmd)) { + if (pmd_needsync(*pmd)) { ret = ops->unmap(ops->mm_idp, addr, next - addr); pmd_mkuptodate(*pmd); @@ -124,7 +124,7 @@ static inline int update_pud_range(p4d_t *p4d, unsigned long addr, do { next = pud_addr_end(addr, end); if (!pud_present(*pud)) { - if (pud_newpage(*pud)) { + if (pud_needsync(*pud)) { ret = ops->unmap(ops->mm_idp, addr, next - addr); pud_mkuptodate(*pud); @@ -147,7 +147,7 @@ static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); if (!p4d_present(*p4d)) { - if (p4d_newpage(*p4d)) { + if (p4d_needsync(*p4d)) { ret = ops->unmap(ops->mm_idp, addr, next - addr); p4d_mkuptodate(*p4d); @@ -181,7 +181,7 @@ int um_tlb_sync(struct mm_struct *mm) do { next = pgd_addr_end(addr, mm->context.sync_tlb_range_to); if (!pgd_present(*pgd)) { - if (pgd_newpage(*pgd)) { + if (pgd_needsync(*pgd)) { ret = ops.unmap(ops.mm_idp, addr, next - addr); pgd_mkuptodate(*pgd); -- cgit From 0b8b2668f9981c1fefc2ef892bd915288ef01f33 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 10 Oct 2024 16:25:37 +0200 Subject: um: insert scheduler ticks when userspace does not yield In time-travel mode userspace can do a lot of work without any time passing. Unfortunately, this can result in OOM situations as the RCU core code will never be run. Work around this by keeping track of userspace processes that do not yield for a lot of operations. When this happens, insert a jiffie into the sched_clock clock to account time against the process and cause the bookkeeping to run. As sched_clock is used for tracing, it is useful to keep it in sync between the different VMs. As such, try to remove added ticks again when the actual clock ticks. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241010142537.1134685-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/Kconfig | 15 +++++++++++++++ arch/um/include/shared/common-offsets.h | 6 +++++- arch/um/kernel/time.c | 20 ++++++++++++++++++++ arch/um/os-Linux/skas/process.c | 26 ++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 1 deletion(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 448454a3d8b5..5dc702ad9e7a 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -227,6 +227,21 @@ config UML_TIME_TRAVEL_SUPPORT It is safe to say Y, but you probably don't need this. +config UML_MAX_USERSPACE_ITERATIONS + int + prompt "Maximum number of unscheduled userspace iterations" + default 10000 + depends on UML_TIME_TRAVEL_SUPPORT + help + In UML inf-cpu and ext time-travel mode userspace can run without being + interrupted. This will eventually overwhelm the kernel and create OOM + situations (mainly RCU not running). This setting specifies the number + of kernel/userspace switches (minor/major page fault, signal or syscall) + for the same userspace thread before the sched_clock is advanced by a + jiffie to trigger scheduling. + + Setting it to zero disables the feature. + config KASAN_SHADOW_OFFSET hex depends on KASAN diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 579ed946a3a9..86537e20942a 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -28,4 +28,8 @@ DEFINE(UML_CONFIG_64BIT, CONFIG_64BIT); #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT DEFINE(UML_CONFIG_UML_TIME_TRAVEL_SUPPORT, CONFIG_UML_TIME_TRAVEL_SUPPORT); #endif - +#ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS +DEFINE(UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS, CONFIG_UML_MAX_USERSPACE_ITERATIONS); +#else +DEFINE(UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS, 0); +#endif diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 29b27b90581f..1394568c0210 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -25,6 +25,8 @@ #include #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +#include + enum time_travel_mode time_travel_mode; EXPORT_SYMBOL_GPL(time_travel_mode); @@ -47,6 +49,15 @@ static u16 time_travel_shm_id; static struct um_timetravel_schedshm *time_travel_shm; static union um_timetravel_schedshm_client *time_travel_shm_client; +unsigned long tt_extra_sched_jiffies; + +notrace unsigned long long sched_clock(void) +{ + return (unsigned long long)(jiffies - INITIAL_JIFFIES + + tt_extra_sched_jiffies) + * (NSEC_PER_SEC / HZ); +} + static void time_travel_set_time(unsigned long long ns) { if (unlikely(ns < time_travel_time)) @@ -443,6 +454,11 @@ static void time_travel_periodic_timer(struct time_travel_event *e) { time_travel_add_event(&time_travel_timer_event, time_travel_time + time_travel_timer_interval); + + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + deliver_alarm(); } @@ -594,6 +610,10 @@ EXPORT_SYMBOL_GPL(time_travel_add_irq_event); static void time_travel_oneshot_timer(struct time_travel_event *e) { + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + deliver_alarm(); } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 8b328eb9d1f7..97856955e892 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -388,6 +388,9 @@ int start_userspace(unsigned long stub_stack) return err; } +int unscheduled_userspace_iterations; +extern unsigned long tt_extra_sched_jiffies; + void userspace(struct uml_pt_regs *regs) { int err, status, op, pid = userspace_pid[0]; @@ -397,6 +400,27 @@ void userspace(struct uml_pt_regs *regs) interrupt_end(); while (1) { + /* + * When we are in time-travel mode, userspace can theoretically + * do a *lot* of work without being scheduled. The problem with + * this is that it will prevent kernel bookkeeping (primarily + * the RCU) from running and this can for example cause OOM + * situations. + * + * This code accounts a jiffie against the scheduling clock + * after the defined userspace iterations in the same thread. + * By doing so the situation is effectively prevented. + */ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) { + if (UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS && + unscheduled_userspace_iterations++ > + UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS) { + tt_extra_sched_jiffies += 1; + unscheduled_userspace_iterations = 0; + } + } + time_travel_print_bc_msg(); current_mm_sync(); @@ -539,6 +563,8 @@ void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) void switch_threads(jmp_buf *me, jmp_buf *you) { + unscheduled_userspace_iterations = 0; + if (UML_SETJMP(me) == 0) UML_LONGJMP(you, 1); } -- cgit From 3f17fed2149192c7d3b76a45a6a87b4ff22cd586 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 23 Oct 2024 11:41:20 +0200 Subject: um: switch to regset API and depend on XSTATE The PTRACE_GETREGSET API has now existed since Linux 2.6.33. The XSAVE CPU feature should also be sufficiently common to be able to rely on it. With this, define our internal FP state to be the hosts XSAVE data. Add discovery for the hosts XSAVE size and place the FP registers at the end of task_struct so that we can adjust the size at runtime. Next we can implement the regset API on top and update the signal handling as well as ptrace APIs to use them. Also switch coredump creation to use the regset API and finally set HAVE_ARCH_TRACEHOOK. This considerably improves the signal frames. Previously they might not have contained all the registers (i386) and also did not have the sizes and magic values set to the correct values to permit userspace to decode the frame. As a side effect, this will permit UML to run on hosts with newer CPU extensions (such as AMX) that need even more register state. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241023094120.4083426-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/Kconfig | 2 + arch/um/include/asm/processor-generic.h | 4 +- arch/um/kernel/process.c | 22 +-- arch/um/kernel/um_arch.c | 2 + arch/um/os-Linux/registers.c | 11 +- arch/x86/um/Makefile | 2 +- arch/x86/um/asm/elf.h | 2 + arch/x86/um/asm/ptrace.h | 10 + arch/x86/um/os-Linux/registers.c | 145 ++++---------- arch/x86/um/ptrace.c | 267 ++++++++++++++++++++++++++ arch/x86/um/ptrace_32.c | 88 ++------- arch/x86/um/ptrace_64.c | 37 +--- arch/x86/um/shared/sysdep/ptrace.h | 8 +- arch/x86/um/shared/sysdep/ptrace_32.h | 4 - arch/x86/um/shared/sysdep/ptrace_64.h | 4 - arch/x86/um/shared/sysdep/ptrace_user.h | 6 - arch/x86/um/signal.c | 322 ++++++++++---------------------- arch/x86/um/user-offsets.c | 8 - 18 files changed, 471 insertions(+), 473 deletions(-) create mode 100644 arch/x86/um/ptrace.c diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 5dc702ad9e7a..a9876bdb5bf9 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -5,6 +5,7 @@ menu "UML-specific options" config UML bool default y + select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_HAS_CPU_FINALIZE_INIT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL @@ -32,6 +33,7 @@ config UML select HAVE_ARCH_VMAP_STACK select HAVE_RUST select ARCH_HAS_UBSAN + select HAVE_ARCH_TRACEHOOK config MMU bool diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index 02e759a4a435..5d6356eafffe 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -20,7 +20,6 @@ struct task_struct; struct mm_struct; struct thread_struct { - struct pt_regs regs; struct pt_regs *segv_regs; struct task_struct *prev_sched; struct arch_thread arch; @@ -31,6 +30,9 @@ struct thread_struct { void *arg; } thread; } request; + + /* Contains variable sized FP registers */ + struct pt_regs regs; }; #define INIT_THREAD \ diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index d45c79f82d7c..56e7e525fc91 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -187,6 +187,13 @@ void initial_thread_cb(void (*proc)(void *), void *arg) kmalloc_ok = save_kmalloc_ok; } +int arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src) +{ + memcpy(dst, src, arch_task_struct_size); + return 0; +} + void um_idle_sleep(void) { if (time_travel_mode != TT_MODE_OFF) @@ -287,18 +294,3 @@ unsigned long __get_wchan(struct task_struct *p) return 0; } - -int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu) -{ -#ifdef CONFIG_X86_32 - extern int have_fpx_regs; - - /* FIXME: A plain copy does not work on i386 with have_fpx_regs */ - if (have_fpx_regs) - return 0; -#endif - memcpy(fpu, &t->thread.regs.regs.fp, sizeof(*fpu)); - - return 1; -} - diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 6d755a37d5c4..ec17576ce9fc 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -402,6 +402,8 @@ int __init linux_main(int argc, char **argv, char **envp) os_info("Kernel virtual memory size shrunk to %lu bytes\n", virtmem_size); + arch_task_struct_size = sizeof(struct task_struct) + host_fp_size; + os_flush_stdout(); return start_uml(); diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c index bd80b921add0..d7ca148807b2 100644 --- a/arch/um/os-Linux/registers.c +++ b/arch/um/os-Linux/registers.c @@ -10,11 +10,12 @@ #include #include #include +#include /* This is set once at boot time and not changed thereafter */ static unsigned long exec_regs[MAX_REG_NR]; -static unsigned long exec_fp_regs[FP_SIZE]; +static unsigned long *exec_fp_regs; int init_pid_registers(int pid) { @@ -24,7 +25,11 @@ int init_pid_registers(int pid) if (err < 0) return -errno; - arch_init_registers(pid); + err = arch_init_registers(pid); + if (err < 0) + return err; + + exec_fp_regs = malloc(host_fp_size); get_fp_registers(pid, exec_fp_regs); return 0; } @@ -34,5 +39,5 @@ void get_safe_registers(unsigned long *regs, unsigned long *fp_regs) memcpy(regs, exec_regs, sizeof(exec_regs)); if (fp_regs) - memcpy(fp_regs, exec_fp_regs, sizeof(exec_fp_regs)); + memcpy(fp_regs, exec_fp_regs, host_fp_size); } diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 36e67fc97c22..b42c31cd2390 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -10,7 +10,7 @@ else endif obj-y = bugs_$(BITS).o delay.o fault.o \ - ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ + ptrace.o ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ stub_segv.o \ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ mem_$(BITS).o subarch.o os-Linux/ diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 6052200fe925..62ed5d68a978 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -8,6 +8,8 @@ #include #include +#define CORE_DUMP_USE_REGSET + #ifdef CONFIG_X86_32 #define R_386_NONE 0 diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h index 2fef3da55533..2641d28d115c 100644 --- a/arch/x86/um/asm/ptrace.h +++ b/arch/x86/um/asm/ptrace.h @@ -2,6 +2,16 @@ #ifndef __UM_X86_PTRACE_H #define __UM_X86_PTRACE_H +/* This is here because signal.c needs the REGSET_FP_LEGACY definition */ +enum { + REGSET_GENERAL, +#ifdef CONFIG_X86_32 + REGSET_FP_LEGACY, +#endif + REGSET_FP, + REGSET_XSTATE, +}; + #include #ifndef CONFIG_X86_32 #define __FRAME_OFFSETS /* Needed to get the R* macros */ diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c index f8e5516d9708..76eaeb93928c 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -16,133 +16,58 @@ #include #include #include +#include -static int have_xstate_support; +unsigned long host_fp_size; -static int save_i387_registers(int pid, unsigned long *fp_regs) -{ - if (ptrace(PTRACE_GETFPREGS, pid, 0, fp_regs) < 0) - return -errno; - return 0; -} - -static int save_fp_registers(int pid, unsigned long *fp_regs) +int get_fp_registers(int pid, unsigned long *regs) { -#ifdef PTRACE_GETREGSET - struct iovec iov; + struct iovec iov = { + .iov_base = regs, + .iov_len = host_fp_size, + }; - if (have_xstate_support) { - iov.iov_base = fp_regs; - iov.iov_len = FP_SIZE * sizeof(unsigned long); - if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0) - return -errno; - return 0; - } else -#endif - return save_i387_registers(pid, fp_regs); -} - -static int restore_i387_registers(int pid, unsigned long *fp_regs) -{ - if (ptrace(PTRACE_SETFPREGS, pid, 0, fp_regs) < 0) + if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0) return -errno; return 0; } -static int restore_fp_registers(int pid, unsigned long *fp_regs) -{ -#ifdef PTRACE_SETREGSET - struct iovec iov; - if (have_xstate_support) { - iov.iov_base = fp_regs; - iov.iov_len = FP_SIZE * sizeof(unsigned long); - if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0) - return -errno; - return 0; - } else -#endif - return restore_i387_registers(pid, fp_regs); -} - -#ifdef __i386__ -int have_fpx_regs = 1; -static int save_fpx_registers(int pid, unsigned long *fp_regs) +int put_fp_registers(int pid, unsigned long *regs) { - if (ptrace(PTRACE_GETFPXREGS, pid, 0, fp_regs) < 0) - return -errno; - return 0; -} + struct iovec iov = { + .iov_base = regs, + .iov_len = host_fp_size, + }; -static int restore_fpx_registers(int pid, unsigned long *fp_regs) -{ - if (ptrace(PTRACE_SETFPXREGS, pid, 0, fp_regs) < 0) + if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0) return -errno; return 0; } -int get_fp_registers(int pid, unsigned long *regs) -{ - if (have_fpx_regs) - return save_fpx_registers(pid, regs); - else - return save_fp_registers(pid, regs); -} - -int put_fp_registers(int pid, unsigned long *regs) -{ - if (have_fpx_regs) - return restore_fpx_registers(pid, regs); - else - return restore_fp_registers(pid, regs); -} - -void arch_init_registers(int pid) -{ - struct user_fpxregs_struct fpx_regs; - int err; - - err = ptrace(PTRACE_GETFPXREGS, pid, 0, &fpx_regs); - if (!err) - return; - - if (errno != EIO) - panic("check_ptrace : PTRACE_GETFPXREGS failed, errno = %d", - errno); - - have_fpx_regs = 0; -} -#else - -int get_fp_registers(int pid, unsigned long *regs) +int arch_init_registers(int pid) { - return save_fp_registers(pid, regs); + struct iovec iov = { + /* Just use plenty of space, it does not cost us anything */ + .iov_len = 2 * 1024 * 1024, + }; + int ret; + + iov.iov_base = mmap(NULL, iov.iov_len, PROT_WRITE | PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (iov.iov_base == MAP_FAILED) + return -ENOMEM; + + /* GDB has x86_xsave_length, which uses x86_cpuid_count */ + ret = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); + if (ret) + ret = -errno; + munmap(iov.iov_base, 2 * 1024 * 1024); + + host_fp_size = iov.iov_len; + + return ret; } -int put_fp_registers(int pid, unsigned long *regs) -{ - return restore_fp_registers(pid, regs); -} - -void arch_init_registers(int pid) -{ -#ifdef PTRACE_GETREGSET - void * fp_regs; - struct iovec iov; - - fp_regs = malloc(FP_SIZE * sizeof(unsigned long)); - if(fp_regs == NULL) - return; - - iov.iov_base = fp_regs; - iov.iov_len = FP_SIZE * sizeof(unsigned long); - if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) == 0) - have_xstate_support = 1; - - free(fp_regs); -#endif -} -#endif - unsigned long get_thread_reg(int reg, jmp_buf *buf) { switch (reg) { diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c new file mode 100644 index 000000000000..54d924bc45ce --- /dev/null +++ b/arch/x86/um/ptrace.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_32 +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) +{ + struct _fpxreg *st = NULL; + unsigned long twd = (unsigned long) fxsave->twd; + unsigned long tag; + unsigned long ret = 0xffff0000; + int i; + +#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16) + + for (i = 0; i < 8; i++) { + if (twd & 0x1) { + st = (struct _fpxreg *) FPREG_ADDR(fxsave, i); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = 2; /* Special */ + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) { + tag = 1; /* Zero */ + } else { + tag = 2; /* Special */ + } + break; + default: + if (st->significand[3] & 0x8000) + tag = 0; /* Valid */ + else + tag = 2; /* Special */ + break; + } + } else { + tag = 3; /* Empty */ + } + ret |= (tag << (2 * i)); + twd = twd >> 1; + } + return ret; +} + +/* Get/set the old 32bit i387 registers (pre-FPX) */ +static int fpregs_legacy_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; + int i; + + membuf_store(&to, (unsigned long)fxsave->cwd | 0xffff0000ul); + membuf_store(&to, (unsigned long)fxsave->swd | 0xffff0000ul); + membuf_store(&to, twd_fxsr_to_i387(fxsave)); + membuf_store(&to, fxsave->fip); + membuf_store(&to, fxsave->fcs | ((unsigned long)fxsave->fop << 16)); + membuf_store(&to, fxsave->foo); + membuf_store(&to, fxsave->fos); + + for (i = 0; i < 8; i++) + membuf_write(&to, (void *)fxsave->st_space + i * 16, 10); + + return 0; +} + +static int fpregs_legacy_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; + const struct user_i387_struct *from; + struct user_i387_struct buf; + int i; + + if (ubuf) { + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + from = &buf; + } else { + from = kbuf; + } + + fxsave->cwd = (unsigned short)(from->cwd & 0xffff); + fxsave->swd = (unsigned short)(from->swd & 0xffff); + fxsave->twd = twd_i387_to_fxsr((unsigned short)(from->twd & 0xffff)); + fxsave->fip = from->fip; + fxsave->fop = (unsigned short)((from->fcs & 0xffff0000ul) >> 16); + fxsave->fcs = (from->fcs & 0xffff); + fxsave->foo = from->foo; + fxsave->fos = from->fos; + + for (i = 0; i < 8; i++) { + memcpy((void *)fxsave->st_space + i * 16, + (void *)from->st_space + i * 10, 10); + } + + return 0; +} +#endif + +static int genregs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + int reg; + + for (reg = 0; to.left; reg++) + membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); + return 0; +} + +static int genregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + + if (kbuf) { + const unsigned long *k = kbuf; + + while (count >= sizeof(*k) && !ret) { + ret = putreg(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const unsigned long __user *u = ubuf; + + while (count >= sizeof(*u) && !ret) { + unsigned long word; + + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +static int generic_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return regset->n; +} + +static int generic_fpregs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + void *fpregs = task_pt_regs(target)->regs.fp; + + membuf_write(&to, fpregs, regset->size * regset->n); + return 0; +} + +static int generic_fpregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + void *fpregs = task_pt_regs(target)->regs.fp; + + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + fpregs, 0, regset->size * regset->n); +} + +static struct user_regset uml_regsets[] __ro_after_init = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .regset_get = genregs_get, + .set = genregs_set + }, +#ifdef CONFIG_X86_32 + /* Old FP registers, they are needed in signal frames */ + [REGSET_FP_LEGACY] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_ia32_struct) / sizeof(long), + .size = sizeof(long), + .align = sizeof(long), + .active = generic_fpregs_active, + .regset_get = fpregs_legacy_get, + .set = fpregs_legacy_set, + }, +#endif + [REGSET_FP] = { +#ifdef CONFIG_X86_32 + .core_note_type = NT_PRXFPREG, + .n = sizeof(struct user32_fxsr_struct) / sizeof(long), +#else + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(long), +#endif + .size = sizeof(long), + .align = sizeof(long), + .active = generic_fpregs_active, + .regset_get = generic_fpregs_get, + .set = generic_fpregs_set, + }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(long), + .align = sizeof(long), + .active = generic_fpregs_active, + .regset_get = generic_fpregs_get, + .set = generic_fpregs_set, + }, + /* TODO: Add TLS regset for 32bit */ +}; + +const struct user_regset_view user_uml_view = { +#ifdef CONFIG_X86_32 + .name = "i386", .e_machine = EM_386, +#else + .name = "x86_64", .e_machine = EM_X86_64, +#endif + .regsets = uml_regsets, .n = ARRAY_SIZE(uml_regsets) +}; + +const struct user_regset_view * +task_user_regset_view(struct task_struct *tsk) +{ + return &user_uml_view; +} + +static int __init init_regset_xstate_info(void) +{ + uml_regsets[REGSET_XSTATE].n = + host_fp_size / uml_regsets[REGSET_XSTATE].size; + + return 0; +} +arch_initcall(init_regset_xstate_info); diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index b8b85a52eb6f..3af3cb821524 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -168,69 +169,6 @@ int peek_user(struct task_struct *child, long addr, long data) return put_user(tmp, (unsigned long __user *) data); } -/* FIXME: Do the required conversions instead of erroring out */ -extern int have_fpx_regs; - -static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) -{ - int n; - - if (have_fpx_regs) - return -EINVAL; - - n = copy_to_user(buf, &child->thread.regs.regs.fp, - sizeof(struct user_i387_struct)); - if(n > 0) - return -EFAULT; - - return n; -} - -static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) -{ - int n; - - if (have_fpx_regs) - return -EINVAL; - - n = copy_from_user(&child->thread.regs.regs.fp, buf, - sizeof(struct user_i387_struct)); - if (n > 0) - return -EFAULT; - - return 0; -} - -static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) -{ - int n; - - if (!have_fpx_regs) - return -EINVAL; - - n = copy_to_user(buf, &child->thread.regs.regs.fp, - sizeof(struct user_fxsr_struct)); - if(n > 0) - return -EFAULT; - - return n; -} - -static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) -{ - int n; - - if (!have_fpx_regs) - return -EINVAL; - - n = copy_from_user(&child->thread.regs.regs.fp, buf, - sizeof(struct user_fxsr_struct)); - if (n > 0) - return -EFAULT; - - return 0; -} - long subarch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { @@ -238,17 +176,25 @@ long subarch_ptrace(struct task_struct *child, long request, void __user *datap = (void __user *) data; switch (request) { case PTRACE_GETFPREGS: /* Get the child FPU state. */ - ret = get_fpregs(datap, child); - break; + return copy_regset_to_user(child, task_user_regset_view(child), + REGSET_FP_LEGACY, + 0, sizeof(struct user_i387_struct), + datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ - ret = set_fpregs(datap, child); - break; + return copy_regset_from_user(child, task_user_regset_view(child), + REGSET_FP_LEGACY, + 0, sizeof(struct user_i387_struct), + datap); case PTRACE_GETFPXREGS: /* Get the child FPU state. */ - ret = get_fpxregs(datap, child); - break; + return copy_regset_to_user(child, task_user_regset_view(child), + REGSET_FP, + 0, sizeof(struct user_fxsr_struct), + datap); case PTRACE_SETFPXREGS: /* Set the child FPU state. */ - ret = set_fpxregs(datap, child); - break; + return copy_regset_from_user(child, task_user_regset_view(child), + REGSET_FP, + 0, sizeof(struct user_fxsr_struct), + datap); default: ret = -EIO; } diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index f8bbad29cd0b..e0d4120a45c8 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -8,6 +8,7 @@ #include #include #include +#include #define __FRAME_OFFSETS #include #include @@ -188,30 +189,6 @@ int peek_user(struct task_struct *child, long addr, long data) return put_user(tmp, (unsigned long *) data); } -static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) -{ - int n; - - n = copy_to_user(buf, &child->thread.regs.regs.fp, - sizeof(struct user_i387_struct)); - if (n > 0) - return -EFAULT; - - return n; -} - -static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) -{ - int n; - - n = copy_from_user(&child->thread.regs.regs.fp, buf, - sizeof(struct user_i387_struct)); - if (n > 0) - return -EFAULT; - - return 0; -} - long subarch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { @@ -220,11 +197,15 @@ long subarch_ptrace(struct task_struct *child, long request, switch (request) { case PTRACE_GETFPREGS: /* Get the child FPU state. */ - ret = get_fpregs(datap, child); - break; + return copy_regset_to_user(child, task_user_regset_view(child), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ - ret = set_fpregs(datap, child); - break; + return copy_regset_from_user(child, task_user_regset_view(child), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); case PTRACE_ARCH_PRCTL: /* XXX Calls ptrace on the host - needs some SMP thinking */ ret = arch_prctl(child, data, (void __user *) addr); diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 6ca4ecabc55b..2dd4ca6713f8 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -56,12 +56,16 @@ struct syscall_args { UPT_SYSCALL_ARG5(r), \ UPT_SYSCALL_ARG6(r) } } ) +extern unsigned long host_fp_size; + struct uml_pt_regs { unsigned long gp[MAX_REG_NR]; - unsigned long fp[MAX_FP_NR]; struct faultinfo faultinfo; long syscall; int is_user; + + /* Dynamically sized FP registers (holds an XSTATE) */ + unsigned long fp[]; }; #define EMPTY_UML_PT_REGS { } @@ -72,4 +76,6 @@ struct uml_pt_regs { extern int user_context(unsigned long sp); +extern int arch_init_registers(int pid); + #endif /* __SYSDEP_X86_PTRACE_H */ diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h index 0c4989842fbe..2392470cac4d 100644 --- a/arch/x86/um/shared/sysdep/ptrace_32.h +++ b/arch/x86/um/shared/sysdep/ptrace_32.h @@ -6,8 +6,6 @@ #ifndef __SYSDEP_I386_PTRACE_H #define __SYSDEP_I386_PTRACE_H -#define MAX_FP_NR HOST_FPX_SIZE - #define UPT_SYSCALL_ARG1(r) UPT_BX(r) #define UPT_SYSCALL_ARG2(r) UPT_CX(r) #define UPT_SYSCALL_ARG3(r) UPT_DX(r) @@ -15,6 +13,4 @@ #define UPT_SYSCALL_ARG5(r) UPT_DI(r) #define UPT_SYSCALL_ARG6(r) UPT_BP(r) -extern void arch_init_registers(int pid); - #endif diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h index 0dc223aa1c2d..e73573ac871f 100644 --- a/arch/x86/um/shared/sysdep/ptrace_64.h +++ b/arch/x86/um/shared/sysdep/ptrace_64.h @@ -8,8 +8,6 @@ #ifndef __SYSDEP_X86_64_PTRACE_H #define __SYSDEP_X86_64_PTRACE_H -#define MAX_FP_NR HOST_FP_SIZE - #define REGS_R8(r) ((r)[HOST_R8]) #define REGS_R9(r) ((r)[HOST_R9]) #define REGS_R10(r) ((r)[HOST_R10]) @@ -57,6 +55,4 @@ #define UPT_SYSCALL_ARG5(r) UPT_R8(r) #define UPT_SYSCALL_ARG6(r) UPT_R9(r) -extern void arch_init_registers(int pid); - #endif diff --git a/arch/x86/um/shared/sysdep/ptrace_user.h b/arch/x86/um/shared/sysdep/ptrace_user.h index 1d1a824fa652..98da23120538 100644 --- a/arch/x86/um/shared/sysdep/ptrace_user.h +++ b/arch/x86/um/shared/sysdep/ptrace_user.h @@ -11,12 +11,6 @@ #define REGS_IP_INDEX HOST_IP #define REGS_SP_INDEX HOST_SP -#ifdef __i386__ -#define FP_SIZE ((HOST_FPX_SIZE > HOST_FP_SIZE) ? HOST_FPX_SIZE : HOST_FP_SIZE) -#else -#define FP_SIZE HOST_FP_SIZE -#endif - /* * glibc before 2.27 does not include PTRACE_SYSEMU_SINGLESTEP in its enum, * ensure we have a definition by (re-)defining it here. diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index fa78c74e00cb..94b2b2bbae31 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -16,145 +16,24 @@ #include #include -#ifdef CONFIG_X86_32 - -/* - * FPU tag word conversions. - */ - -static inline unsigned short twd_i387_to_fxsr(unsigned short twd) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - return tmp; -} - -static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) -{ - struct _fpxreg *st = NULL; - unsigned long twd = (unsigned long) fxsave->twd; - unsigned long tag; - unsigned long ret = 0xffff0000; - int i; - -#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16) - - for (i = 0; i < 8; i++) { - if (twd & 0x1) { - st = (struct _fpxreg *) FPREG_ADDR(fxsave, i); - - switch (st->exponent & 0x7fff) { - case 0x7fff: - tag = 2; /* Special */ - break; - case 0x0000: - if ( !st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3] ) { - tag = 1; /* Zero */ - } else { - tag = 2; /* Special */ - } - break; - default: - if (st->significand[3] & 0x8000) { - tag = 0; /* Valid */ - } else { - tag = 2; /* Special */ - } - break; - } - } else { - tag = 3; /* Empty */ - } - ret |= (tag << (2 * i)); - twd = twd >> 1; - } - return ret; -} - -static int convert_fxsr_to_user(struct _fpstate __user *buf, - struct user_fxsr_struct *fxsave) -{ - unsigned long env[7]; - struct _fpreg __user *to; - struct _fpxreg *from; - int i; - - env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; - env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; - env[2] = twd_fxsr_to_i387(fxsave); - env[3] = fxsave->fip; - env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); - env[5] = fxsave->foo; - env[6] = fxsave->fos; - - if (__copy_to_user(buf, env, 7 * sizeof(unsigned long))) - return 1; - - to = &buf->_st[0]; - from = (struct _fpxreg *) &fxsave->st_space[0]; - for (i = 0; i < 8; i++, to++, from++) { - unsigned long __user *t = (unsigned long __user *)to; - unsigned long *f = (unsigned long *)from; - - if (__put_user(*f, t) || - __put_user(*(f + 1), t + 1) || - __put_user(from->exponent, &to->exponent)) - return 1; - } - return 0; -} - -static int convert_fxsr_from_user(struct user_fxsr_struct *fxsave, - struct _fpstate __user *buf) -{ - unsigned long env[7]; - struct _fpxreg *to; - struct _fpreg __user *from; - int i; - - if (copy_from_user( env, buf, 7 * sizeof(long))) - return 1; - - fxsave->cwd = (unsigned short)(env[0] & 0xffff); - fxsave->swd = (unsigned short)(env[1] & 0xffff); - fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); - fxsave->fip = env[3]; - fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); - fxsave->fcs = (env[4] & 0xffff); - fxsave->foo = env[5]; - fxsave->fos = env[6]; - - to = (struct _fpxreg *) &fxsave->st_space[0]; - from = &buf->_st[0]; - for (i = 0; i < 8; i++, to++, from++) { - unsigned long *t = (unsigned long *)to; - unsigned long __user *f = (unsigned long __user *)from; - - if (__get_user(*t, f) || - __get_user(*(t + 1), f + 1) || - __get_user(to->exponent, &from->exponent)) - return 1; - } - return 0; -} - -extern int have_fpx_regs; +#include +#include +#ifdef CONFIG_X86_32 +struct _xstate_64 { + struct _fpstate_64 fpstate; + struct _header xstate_hdr; + struct _ymmh_state ymmh; + /* New processor state extensions go here: */ +}; +#else +#define _xstate_64 _xstate #endif static int copy_sc_from_user(struct pt_regs *regs, struct sigcontext __user *from) { + struct _xstate_64 __user *from_fp64; struct sigcontext sc; int err; @@ -203,31 +82,26 @@ static int copy_sc_from_user(struct pt_regs *regs, #undef GETREG #ifdef CONFIG_X86_32 - if (have_fpx_regs) { - struct user_fxsr_struct *fpx; - fpx = (void *)®s->regs.fp; - - err = convert_fxsr_from_user(fpx, (void *)sc.fpstate); - if (err) - return 1; - } else { - BUILD_BUG_ON(sizeof(regs->regs.fp) > sizeof(struct _fpstate)); - err = copy_from_user(regs->regs.fp, (void *)sc.fpstate, - sizeof(regs->regs.fp)); - if (err) - return 1; - } + from_fp64 = ((void *)sc.fpstate) + offsetof(struct _fpstate_32, _fxsr_env); #else - { - /* FIXME: Save/restore extended state (past the 16 YMM regs) */ - BUILD_BUG_ON(sizeof(regs->regs.fp) < sizeof(struct _xstate)); + from_fp64 = (void *)sc.fpstate; +#endif - err = copy_from_user(regs->regs.fp, (void *)sc.fpstate, - sizeof(struct _xstate)); - if (err) - return 1; - } + err = copy_from_user(regs->regs.fp, from_fp64, host_fp_size); + if (err) + return 1; + +#ifdef CONFIG_X86_32 + /* Data is duplicated and this copy is the important one */ + err = copy_regset_from_user(current, + task_user_regset_view(current), + REGSET_FP_LEGACY, 0, + sizeof(struct user_i387_struct), + (void *)sc.fpstate); + if (err < 0) + return err; #endif + return 0; } @@ -235,6 +109,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, struct _xstate __user *to_fp, struct pt_regs *regs, unsigned long mask) { + struct _xstate_64 __user *to_fp64; struct sigcontext sc; struct faultinfo * fi = ¤t->thread.arch.faultinfo; int err; @@ -286,36 +161,44 @@ static int copy_sc_to_user(struct sigcontext __user *to, return 1; #ifdef CONFIG_X86_32 - if (have_fpx_regs) { - struct user_fxsr_struct *fpx; - - fpx = (void *)®s->regs.fp; + err = copy_regset_to_user(current, + task_user_regset_view(current), + REGSET_FP_LEGACY, 0, + sizeof(struct _fpstate_32), to_fp); + if (err < 0) + return err; - err = convert_fxsr_to_user(&to_fp->fpstate, fpx); - if (err) - return 1; + __put_user(X86_FXSR_MAGIC, &to_fp->fpstate.magic); - err |= __put_user(fpx->swd, &to_fp->fpstate.status); - err |= __put_user(X86_FXSR_MAGIC, &to_fp->fpstate.magic); - if (err) - return 1; - - } else { - if (copy_to_user(to_fp, regs->regs.fp, sizeof(regs->regs.fp))) - return 1; - - /* Need to fill in the sw_reserved bits ... */ - BUILD_BUG_ON(offsetof(typeof(*to_fp), - fpstate.sw_reserved.magic1) >= - sizeof(struct _fpstate)); - __put_user(0, &to_fp->fpstate.sw_reserved.magic1); - __put_user(sizeof(struct _fpstate), - &to_fp->fpstate.sw_reserved.extended_size); - } + BUILD_BUG_ON(offsetof(struct _xstate, xstate_hdr) != + offsetof(struct _xstate_64, xstate_hdr) + + offsetof(struct _fpstate_32, _fxsr_env)); + to_fp64 = (void *)to_fp + offsetof(struct _fpstate_32, _fxsr_env); #else - if (copy_to_user(to_fp, regs->regs.fp, sizeof(struct _xstate))) + to_fp64 = to_fp; +#endif /* CONFIG_X86_32 */ + + if (copy_to_user(to_fp64, regs->regs.fp, host_fp_size)) return 1; + + /* + * Put magic/size values for userspace. We do not bother to verify them + * later on, however, userspace needs them should it try to read the + * XSTATE data. And ptrace does not fill in these parts. + */ + BUILD_BUG_ON(sizeof(int) != FP_XSTATE_MAGIC2_SIZE); +#ifdef CONFIG_X86_32 + __put_user(offsetof(struct _fpstate_32, _fxsr_env) + + host_fp_size + FP_XSTATE_MAGIC2_SIZE, + &to_fp64->fpstate.sw_reserved.extended_size); +#else + __put_user(host_fp_size + FP_XSTATE_MAGIC2_SIZE, + &to_fp64->fpstate.sw_reserved.extended_size); #endif + __put_user(host_fp_size, &to_fp64->fpstate.sw_reserved.xstate_size); + + __put_user(FP_XSTATE_MAGIC1, &to_fp64->fpstate.sw_reserved.magic1); + __put_user(FP_XSTATE_MAGIC2, (int *)((void *)to_fp64 + host_fp_size)); return 0; } @@ -333,34 +216,15 @@ static int copy_ucontext_to_user(struct ucontext __user *uc, return err; } -struct sigframe -{ - char __user *pretcode; - int sig; - struct sigcontext sc; - struct _xstate fpstate; - unsigned long extramask[_NSIG_WORDS-1]; - char retcode[8]; -}; - -struct rt_sigframe -{ - char __user *pretcode; - int sig; - struct siginfo __user *pinfo; - void __user *puc; - struct siginfo info; - struct ucontext uc; - struct _xstate fpstate; - char retcode[8]; -}; - int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig, struct pt_regs *regs, sigset_t *mask) { + size_t math_size = offsetof(struct _fpstate_32, _fxsr_env) + + host_fp_size + FP_XSTATE_MAGIC2_SIZE; struct sigframe __user *frame; void __user *restorer; int err = 0, sig = ksig->sig; + unsigned long fp_to; /* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */ stack_top = ((stack_top + 4) & -16UL) - 4; @@ -368,13 +232,21 @@ int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig, if (!access_ok(frame, sizeof(*frame))) return 1; + /* Add required space for math frame */ + frame = (struct sigframe __user *)((unsigned long)frame - math_size); + restorer = frame->retcode; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - err |= __put_user(restorer, &frame->pretcode); + err |= __put_user(restorer, (void **)&frame->pretcode); err |= __put_user(sig, &frame->sig); - err |= copy_sc_to_user(&frame->sc, &frame->fpstate, regs, mask->sig[0]); + + fp_to = (unsigned long)frame + sizeof(*frame); + + err |= copy_sc_to_user(&frame->sc, + (struct _xstate __user *)fp_to, + regs, mask->sig[0]); if (_NSIG_WORDS > 1) err |= __copy_to_user(&frame->extramask, &mask->sig[1], sizeof(frame->extramask)); @@ -404,26 +276,35 @@ int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig, int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, struct pt_regs *regs, sigset_t *mask) { + size_t math_size = offsetof(struct _fpstate_32, _fxsr_env) + + host_fp_size + FP_XSTATE_MAGIC2_SIZE; struct rt_sigframe __user *frame; void __user *restorer; int err = 0, sig = ksig->sig; + unsigned long fp_to; stack_top &= -8UL; frame = (struct rt_sigframe __user *) stack_top - 1; if (!access_ok(frame, sizeof(*frame))) return 1; + /* Add required space for math frame */ + frame = (struct rt_sigframe __user *)((unsigned long)frame - math_size); + restorer = frame->retcode; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - err |= __put_user(restorer, &frame->pretcode); + err |= __put_user(restorer, (void **)&frame->pretcode); err |= __put_user(sig, &frame->sig); - err |= __put_user(&frame->info, &frame->pinfo); - err |= __put_user(&frame->uc, &frame->puc); + err |= __put_user(&frame->info, (void **)&frame->pinfo); + err |= __put_user(&frame->uc, (void **)&frame->puc); err |= copy_siginfo_to_user(&frame->info, &ksig->info); - err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, - PT_REGS_SP(regs)); + + fp_to = (unsigned long)frame + sizeof(*frame); + + err |= copy_ucontext_to_user(&frame->uc, (struct _xstate __user *)fp_to, + mask, PT_REGS_SP(regs)); /* * This is movl $,%eax ; int $0x80 @@ -475,27 +356,24 @@ SYSCALL_DEFINE0(sigreturn) #else -struct rt_sigframe -{ - char __user *pretcode; - struct ucontext uc; - struct siginfo info; - struct _xstate fpstate; -}; - int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, struct pt_regs *regs, sigset_t *set) { + unsigned long math_size = host_fp_size + FP_XSTATE_MAGIC2_SIZE; struct rt_sigframe __user *frame; int err = 0, sig = ksig->sig; unsigned long fp_to; frame = (struct rt_sigframe __user *) round_down(stack_top - sizeof(struct rt_sigframe), 16); + + /* Add required space for math frame */ + frame = (struct rt_sigframe __user *)((unsigned long)frame - math_size); + /* Subtract 128 for a red zone and 8 for proper alignment */ frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8); - if (!access_ok(frame, sizeof(*frame))) + if (!access_ok(frame, sizeof(*frame) + math_size)) goto out; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { @@ -508,10 +386,12 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs)); - err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, - set->sig[0]); - fp_to = (unsigned long)&frame->fpstate; + fp_to = (unsigned long)frame + sizeof(*frame); + + err |= copy_sc_to_user(&frame->uc.uc_mcontext, + (struct _xstate __user *)fp_to, + regs, set->sig[0]); err |= __put_user(fp_to, &frame->uc.uc_mcontext.fpstate); if (sizeof(*set) == 16) { diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 1c77d9946199..d6e1cd9956bf 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -20,9 +20,6 @@ void foo(void); void foo(void) { #ifdef __i386__ - DEFINE_LONGS(HOST_FP_SIZE, sizeof(struct user_fpregs_struct)); - DEFINE_LONGS(HOST_FPX_SIZE, sizeof(struct user_fpxregs_struct)); - DEFINE(HOST_IP, EIP); DEFINE(HOST_SP, UESP); DEFINE(HOST_EFLAGS, EFL); @@ -41,11 +38,6 @@ void foo(void) DEFINE(HOST_GS, GS); DEFINE(HOST_ORIG_AX, ORIG_EAX); #else -#ifdef FP_XSTATE_MAGIC1 - DEFINE_LONGS(HOST_FP_SIZE, 2696); -#else - DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long)); -#endif DEFINE_LONGS(HOST_BX, RBX); DEFINE_LONGS(HOST_CX, RCX); DEFINE_LONGS(HOST_DI, RDI); -- cgit From d61ac4a7496a7981947b8b894d40b0e35c316fa5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 24 Oct 2024 09:52:51 +0200 Subject: um: remove PATH_MAX use Evidently, PATH_MAX isn't always defined, at least not via . Simply remove the use and replace it by a constant 4k. As stat::st_size is zero for /proc/self/exe we can't even size it automatically, and it seems unlikely someone's going to try to run UML with such a path. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410240553.gYNIXN8i-lkp@intel.com/ Fixes: 031acdcfb566 ("um: restore process name") Signed-off-by: Johannes Berg --- arch/um/os-Linux/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index 5e0cba5aee93..0afcdeb8995b 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -114,7 +113,7 @@ int __init main(int argc, char **argv, char **envp) ret = personality(PER_LINUX | ADDR_NO_RANDOMIZE); if (ret >= 0 && (ret & (PER_LINUX | ADDR_NO_RANDOMIZE)) != (PER_LINUX | ADDR_NO_RANDOMIZE)) { - char buf[PATH_MAX] = {}; + char buf[4096] = {}; ssize_t ret; ret = readlink("/proc/self/exe", buf, sizeof(buf)); -- cgit From 4e5adbe447db382cc76e05613581f96aef4f91d2 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 24 Oct 2024 22:28:25 +0800 Subject: um: Add os_set_pdeathsig helper function This helper can be used to set the parent-death signal of the calling process to SIGKILL to ensure that the process will be killed if the UML kernel dies unexpectedly without proper cleanup. This helper will be used in the follow-up patches. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241024142828.2612828-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 2 ++ arch/um/os-Linux/process.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 09f8201de5db..d709a24dc6fc 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -216,6 +216,8 @@ extern int os_drop_memory(void *addr, int length); extern int can_drop_memory(void); extern int os_mincore(void *addr, unsigned long len); +void os_set_pdeathsig(void); + /* execvp.c */ extern int execvp_noalloc(char *buf, const char *file, char *const argv[]); /* helper.c */ diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index f20602e793d9..9f086f939420 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -203,3 +204,8 @@ void init_new_thread_signals(void) set_handler(SIGIO); signal(SIGWINCH, SIG_IGN); } + +void os_set_pdeathsig(void) +{ + prctl(PR_SET_PDEATHSIG, SIGKILL); +} -- cgit From 9b5e6c0f5a9199c69af81ac5bedc512ee7dc20b3 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 24 Oct 2024 22:28:26 +0800 Subject: um: Set parent-death signal for ubd io thread/process The ubd io thread is not really a traditional thread. Set the parent-death signal for it to ensure that it will be killed if the UML kernel dies unexpectedly without proper cleanup. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241024142828.2612828-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/ubd_kern.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 7f28ec1929dc..2b8d04e67600 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1499,6 +1499,7 @@ int io_thread(void *arg) { int n, count, written, res; + os_set_pdeathsig(); os_fix_helper_signals(); while(1){ -- cgit From c6c4adee65969218b0b7b13f568fd2c6f2333373 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 24 Oct 2024 22:28:27 +0800 Subject: um: Set parent-death signal for write_sigio thread/process The write_sigio thread is not really a traditional thread. Set the parent-death signal for it to ensure that it will be killed if the UML kernel dies unexpectedly without proper cleanup. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241024142828.2612828-4-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/os-Linux/sigio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index 9e71794839e8..9aac8def4d63 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -55,6 +55,7 @@ static int write_sigio_thread(void *unused) int i, n, respond_fd; char c; + os_set_pdeathsig(); os_fix_helper_signals(); fds = ¤t_poll; while (1) { -- cgit From 42b8b00c8ab1ac18fccde3f29ee589626a561ea7 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 24 Oct 2024 22:28:28 +0800 Subject: um: Use os_set_pdeathsig helper in winch thread/process Since we have a helper now, let's switch to using it. It will make the code slightly more consistent. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241024142828.2612828-5-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/chan_user.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c index 1434114b2f34..35f9beeb19b3 100644 --- a/arch/um/drivers/chan_user.c +++ b/arch/um/drivers/chan_user.c @@ -10,7 +10,6 @@ #include #include #include -#include #include "chan_user.h" #include #include @@ -162,7 +161,7 @@ static __noreturn int winch_thread(void *arg) int count; char c = 1; - prctl(PR_SET_PDEATHSIG, SIGKILL); + os_set_pdeathsig(); pty_fd = data->pty_fd; pipe_fd = data->pipe_fd; -- cgit From d3b08e5f3f2829943342b88d3e2b44fb0ccdccab Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Oct 2024 10:27:01 +0200 Subject: um: fix stub exe build with CONFIG_GCOV CONFIG_GCOV is special and only in UML since it builds the kernel with a "userspace" option. This is fine, but the stub is even more special and not really a full userspace process, so it then fails to link as reported. Remove the GCOV options from the stub build. For good measure, also remove the GPROF options, even though they don't seem to cause build failures now. To be able to do this, export the specific options (GCOV_OPT and GPROF_OPT) but rename them so there's less chance of any conflicts. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410242238.SXhs2kQ4-lkp@intel.com/ Fixes: 32e8eaf263d9 ("um: use execveat to create userspace MMs") Link: https://patch.msgid.link/20241025102700.9fbb9c34473f.I7f1537fe075638f8da64beb52ef6c9e5adc51bc3@changeid Signed-off-by: Johannes Berg --- arch/um/Makefile-skas | 14 +++++++------- arch/um/kernel/skas/Makefile | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/um/Makefile-skas b/arch/um/Makefile-skas index 67323b028999..1a27e65bcb9c 100644 --- a/arch/um/Makefile-skas +++ b/arch/um/Makefile-skas @@ -3,15 +3,15 @@ # Licensed under the GPL # -GPROF_OPT += -pg +export UM_GPROF_OPT += -pg ifdef CONFIG_CC_IS_CLANG -GCOV_OPT += -fprofile-instr-generate -fcoverage-mapping +export UM_GCOV_OPT += -fprofile-instr-generate -fcoverage-mapping else -GCOV_OPT += -fprofile-arcs -ftest-coverage +export UM_GCOV_OPT += -fprofile-arcs -ftest-coverage endif -CFLAGS-$(CONFIG_GCOV) += $(GCOV_OPT) -CFLAGS-$(CONFIG_GPROF) += $(GPROF_OPT) -LINK-$(CONFIG_GCOV) += $(GCOV_OPT) -LINK-$(CONFIG_GPROF) += $(GPROF_OPT) +CFLAGS-$(CONFIG_GCOV) += $(UM_GCOV_OPT) +CFLAGS-$(CONFIG_GPROF) += $(UM_GPROF_OPT) +LINK-$(CONFIG_GCOV) += $(UM_GCOV_OPT) +LINK-$(CONFIG_GPROF) += $(UM_GPROF_OPT) diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index f6a219074772..3384be42691f 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -24,7 +24,7 @@ $(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE quiet_cmd_stub_exe = STUB_EXE $@ cmd_stub_exe = $(CC) -nostdlib -o $@ \ - $(KBUILD_CFLAGS) $(STUB_EXE_LDFLAGS) \ + $(filter-out $(UM_GPROF_OPT) $(UM_GCOV_OPT),$(KBUILD_CFLAGS)) $(STUB_EXE_LDFLAGS) \ $(filter %.o,$^) STUB_EXE_LDFLAGS = -Wl,-n -static -- cgit From 0b0ad2541d8e465c82b69202bbc2b1efcac6b73d Mon Sep 17 00:00:00 2001 From: Shaojie Dong Date: Fri, 25 Oct 2024 17:02:37 +0800 Subject: um: Remove double zero check free_pages() performs a parameter null check inside therefore remove double zero check here. Signed-off-by: Shaojie Dong Link: https://patch.msgid.link/20241025-upstream_branch-v5-1-b8998beb2c64@quicinc.com Signed-off-by: Johannes Berg --- arch/um/kernel/skas/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index d3fb506d5bd6..0eb5a1d3ba70 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -46,8 +46,7 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) return 0; out_free: - if (new_id->stack != 0) - free_pages(new_id->stack, ilog2(STUB_DATA_PAGES)); + free_pages(new_id->stack, ilog2(STUB_DATA_PAGES)); out: return ret; } -- cgit From 32f1fde0b631b36417356f7896d5bc18c44886d3 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 31 Oct 2024 15:20:16 +0100 Subject: um: fix sparse warnings from regset refactor Some variables were not tagged with __user and another was not marked as static even though it should be. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410280655.gOlEFwdG-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202410281821.WSPsAwq7-lkp@intel.com/ Fixes: 3f17fed21491 ("um: switch to regset API and depend on XSTATE") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241031142017.430420-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/x86/um/ptrace.c | 2 +- arch/x86/um/signal.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c index 54d924bc45ce..57c504fd5626 100644 --- a/arch/x86/um/ptrace.c +++ b/arch/x86/um/ptrace.c @@ -242,7 +242,7 @@ static struct user_regset uml_regsets[] __ro_after_init = { /* TODO: Add TLS regset for 32bit */ }; -const struct user_regset_view user_uml_view = { +static const struct user_regset_view user_uml_view = { #ifdef CONFIG_X86_32 .name = "i386", .e_machine = EM_386, #else diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 94b2b2bbae31..797c4e4e57f3 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -82,9 +82,10 @@ static int copy_sc_from_user(struct pt_regs *regs, #undef GETREG #ifdef CONFIG_X86_32 - from_fp64 = ((void *)sc.fpstate) + offsetof(struct _fpstate_32, _fxsr_env); + from_fp64 = ((void __user *)sc.fpstate) + + offsetof(struct _fpstate_32, _fxsr_env); #else - from_fp64 = (void *)sc.fpstate; + from_fp64 = (void __user *)sc.fpstate; #endif err = copy_from_user(regs->regs.fp, from_fp64, host_fp_size); @@ -97,7 +98,7 @@ static int copy_sc_from_user(struct pt_regs *regs, task_user_regset_view(current), REGSET_FP_LEGACY, 0, sizeof(struct user_i387_struct), - (void *)sc.fpstate); + (void __user *)sc.fpstate); if (err < 0) return err; #endif @@ -173,7 +174,8 @@ static int copy_sc_to_user(struct sigcontext __user *to, BUILD_BUG_ON(offsetof(struct _xstate, xstate_hdr) != offsetof(struct _xstate_64, xstate_hdr) + offsetof(struct _fpstate_32, _fxsr_env)); - to_fp64 = (void *)to_fp + offsetof(struct _fpstate_32, _fxsr_env); + to_fp64 = (void __user *)to_fp + + offsetof(struct _fpstate_32, _fxsr_env); #else to_fp64 = to_fp; #endif /* CONFIG_X86_32 */ @@ -198,7 +200,8 @@ static int copy_sc_to_user(struct sigcontext __user *to, __put_user(host_fp_size, &to_fp64->fpstate.sw_reserved.xstate_size); __put_user(FP_XSTATE_MAGIC1, &to_fp64->fpstate.sw_reserved.magic1); - __put_user(FP_XSTATE_MAGIC2, (int *)((void *)to_fp64 + host_fp_size)); + __put_user(FP_XSTATE_MAGIC2, + (int __user *)((void __user *)to_fp64 + host_fp_size)); return 0; } -- cgit From 81e0679d851ad7ddee8904df59f84eabfc2c1b1e Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 31 Oct 2024 15:20:17 +0100 Subject: um: fix sparse warnings in signal code sparse reports that various places were missing the __user tag in casts. In addition, one location was using 0 instead of NULL. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241031142017.430420-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/x86/um/signal.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 797c4e4e57f3..75087e85b6fd 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -242,7 +242,7 @@ int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - err |= __put_user(restorer, (void **)&frame->pretcode); + err |= __put_user(restorer, (void __user * __user *)&frame->pretcode); err |= __put_user(sig, &frame->sig); fp_to = (unsigned long)frame + sizeof(*frame); @@ -298,10 +298,10 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - err |= __put_user(restorer, (void **)&frame->pretcode); + err |= __put_user(restorer, (void __user * __user *)&frame->pretcode); err |= __put_user(sig, &frame->sig); - err |= __put_user(&frame->info, (void **)&frame->pinfo); - err |= __put_user(&frame->uc, (void **)&frame->puc); + err |= __put_user(&frame->info, (void __user * __user *)&frame->pinfo); + err |= __put_user(&frame->uc, (void __user * __user *)&frame->puc); err |= copy_siginfo_to_user(&frame->info, &ksig->info); fp_to = (unsigned long)frame + sizeof(*frame); @@ -387,7 +387,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(NULL, &frame->uc.uc_link); err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs)); fp_to = (unsigned long)frame + sizeof(*frame); @@ -411,7 +411,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, */ /* x86-64 should always use SA_RESTORER. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) - err |= __put_user((void *)ksig->ka.sa.sa_restorer, + err |= __put_user((void __user *)ksig->ka.sa.sa_restorer, &frame->pretcode); else /* could use a vstub here */ -- cgit From fce0128863b22dd6d9a4e5383645ac70656970a7 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 3 Nov 2024 16:05:02 +0100 Subject: um: set DONTDUMP and DONTFORK flags on KASAN shadow memory There is no point in either dumping the KASAN shadow memory or doing copy-on-write after a fork on these memory regions. This considerably speeds up coredump generation. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241103150506.1367695-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/os-Linux/mem.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c index 857e3deab293..72f302f4d197 100644 --- a/arch/um/os-Linux/mem.c +++ b/arch/um/os-Linux/mem.c @@ -39,6 +39,18 @@ void kasan_map_memory(void *start, size_t len) strerror(errno)); exit(1); } + + if (madvise(start, len, MADV_DONTDUMP)) { + os_info("Couldn't set MAD_DONTDUMP on shadow memory: %s\n.", + strerror(errno)); + exit(1); + } + + if (madvise(start, len, MADV_DONTFORK)) { + os_info("Couldn't set MADV_DONTFORK on shadow memory: %s\n.", + strerror(errno)); + exit(1); + } } /* Set by make_tempfile() during early boot. */ -- cgit From 2f278b59574ae222a14e4ae59964cb47edfeadd1 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 3 Nov 2024 16:05:03 +0100 Subject: um: always include kconfig.h and compiler-version.h Since commit a95b37e20db9 ("kbuild: get out of ") we can safely include these files in userspace code. Doing so simplifies matters as options do not need to be exported via asm-offsets.h anymore. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241103150506.1367695-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/Makefile | 4 +++- arch/um/include/shared/common-offsets.h | 18 ------------------ arch/um/include/shared/timetravel.h | 5 ++--- arch/um/include/shared/user.h | 2 +- arch/um/os-Linux/signal.c | 8 ++++---- arch/um/os-Linux/skas/process.c | 6 ++++-- arch/um/os-Linux/util.c | 4 ++-- 7 files changed, 16 insertions(+), 31 deletions(-) diff --git a/arch/um/Makefile b/arch/um/Makefile index 31e367e8ab4d..1d36a613aad8 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -71,7 +71,9 @@ KBUILD_AFLAGS += $(ARCH_INCLUDE) USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -I%,,$(KBUILD_CFLAGS))) \ $(ARCH_INCLUDE) $(MODE_INCLUDE) $(filter -I%,$(CFLAGS)) \ -D_FILE_OFFSET_BITS=64 -idirafter $(srctree)/include \ - -idirafter $(objtree)/include -D__KERNEL__ -D__UM_HOST__ + -idirafter $(objtree)/include -D__KERNEL__ -D__UM_HOST__ \ + -include $(srctree)/include/linux/compiler-version.h \ + -include $(srctree)/include/linux/kconfig.h #This will adjust *FLAGS accordingly to the platform. include $(srctree)/$(ARCH_DIR)/Makefile-os-Linux diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 86537e20942a..1d00fc6b6e92 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -15,21 +15,3 @@ DEFINE(UM_THREAD_SIZE, THREAD_SIZE); DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); - -#ifdef CONFIG_PRINTK -DEFINE(UML_CONFIG_PRINTK, CONFIG_PRINTK); -#endif -#ifdef CONFIG_UML_X86 -DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86); -#endif -#ifdef CONFIG_64BIT -DEFINE(UML_CONFIG_64BIT, CONFIG_64BIT); -#endif -#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT -DEFINE(UML_CONFIG_UML_TIME_TRAVEL_SUPPORT, CONFIG_UML_TIME_TRAVEL_SUPPORT); -#endif -#ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS -DEFINE(UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS, CONFIG_UML_MAX_USERSPACE_ITERATIONS); -#else -DEFINE(UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS, 0); -#endif diff --git a/arch/um/include/shared/timetravel.h b/arch/um/include/shared/timetravel.h index c8db2f213dba..7c2b277b7eb0 100644 --- a/arch/um/include/shared/timetravel.h +++ b/arch/um/include/shared/timetravel.h @@ -12,14 +12,13 @@ enum time_travel_mode { TT_MODE_EXTERNAL, }; -#if defined(UML_CONFIG_UML_TIME_TRAVEL_SUPPORT) || \ - defined(CONFIG_UML_TIME_TRAVEL_SUPPORT) +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) extern enum time_travel_mode time_travel_mode; extern int time_travel_should_print_bc_msg; #else #define time_travel_mode TT_MODE_OFF #define time_travel_should_print_bc_msg 0 -#endif /* (UML_)CONFIG_UML_TIME_TRAVEL_SUPPORT */ +#endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ void _time_travel_print_bc_msg(void); static inline void time_travel_print_bc_msg(void) diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h index bbab79c0c074..139eb78a4767 100644 --- a/arch/um/include/shared/user.h +++ b/arch/um/include/shared/user.h @@ -38,7 +38,7 @@ extern void panic(const char *fmt, ...) #define UM_KERN_DEBUG KERN_DEBUG #define UM_KERN_CONT KERN_CONT -#ifdef UML_CONFIG_PRINTK +#if IS_ENABLED(CONFIG_PRINTK) #define printk(...) _printk(__VA_ARGS__) extern int _printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 1978eaa557e9..1c6caa9dbd6c 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -65,7 +65,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) #define SIGALRM_MASK (1 << SIGALRM_BIT) int signals_enabled; -#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) static int signals_blocked, signals_blocked_pending; #endif static unsigned int signals_pending; @@ -75,7 +75,7 @@ static void sig_handler(int sig, struct siginfo *si, mcontext_t *mc) { int enabled = signals_enabled; -#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) if ((signals_blocked || __atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) && (sig == SIGIO)) { @@ -297,7 +297,7 @@ void unblock_signals(void) return; signals_enabled = 1; -#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) deliver_time_travel_irqs(); #endif @@ -389,7 +389,7 @@ int um_set_signals_trace(int enable) return ret; } -#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) void mark_sigio_pending(void) { /* diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 97856955e892..f683cfc9e51a 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -413,12 +413,14 @@ void userspace(struct uml_pt_regs *regs) */ if (time_travel_mode == TT_MODE_INFCPU || time_travel_mode == TT_MODE_EXTERNAL) { - if (UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS && +#ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS + if (CONFIG_UML_MAX_USERSPACE_ITERATIONS && unscheduled_userspace_iterations++ > - UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS) { + CONFIG_UML_MAX_USERSPACE_ITERATIONS) { tt_extra_sched_jiffies += 1; unscheduled_userspace_iterations = 0; } +#endif } time_travel_print_bc_msg(); diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 1dca4ffbd572..4193e04d7e4a 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -52,8 +52,8 @@ void setup_machinename(char *machine_out) struct utsname host; uname(&host); -#ifdef UML_CONFIG_UML_X86 -# ifndef UML_CONFIG_64BIT +#if IS_ENABLED(CONFIG_UML_X86) +# if !IS_ENABLED(CONFIG_64BIT) if (!strcmp(host.machine, "x86_64")) { strcpy(machine_out, "i686"); return; -- cgit From 37c691151e52f7762afa147ffb6e412ee0b5e8ac Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 3 Nov 2024 16:05:04 +0100 Subject: um: remove file sync for stub data There is no need to sync the stub code to "disk" for the other process to see the correct memory. Drop the fsync there and remove the helper function. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241103150506.1367695-3-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 1 - arch/um/kernel/physmem.c | 1 - arch/um/os-Linux/file.c | 6 ------ 3 files changed, 8 deletions(-) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index d709a24dc6fc..6c656ef096c9 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -145,7 +145,6 @@ extern int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg); extern int os_get_ifname(int fd, char *namebuf); extern int os_set_slip(int fd); extern int os_mode_fd(int fd, int mode); -extern int os_fsync_file(int fd); extern int os_seek_file(int fd, unsigned long long offset); extern int os_open_file(const char *file, struct openflags flags, int mode); diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index d60df3626727..a74f17b033c4 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -101,7 +101,6 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, */ os_seek_file(physmem_fd, __pa(__syscall_stub_start)); os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); - os_fsync_file(physmem_fd); memblock_add(__pa(start), len); memblock_reserve(__pa(start), reserve); diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index f1d03cf3957f..a0d01c68ce3e 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -255,12 +255,6 @@ void os_close_file(int fd) { close(fd); } -int os_fsync_file(int fd) -{ - if (fsync(fd) < 0) - return -errno; - return 0; -} int os_seek_file(int fd, unsigned long long offset) { -- cgit From b69f22dfd6973ec54da5e2b8fd1f1cd138b533b0 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 3 Nov 2024 16:05:05 +0100 Subject: um: remove duplicate UM_NSEC_PER_SEC definition Just remove the first entry as there is a second later on. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241103150506.1367695-4-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/common-offsets.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 1d00fc6b6e92..73f3a4792ed8 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -6,7 +6,6 @@ DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE); DEFINE(UM_KERN_PAGE_SIZE, PAGE_SIZE); DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK); DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT); -DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); DEFINE(UM_GFP_KERNEL, GFP_KERNEL); DEFINE(UM_GFP_ATOMIC, GFP_ATOMIC); -- cgit From ce6e85a186c28ab0ca024580cba93fa19147c72b Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 3 Nov 2024 16:05:06 +0100 Subject: um: remove broken double fault detection The show_stack function had some code to detect double faults. However, the logic is wrong and it would e.g. trigger if a WARNING happened inside an IRQ. Remove it without trying to add a new logic. The current behaviour, which will just fault repeatedly until the IRQ stack is used up and the host kills UML, seems to be good enough. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241103150506.1367695-5-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 1 - arch/um/kernel/sysrq.c | 6 ------ arch/um/os-Linux/signal.c | 8 -------- 3 files changed, 15 deletions(-) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 6c656ef096c9..5babad8c5f75 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -241,7 +241,6 @@ extern void block_signals(void); extern void unblock_signals(void); extern int um_set_signals(int enable); extern int um_set_signals_trace(int enable); -extern int os_is_signal_stack(void); extern void deliver_alarm(void); extern void register_pm_wake_signal(void); extern void block_signals_hard(void); diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 4bb8622dc512..a3bdab048617 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -32,12 +32,6 @@ void show_stack(struct task_struct *task, unsigned long *stack, struct pt_regs *segv_regs = current->thread.segv_regs; int i; - if (!segv_regs && os_is_signal_stack()) { - pr_err("Received SIGSEGV in SIGSEGV handler," - " aborting stack trace!\n"); - return; - } - if (!stack) stack = get_stack_pointer(task, segv_regs); diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 1c6caa9dbd6c..52852018a3ad 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -487,11 +487,3 @@ void unblock_signals_hard(void) unblocking = false; } #endif - -int os_is_signal_stack(void) -{ - stack_t ss; - sigaltstack(NULL, &ss); - - return ss.ss_flags & SS_ONSTACK; -} -- cgit From fcbd26d33dfa40496b6d82973bc49f95e6df21f9 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 3 Nov 2024 22:28:51 +0100 Subject: um: virtio_uml: send SET_MEM_TABLE message with the exact size The rust based userspace vhost devices are very strict and will not accept the message if it is longer than required. So, only include the data for the first memory region. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241103212854.1436046-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/drivers/virtio_uml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 4d3e9b9f5b61..c602892f329f 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -623,7 +623,7 @@ static int vhost_user_set_mem_table(struct virtio_uml_device *vu_dev) { struct vhost_user_msg msg = { .header.request = VHOST_USER_SET_MEM_TABLE, - .header.size = sizeof(msg.payload.mem_regions), + .header.size = offsetof(typeof(msg.payload.mem_regions), regions[1]), .payload.mem_regions.num = 1, }; unsigned long reserved = uml_reserved - uml_physmem; -- cgit From d85deadc17ee7e7154271cf1082bc0bb7aba3308 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 3 Nov 2024 22:28:53 +0100 Subject: um: virtio_uml: fix call_fd IRQ allocation If the device does not support slave requests, then the IRQ will not yet be allocated. So initialize the IRQ to UM_IRQ_ALLOC so that it will be allocated if none has been assigned yet and store it slightly later when we know that it will not be immediately unregistered again. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241103212854.1436046-4-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/drivers/virtio_uml.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index c602892f329f..0f9efee4c671 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -888,7 +888,7 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev, { struct virtio_uml_vq_info *info = vq->priv; int call_fds[2]; - int rc; + int rc, irq; /* no call FD needed/desired in this case */ if (vu_dev->protocol_features & @@ -905,19 +905,23 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev, return rc; info->call_fd = call_fds[0]; - rc = um_request_irq(vu_dev->irq, info->call_fd, IRQ_READ, - vu_interrupt, IRQF_SHARED, info->name, vq); - if (rc < 0) + irq = um_request_irq(vu_dev->irq, info->call_fd, IRQ_READ, + vu_interrupt, IRQF_SHARED, info->name, vq); + if (irq < 0) { + rc = irq; goto close_both; + } rc = vhost_user_set_vring_call(vu_dev, vq->index, call_fds[1]); if (rc) goto release_irq; + vu_dev->irq = irq; + goto out; release_irq: - um_free_irq(vu_dev->irq, vq); + um_free_irq(irq, vq); close_both: os_close_file(call_fds[0]); out: @@ -1201,6 +1205,7 @@ static int virtio_uml_probe(struct platform_device *pdev) vu_dev->vdev.id.vendor = VIRTIO_DEV_ANY_ID; vu_dev->pdev = pdev; vu_dev->req_fd = -1; + vu_dev->irq = UM_IRQ_ALLOC; time_travel_propagate_time(); -- cgit From 1d4d0ef84a7f386205ad141ff8727e027b0581e4 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 3 Nov 2024 22:28:54 +0100 Subject: um: virtio_uml: query the number of vqs if supported When the VHOST_USER_PROTOCOL_F_MQ protocol feature flag is set, we can query the maximum number of virtual queues. Do so when supported and extend the check to verify that we are not trying to allocate more queues. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241103212854.1436046-5-benjamin@sipsolutions.net [add a message to the WARN_ON] Signed-off-by: Johannes Berg --- arch/um/drivers/vhost_user.h | 4 +++- arch/um/drivers/virtio_uml.c | 25 ++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/arch/um/drivers/vhost_user.h b/arch/um/drivers/vhost_user.h index 6f147cd3c9f7..fcfa3b7e021b 100644 --- a/arch/um/drivers/vhost_user.h +++ b/arch/um/drivers/vhost_user.h @@ -10,6 +10,7 @@ /* Feature bits */ #define VHOST_USER_F_PROTOCOL_FEATURES 30 /* Protocol feature bits */ +#define VHOST_USER_PROTOCOL_F_MQ 0 #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 #define VHOST_USER_PROTOCOL_F_CONFIG 9 @@ -23,7 +24,8 @@ /* Supported transport features */ #define VHOST_USER_SUPPORTED_F BIT_ULL(VHOST_USER_F_PROTOCOL_FEATURES) /* Supported protocol features */ -#define VHOST_USER_SUPPORTED_PROTOCOL_F (BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ +#define VHOST_USER_SUPPORTED_PROTOCOL_F (BIT_ULL(VHOST_USER_PROTOCOL_F_MQ) | \ + BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \ BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG) | \ BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 0f9efee4c671..cc3be48a9d6e 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -56,6 +56,7 @@ struct virtio_uml_device { int sock, req_fd, irq; u64 features; u64 protocol_features; + u64 max_vqs; u8 status; u8 registered:1; u8 suspended:1; @@ -341,6 +342,17 @@ static int vhost_user_set_protocol_features(struct virtio_uml_device *vu_dev, protocol_features); } +static int vhost_user_get_queue_num(struct virtio_uml_device *vu_dev, + u64 *queue_num) +{ + int rc = vhost_user_send_no_payload(vu_dev, true, + VHOST_USER_GET_QUEUE_NUM); + + if (rc) + return rc; + return vhost_user_recv_u64(vu_dev, queue_num); +} + static void vhost_user_reply(struct virtio_uml_device *vu_dev, struct vhost_user_msg *msg, int response) { @@ -514,6 +526,15 @@ static int vhost_user_init(struct virtio_uml_device *vu_dev) return rc; } + if (vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_MQ)) { + rc = vhost_user_get_queue_num(vu_dev, &vu_dev->max_vqs); + if (rc) + return rc; + } else { + vu_dev->max_vqs = U64_MAX; + } + return 0; } @@ -1018,7 +1039,9 @@ static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vq; /* not supported for now */ - if (WARN_ON(nvqs > 64)) + if (WARN(nvqs > 64 || nvqs > vu_dev->max_vqs, + "%d VQs requested, only up to 64 or %lld supported\n", + nvqs, vu_dev->max_vqs)) return -EINVAL; rc = vhost_user_set_mem_table(vu_dev); -- cgit From df700802abcac3c7c4a4ced099aa42b9a144eea8 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Tue, 5 Nov 2024 00:32:00 +0800 Subject: um: ubd: Initialize ubd's disk pointer in ubd_add Currently, the initialization of the disk pointer in the ubd structure is missing. It should be initialized with the allocated gendisk pointer in ubd_add(). Fixes: 32621ad7a7ea ("ubd: remove the ubd_gendisk array") Signed-off-by: Tiwei Bie Acked-By: Anton Ivanov Link: https://patch.msgid.link/20241104163203.435515-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/ubd_kern.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 2b8d04e67600..f19173da64d8 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -898,6 +898,8 @@ static int ubd_add(int n, char **error_out) if (err) goto out_cleanup_disk; + ubd_dev->disk = disk; + return 0; out_cleanup_disk: -- cgit From 5bee35e5389f450a7eea7318deb9073e9414d3b1 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Tue, 5 Nov 2024 00:32:01 +0800 Subject: um: ubd: Do not use drvdata in release The drvdata is not available in release. Let's just use container_of() to get the ubd instance. Otherwise, removing a ubd device will result in a crash: RIP: 0033:blk_mq_free_tag_set+0x1f/0xba RSP: 00000000e2083bf0 EFLAGS: 00010246 RAX: 000000006021463a RBX: 0000000000000348 RCX: 0000000062604d00 RDX: 0000000004208060 RSI: 00000000605241a0 RDI: 0000000000000348 RBP: 00000000e2083c10 R08: 0000000062414010 R09: 00000000601603f7 R10: 000000000000133a R11: 000000006038c4bd R12: 0000000000000000 R13: 0000000060213a5c R14: 0000000062405d20 R15: 00000000604f7aa0 Kernel panic - not syncing: Segfault with no mm CPU: 0 PID: 17 Comm: kworker/0:1 Not tainted 6.8.0-rc3-00107-gba3f67c11638 #1 Workqueue: events mc_work_proc Stack: 00000000 604f7ef0 62c5d000 62405d20 e2083c30 6002c776 6002c755 600e47ff e2083c60 6025ffe3 04208060 603d36e0 Call Trace: [<6002c776>] ubd_device_release+0x21/0x55 [<6002c755>] ? ubd_device_release+0x0/0x55 [<600e47ff>] ? kfree+0x0/0x100 [<6025ffe3>] device_release+0x70/0xba [<60381d6a>] kobject_put+0xb5/0xe2 [<6026027b>] put_device+0x19/0x1c [<6026a036>] platform_device_put+0x26/0x29 [<6026ac5a>] platform_device_unregister+0x2c/0x2e [<6002c52e>] ubd_remove+0xb8/0xd6 [<6002bb74>] ? mconsole_reply+0x0/0x50 [<6002b926>] mconsole_remove+0x160/0x1cc [<6002bbbc>] ? mconsole_reply+0x48/0x50 [<6003379c>] ? um_set_signals+0x3b/0x43 [<60061c55>] ? update_min_vruntime+0x14/0x70 [<6006251f>] ? dequeue_task_fair+0x164/0x235 [<600620aa>] ? update_cfs_group+0x0/0x40 [<603a0e77>] ? __schedule+0x0/0x3ed [<60033761>] ? um_set_signals+0x0/0x43 [<6002af6a>] mc_work_proc+0x77/0x91 [<600520b4>] process_scheduled_works+0x1af/0x2c3 [<6004ede3>] ? assign_work+0x0/0x58 [<600527a1>] worker_thread+0x2f7/0x37a [<6004ee3b>] ? set_pf_worker+0x0/0x64 [<6005765d>] ? arch_local_irq_save+0x0/0x2d [<60058e07>] ? kthread_exit+0x0/0x3a [<600524aa>] ? worker_thread+0x0/0x37a [<60058f9f>] kthread+0x130/0x135 [<6002068e>] new_thread_handler+0x85/0xb6 Cc: stable@vger.kernel.org Signed-off-by: Tiwei Bie Acked-By: Anton Ivanov Link: https://patch.msgid.link/20241104163203.435515-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/ubd_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index f19173da64d8..66c1a8835e36 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -779,7 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev) static void ubd_device_release(struct device *dev) { - struct ubd *ubd_dev = dev_get_drvdata(dev); + struct ubd *ubd_dev = container_of(dev, struct ubd, pdev.dev); blk_mq_free_tag_set(&ubd_dev->tag_set); *ubd_dev = ((struct ubd) DEFAULT_UBD); -- cgit From d1db692a9be3b4bd3473b64fcae996afaffe8438 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Tue, 5 Nov 2024 00:32:02 +0800 Subject: um: net: Do not use drvdata in release The drvdata is not available in release. Let's just use container_of() to get the uml_net instance. Otherwise, removing a network device will result in a crash: RIP: 0033:net_device_release+0x10/0x6f RSP: 00000000e20c7c40 EFLAGS: 00010206 RAX: 000000006002e4e7 RBX: 00000000600f1baf RCX: 00000000624074e0 RDX: 0000000062778000 RSI: 0000000060551c80 RDI: 00000000627af028 RBP: 00000000e20c7c50 R08: 00000000603ad594 R09: 00000000e20c7b70 R10: 000000000000135a R11: 00000000603ad422 R12: 0000000000000000 R13: 0000000062c7af00 R14: 0000000062406d60 R15: 00000000627700b6 Kernel panic - not syncing: Segfault with no mm CPU: 0 UID: 0 PID: 29 Comm: kworker/0:2 Not tainted 6.12.0-rc6-g59b723cd2adb #1 Workqueue: events mc_work_proc Stack: 627af028 62c7af00 e20c7c80 60276fcd 62778000 603f5820 627af028 00000000 e20c7cb0 603a2bcd 627af000 62770010 Call Trace: [<60276fcd>] device_release+0x70/0xba [<603a2bcd>] kobject_put+0xba/0xe7 [<60277265>] put_device+0x19/0x1c [<60281266>] platform_device_put+0x26/0x29 [<60281e5f>] platform_device_unregister+0x2c/0x2e [<6002ec9c>] net_remove+0x63/0x69 [<60031316>] ? mconsole_reply+0x0/0x50 [<600310c8>] mconsole_remove+0x160/0x1cc [<60087d40>] ? __remove_hrtimer+0x38/0x74 [<60087ff8>] ? hrtimer_try_to_cancel+0x8c/0x98 [<6006b3cf>] ? dl_server_stop+0x3f/0x48 [<6006b390>] ? dl_server_stop+0x0/0x48 [<600672e8>] ? dequeue_entities+0x327/0x390 [<60038fa6>] ? um_set_signals+0x0/0x43 [<6003070c>] mc_work_proc+0x77/0x91 [<60057664>] process_scheduled_works+0x1b3/0x2dd [<60055f32>] ? assign_work+0x0/0x58 [<60057f0a>] worker_thread+0x1e9/0x293 [<6005406f>] ? set_pf_worker+0x0/0x64 [<6005d65d>] ? arch_local_irq_save+0x0/0x2d [<6005d748>] ? kthread_exit+0x0/0x3a [<60057d21>] ? worker_thread+0x0/0x293 [<6005dbf1>] kthread+0x126/0x12b [<600219c5>] new_thread_handler+0x85/0xb6 Cc: stable@vger.kernel.org Signed-off-by: Tiwei Bie Acked-By: Anton Ivanov Link: https://patch.msgid.link/20241104163203.435515-4-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/net_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 77c4afb8ab90..75d04fb4994a 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -336,7 +336,7 @@ static struct platform_driver uml_net_driver = { static void net_device_release(struct device *dev) { - struct uml_net *device = dev_get_drvdata(dev); + struct uml_net *device = container_of(dev, struct uml_net, pdev.dev); struct net_device *netdev = device->dev; struct uml_net_private *lp = netdev_priv(netdev); -- cgit From 51b39d741970742a5c41136241a9c48ac607cf82 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Tue, 5 Nov 2024 00:32:03 +0800 Subject: um: vector: Do not use drvdata in release The drvdata is not available in release. Let's just use container_of() to get the vector_device instance. Otherwise, removing a vector device will result in a crash: RIP: 0033:vector_device_release+0xf/0x50 RSP: 00000000e187bc40 EFLAGS: 00010202 RAX: 0000000060028f61 RBX: 00000000600f1baf RCX: 00000000620074e0 RDX: 000000006220b9c0 RSI: 0000000060551c80 RDI: 0000000000000000 RBP: 00000000e187bc50 R08: 00000000603ad594 R09: 00000000e187bb70 R10: 000000000000135a R11: 00000000603ad422 R12: 00000000623ae028 R13: 000000006287a200 R14: 0000000062006d30 R15: 00000000623700b6 Kernel panic - not syncing: Segfault with no mm CPU: 0 UID: 0 PID: 16 Comm: kworker/0:1 Not tainted 6.12.0-rc6-g59b723cd2adb #1 Workqueue: events mc_work_proc Stack: 60028f61 623ae028 e187bc80 60276fcd 6220b9c0 603f5820 623ae028 00000000 e187bcb0 603a2bcd 623ae000 62370010 Call Trace: [<60028f61>] ? vector_device_release+0x0/0x50 [<60276fcd>] device_release+0x70/0xba [<603a2bcd>] kobject_put+0xba/0xe7 [<60277265>] put_device+0x19/0x1c [<60281266>] platform_device_put+0x26/0x29 [<60281e5f>] platform_device_unregister+0x2c/0x2e [<60029422>] vector_remove+0x52/0x58 [<60031316>] ? mconsole_reply+0x0/0x50 [<600310c8>] mconsole_remove+0x160/0x1cc [<603b19f4>] ? strlen+0x0/0x15 [<60066611>] ? __dequeue_entity+0x1a9/0x206 [<600666a7>] ? set_next_entity+0x39/0x63 [<6006666e>] ? set_next_entity+0x0/0x63 [<60038fa6>] ? um_set_signals+0x0/0x43 [<6003070c>] mc_work_proc+0x77/0x91 [<60057664>] process_scheduled_works+0x1b3/0x2dd [<60055f32>] ? assign_work+0x0/0x58 [<60057f0a>] worker_thread+0x1e9/0x293 [<6005406f>] ? set_pf_worker+0x0/0x64 [<6005d65d>] ? arch_local_irq_save+0x0/0x2d [<6005d748>] ? kthread_exit+0x0/0x3a [<60057d21>] ? worker_thread+0x0/0x293 [<6005dbf1>] kthread+0x126/0x12b [<600219c5>] new_thread_handler+0x85/0xb6 Cc: stable@vger.kernel.org Signed-off-by: Tiwei Bie Acked-By: Anton Ivanov Link: https://patch.msgid.link/20241104163203.435515-5-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/vector_kern.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index c992da83268d..64c09db392c1 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -815,7 +815,8 @@ static struct platform_driver uml_net_driver = { static void vector_device_release(struct device *dev) { - struct vector_device *device = dev_get_drvdata(dev); + struct vector_device *device = + container_of(dev, struct vector_device, pdev.dev); struct net_device *netdev = device->dev; list_del(&device->list); -- cgit From 0f659ff362eac69777c4c191b7e5ccb19d76c67d Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Wed, 6 Nov 2024 18:39:33 +0800 Subject: um: Always dump trace for specified task in show_stack Currently, show_stack() always dumps the trace of the current task. However, it should dump the trace of the specified task if one is provided. Otherwise, things like running "echo t > sysrq-trigger" won't work as expected. Fixes: 970e51feaddb ("um: Add support for CONFIG_STACKTRACE") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20241106103933.1132365-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/kernel/sysrq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index a3bdab048617..13ee5666668d 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -46,5 +46,5 @@ void show_stack(struct task_struct *task, unsigned long *stack, } printk("%sCall Trace:\n", loglvl); - dump_trace(current, &stackops, (void *)loglvl); + dump_trace(task ?: current, &stackops, (void *)loglvl); } -- cgit From 2f681ba4b352cdd5658ed2a96062375a12839755 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 11 Nov 2024 11:29:10 +0100 Subject: um: move thread info into task This selects the THREAD_INFO_IN_TASK option for UM and changes the way that the current task is discovered. This is trivial though, as UML already tracks the current task in cpu_tasks[] and this can be used to retrieve it. Also remove the signal handler code that copies the thread information into the IRQ stack. It is obsolete now, which also means that the mentioned race condition cannot happen anymore. Signed-off-by: Benjamin Berg Reviewed-by: Hajime Tazaki Link: https://patch.msgid.link/20241111102910.46512-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/Kconfig | 1 + arch/um/include/asm/Kbuild | 1 - arch/um/include/asm/current.h | 23 ++++++++ arch/um/include/asm/thread_info.h | 16 ------ arch/um/include/shared/as-layout.h | 7 +-- arch/um/kernel/dyn.lds.S | 2 - arch/um/kernel/irq.c | 112 ------------------------------------- arch/um/kernel/process.c | 5 +- arch/um/kernel/skas/process.c | 4 +- arch/um/kernel/um_arch.c | 7 +-- arch/um/kernel/uml.lds.S | 2 - arch/um/os-Linux/signal.c | 37 +----------- 12 files changed, 34 insertions(+), 183 deletions(-) create mode 100644 arch/um/include/asm/current.h diff --git a/arch/um/Kconfig b/arch/um/Kconfig index a9876bdb5bf9..18051b1cfce0 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -34,6 +34,7 @@ config UML select HAVE_RUST select ARCH_HAS_UBSAN select HAVE_ARCH_TRACEHOOK + select THREAD_INFO_IN_TASK config MMU bool diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 18f902da8e99..428f2c5158c2 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += bug.h generic-y += compat.h -generic-y += current.h generic-y += device.h generic-y += dma-mapping.h generic-y += emergency-restart.h diff --git a/arch/um/include/asm/current.h b/arch/um/include/asm/current.h new file mode 100644 index 000000000000..de64e032d66c --- /dev/null +++ b/arch/um/include/asm/current.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_CURRENT_H +#define __ASM_CURRENT_H + +#include +#include + +#ifndef __ASSEMBLY__ + +struct task_struct; +extern struct task_struct *cpu_tasks[NR_CPUS]; + +static __always_inline struct task_struct *get_current(void) +{ + return cpu_tasks[0]; +} + + +#define current get_current() + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_CURRENT_H */ diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 4d2a768246bc..f9ad06fcc991 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -17,33 +17,17 @@ #include struct thread_info { - struct task_struct *task; /* main task structure */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - struct thread_info *real_thread; /* Points to non-IRQ stack */ }; #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .real_thread = NULL, \ -} - -/* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) -{ - struct thread_info *ti; - unsigned long mask = THREAD_SIZE - 1; - void *p; - - asm volatile ("" : "=r" (p) : "0" (&ti)); - ti = (struct thread_info *) (((unsigned long)p) & ~mask); - return ti; } #endif diff --git a/arch/um/include/shared/as-layout.h b/arch/um/include/shared/as-layout.h index d9679c911e54..ea65f151bf48 100644 --- a/arch/um/include/shared/as-layout.h +++ b/arch/um/include/shared/as-layout.h @@ -30,11 +30,8 @@ #include -struct cpu_task { - void *task; -}; - -extern struct cpu_task cpu_tasks[]; +struct task_struct; +extern struct task_struct *cpu_tasks[]; extern unsigned long long physmem_size; diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index dc9d9a68af55..a36b7918a011 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -116,8 +116,6 @@ SECTIONS .fini_array : { *(.fini_array) } .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . = ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.data.* .gnu.linkonce.d.*) SORT(CONSTRUCTORS) diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 534e91797f89..338450741aac 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -674,115 +674,3 @@ void __init init_IRQ(void) /* Initialize EPOLL Loop */ os_setup_epoll(); } - -/* - * IRQ stack entry and exit: - * - * Unlike i386, UML doesn't receive IRQs on the normal kernel stack - * and switch over to the IRQ stack after some preparation. We use - * sigaltstack to receive signals on a separate stack from the start. - * These two functions make sure the rest of the kernel won't be too - * upset by being on a different stack. The IRQ stack has a - * thread_info structure at the bottom so that current et al continue - * to work. - * - * to_irq_stack copies the current task's thread_info to the IRQ stack - * thread_info and sets the tasks's stack to point to the IRQ stack. - * - * from_irq_stack copies the thread_info struct back (flags may have - * been modified) and resets the task's stack pointer. - * - * Tricky bits - - * - * What happens when two signals race each other? UML doesn't block - * signals with sigprocmask, SA_DEFER, or sa_mask, so a second signal - * could arrive while a previous one is still setting up the - * thread_info. - * - * There are three cases - - * The first interrupt on the stack - sets up the thread_info and - * handles the interrupt - * A nested interrupt interrupting the copying of the thread_info - - * can't handle the interrupt, as the stack is in an unknown state - * A nested interrupt not interrupting the copying of the - * thread_info - doesn't do any setup, just handles the interrupt - * - * The first job is to figure out whether we interrupted stack setup. - * This is done by xchging the signal mask with thread_info->pending. - * If the value that comes back is zero, then there is no setup in - * progress, and the interrupt can be handled. If the value is - * non-zero, then there is stack setup in progress. In order to have - * the interrupt handled, we leave our signal in the mask, and it will - * be handled by the upper handler after it has set up the stack. - * - * Next is to figure out whether we are the outer handler or a nested - * one. As part of setting up the stack, thread_info->real_thread is - * set to non-NULL (and is reset to NULL on exit). This is the - * nesting indicator. If it is non-NULL, then the stack is already - * set up and the handler can run. - */ - -static unsigned long pending_mask; - -unsigned long to_irq_stack(unsigned long *mask_out) -{ - struct thread_info *ti; - unsigned long mask, old; - int nested; - - mask = xchg(&pending_mask, *mask_out); - if (mask != 0) { - /* - * If any interrupts come in at this point, we want to - * make sure that their bits aren't lost by our - * putting our bit in. So, this loop accumulates bits - * until xchg returns the same value that we put in. - * When that happens, there were no new interrupts, - * and pending_mask contains a bit for each interrupt - * that came in. - */ - old = *mask_out; - do { - old |= mask; - mask = xchg(&pending_mask, old); - } while (mask != old); - return 1; - } - - ti = current_thread_info(); - nested = (ti->real_thread != NULL); - if (!nested) { - struct task_struct *task; - struct thread_info *tti; - - task = cpu_tasks[ti->cpu].task; - tti = task_thread_info(task); - - *ti = *tti; - ti->real_thread = tti; - task->stack = ti; - } - - mask = xchg(&pending_mask, 0); - *mask_out |= mask | nested; - return 0; -} - -unsigned long from_irq_stack(int nested) -{ - struct thread_info *ti, *to; - unsigned long mask; - - ti = current_thread_info(); - - pending_mask = 1; - - to = ti->real_thread; - current->stack = to; - ti->real_thread = NULL; - *to = *ti; - - mask = xchg(&pending_mask, 0); - return mask & ~1; -} - diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 56e7e525fc91..30bdc0a87dc8 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -43,7 +43,8 @@ * cares about its entry, so it's OK if another processor is modifying its * entry. */ -struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { NULL } }; +struct task_struct *cpu_tasks[NR_CPUS]; +EXPORT_SYMBOL(cpu_tasks); void free_stack(unsigned long stack, int order) { @@ -64,7 +65,7 @@ unsigned long alloc_stack(int order, int atomic) static inline void set_current(struct task_struct *task) { - cpu_tasks[task_thread_info(task)->cpu] = ((struct cpu_task) { task }); + cpu_tasks[task_thread_info(task)->cpu] = task; } struct task_struct *__switch_to(struct task_struct *from, struct task_struct *to) diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 68657988c8d1..05dcdc057af9 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -22,15 +22,13 @@ static int __init start_kernel_proc(void *unused) { block_signals_trace(); - cpu_tasks[0].task = current; - start_kernel(); return 0; } extern int userspace_pid[]; -extern char cpu0_irqstack[]; +static char cpu0_irqstack[THREAD_SIZE] __aligned(THREAD_SIZE); int __init start_uml(void) { diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index ec17576ce9fc..62ddb865eb91 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -65,9 +65,6 @@ struct cpuinfo_um boot_cpu_data = { EXPORT_SYMBOL(boot_cpu_data); -union thread_union cpu0_irqstack - __section(".data..init_irqstack") = - { .thread_info = INIT_THREAD_INFO(init_task) }; /* Changed in setup_arch, which is called in early boot */ static char host_info[(__NEW_UTS_LEN + 1) * 5]; @@ -244,6 +241,8 @@ static struct notifier_block panic_exit_notifier = { void uml_finishsetup(void) { + cpu_tasks[0] = &init_task; + atomic_notifier_chain_register(&panic_notifier_list, &panic_exit_notifier); @@ -418,7 +417,7 @@ void __init setup_arch(char **cmdline_p) { u8 rng_seed[32]; - stack_protections((unsigned long) &init_thread_info); + stack_protections((unsigned long) init_task.stack); setup_physmem(uml_physmem, uml_reserved, physmem_size); mem_total_pages(physmem_size, iomem_size); uml_dtb_init(); diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 5c92d58a78e8..a409d4b66114 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -77,8 +77,6 @@ SECTIONS .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . = ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.gnu.linkonce.d*) CONSTRUCTORS diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 52852018a3ad..9ea7269ffb77 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -190,43 +190,8 @@ static void hard_handler(int sig, siginfo_t *si, void *p) { ucontext_t *uc = p; mcontext_t *mc = &uc->uc_mcontext; - unsigned long pending = 1UL << sig; - do { - int nested, bail; - - /* - * pending comes back with one bit set for each - * interrupt that arrived while setting up the stack, - * plus a bit for this interrupt, plus the zero bit is - * set if this is a nested interrupt. - * If bail is true, then we interrupted another - * handler setting up the stack. In this case, we - * have to return, and the upper handler will deal - * with this interrupt. - */ - bail = to_irq_stack(&pending); - if (bail) - return; - - nested = pending & 1; - pending &= ~1; - - while ((sig = ffs(pending)) != 0){ - sig--; - pending &= ~(1 << sig); - (*handlers[sig])(sig, (struct siginfo *)si, mc); - } - - /* - * Again, pending comes back with a mask of signals - * that arrived while tearing down the stack. If this - * is non-zero, we just go back, set up the stack - * again, and handle the new interrupts. - */ - if (!nested) - pending = from_irq_stack(nested); - } while (pending); + (*handlers[sig])(sig, (struct siginfo *)si, mc); } void set_handler(int sig) -- cgit From bed2cc482600296fe04edbc38005ba2851449c10 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Mon, 4 Nov 2024 20:34:40 +0800 Subject: hostfs: Fix the NULL vs IS_ERR() bug for __filemap_get_folio() The __filemap_get_folio() function returns error pointers. It never returns NULL. So use IS_ERR() to check it. Fixes: 1da86618bdce ("fs: Convert aops->write_begin to take a folio") Signed-off-by: ZhangPeng Acked-by: Johannes Berg Signed-off-by: Richard Weinberger --- fs/hostfs/hostfs_kern.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 8d47c6b70c9f..7e51d2cec64b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -472,8 +472,8 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping, *foliop = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); - if (!*foliop) - return -ENOMEM; + if (IS_ERR(*foliop)) + return PTR_ERR(*foliop); return 0; } -- cgit