diff options
86 files changed, 2602 insertions, 1529 deletions
diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index bdd344aa18d9..18c5feef2577 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -113,7 +113,15 @@ header, is usually reserved at an offset greater than boot memory size (see Fig. 1). This area is *not* released: this region will be kept permanently reserved, so that it can act as a receptacle for a copy of the boot memory content in addition to CPU state -and HPTE region, in the case a crash does occur. +and HPTE region, in the case a crash does occur. Since this reserved +memory area is used only after the system crash, there is no point in +blocking this significant chunk of memory from production kernel. +Hence, the implementation uses the Linux kernel's Contiguous Memory +Allocator (CMA) for memory reservation if CMA is configured for kernel. +With CMA reservation this memory will be available for applications to +use it, while kernel is prevented from using it. With this fadump will +still be able to capture all of the kernel memory and most of the user +space memory except the user pages that were present in CMA region. o Memory Reservation during first kernel @@ -162,6 +170,9 @@ How to enable firmware-assisted dump (fadump): 1. Set config option CONFIG_FA_DUMP=y and build kernel. 2. Boot into linux kernel with 'fadump=on' kernel cmdline option. + By default, fadump reserved memory will be initialized as CMA area. + Alternatively, user can boot linux kernel with 'fadump=nocma' to + prevent fadump to use CMA. 3. Optionally, user can also set 'crashkernel=' kernel cmdline to specify size of the memory to reserve for boot memory dump preservation. @@ -172,6 +183,10 @@ NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead 2. If firmware-assisted dump fails to reserve memory then it will fallback to existing kdump mechanism if 'crashkernel=' option is set at kernel cmdline. + 3. if user wants to capture all of user space memory and ok with + reserved memory not available to production system, then + 'fadump=nocma' kernel parameter can be used to fallback to + old behaviour. Sysfs/debugfs files: ------------ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6d6e1ffdafba..a1e858e42ada 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1143,7 +1143,7 @@ config PIN_TLB_DATA config PIN_TLB_IMMR bool "Pinned TLB for IMMR" - depends on PIN_TLB + depends on PIN_TLB || PPC_EARLY_DEBUG_CPM default y config PIN_TLB_TEXT diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index fa28e844b5b4..4c4e0eb16e2c 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -417,6 +417,9 @@ archclean: archprepare: checkbin +archheaders: + $(Q)$(MAKE) $(build)=arch/powerpc/kernel/syscalls all + ifdef CONFIG_STACKPROTECTOR prepare: stack_protector_prepare diff --git a/arch/powerpc/boot/serial.c b/arch/powerpc/boot/serial.c index f045f8494bf9..b0491b8c0199 100644 --- a/arch/powerpc/boot/serial.c +++ b/arch/powerpc/boot/serial.c @@ -93,7 +93,8 @@ static void *serial_get_stdout_devp(void) if (devp == NULL) goto err_out; - if (getprop(devp, "linux,stdout-path", path, MAX_PATH_LEN) > 0) { + if (getprop(devp, "linux,stdout-path", path, MAX_PATH_LEN) > 0 || + getprop(devp, "stdout-path", path, MAX_PATH_LEN) > 0) { devp = finddevice(path); if (devp == NULL) goto err_out; diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 3196d227e351..77ff7fb24823 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -1,3 +1,7 @@ +generated-y += syscall_table_32.h +generated-y += syscall_table_64.h +generated-y += syscall_table_c32.h +generated-y += syscall_table_spu.h generic-y += div64.h generic-y += export.h generic-y += irq_regs.h diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 15bc16b1dc9c..cf5ba5254299 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -1,11 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_POWERPC_BOOK3S_64_HASH_4K_H #define _ASM_POWERPC_BOOK3S_64_HASH_4K_H -/* - * Entries per page directory level. The PTE level must use a 64b record - * for each page table entry. The PMD and PGD level use a 32b record for - * each entry by assuming that each entry is page aligned. - */ + #define H_PTE_INDEX_SIZE 9 #define H_PMD_INDEX_SIZE 7 #define H_PUD_INDEX_SIZE 9 diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 1e7a33592e29..188776befaf9 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -48,6 +48,10 @@ #define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) +/* Alignement per CMA requirement. */ +#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \ + max_t(unsigned long, MAX_ORDER - 1, pageblock_order)) + /* Firmware provided dump sections */ #define FADUMP_CPU_STATE_DATA 0x0001 #define FADUMP_HPTE_REGION 0x0002 @@ -141,6 +145,7 @@ struct fw_dump { unsigned long fadump_supported:1; unsigned long dump_active:1; unsigned long dump_registered:1; + unsigned long nocma:1; }; /* @@ -200,7 +205,7 @@ struct fad_crash_memory_ranges { unsigned long long size; }; -extern int is_fadump_boot_memory_area(u64 addr, ulong size); +extern int is_fadump_memory_area(u64 addr, ulong size); extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); extern int fadump_reserve_mem(void); diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 35db0cbc9222..e847ff69cb2b 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -215,11 +215,12 @@ struct iommu_table_group { extern void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num); -extern int iommu_add_device(struct device *dev); +extern int iommu_add_device(struct iommu_table_group *table_group, + struct device *dev); extern void iommu_del_device(struct device *dev); -extern int __init tce_iommu_bus_notifier_init(void); -extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, - unsigned long *hpa, enum dma_data_direction *direction); +extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction); #else static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, @@ -227,7 +228,8 @@ static inline void iommu_register_group(struct iommu_table_group *table_group, { } -static inline int iommu_add_device(struct device *dev) +static inline int iommu_add_device(struct iommu_table_group *table_group, + struct device *dev) { return 0; } @@ -235,11 +237,6 @@ static inline int iommu_add_device(struct device *dev) static inline void iommu_del_device(struct device *dev) { } - -static inline int __init tce_iommu_bus_notifier_init(void) -{ - return 0; -} #endif /* !CONFIG_IOMMU_API */ int dma_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr); diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index c05efd2e8736..6ee8195a2ffb 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -21,9 +21,12 @@ struct mm_iommu_table_group_mem_t; extern int isolate_lru_page(struct page *page); /* from internal.h */ extern bool mm_iommu_preregistered(struct mm_struct *mm); -extern long mm_iommu_get(struct mm_struct *mm, +extern long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, struct mm_iommu_table_group_mem_t **pmem); +extern long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem); extern long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_init(struct mm_struct *mm); @@ -32,15 +35,23 @@ extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, unsigned long ua, unsigned long size); extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm( struct mm_struct *mm, unsigned long ua, unsigned long size); -extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, +extern struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries); extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa); extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa); extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua); +extern bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, + unsigned int pageshift, unsigned long *size); extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); +#else +static inline bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, + unsigned int pageshift, unsigned long *size) +{ + return false; +} #endif extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); extern void set_context(unsigned long id, pgd_t *pgd); @@ -217,12 +228,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, #endif } -static inline int arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ - return 0; -} - #ifdef CONFIG_PPC_BOOK3E_64 static inline void arch_exit_mmap(struct mm_struct *mm) { @@ -247,6 +252,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, #ifdef CONFIG_PPC_MEM_KEYS bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign); +void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm); #else /* CONFIG_PPC_MEM_KEYS */ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign) @@ -259,6 +265,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, #define thread_pkey_regs_save(thread) #define thread_pkey_regs_restore(new_thread, old_thread) #define thread_pkey_regs_init(thread) +#define arch_dup_pkeys(oldmm, mm) static inline u64 pte_to_hpte_pkey_bits(u64 pteflags) { @@ -267,5 +274,12 @@ static inline u64 pte_to_hpte_pkey_bits(u64 pteflags) #endif /* CONFIG_PPC_MEM_KEYS */ +static inline int arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + arch_dup_pkeys(oldmm, mm); + return 0; +} + #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_MMU_CONTEXT_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index ff3866473afe..a55b01c90bb1 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -347,6 +347,7 @@ extern int opal_async_comp_init(void); extern int opal_sensor_init(void); extern int opal_hmi_handler_init(void); extern int opal_event_init(void); +int opal_power_control_init(void); extern int opal_machine_check(struct pt_regs *regs); extern bool opal_mce_check_early_recovery(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 94d449031b18..aee4fcc24990 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -129,6 +129,7 @@ struct pci_controller { #endif /* CONFIG_PPC64 */ void *private_data; + struct npu *npu; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 2af9ded80540..0c72f1897063 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -129,5 +129,9 @@ extern void pcibios_scan_phb(struct pci_controller *hose); extern struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev); extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index); +extern int pnv_npu2_init(struct pci_controller *hose); +extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, + unsigned long msr); +extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev); #endif /* __ASM_POWERPC_PCI_H */ diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index ab9f3f0a8637..1a0e7a8b1c81 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -18,9 +18,8 @@ #include <linux/thread_info.h> /* ftrace syscalls requires exporting the sys_call_table */ -#ifdef CONFIG_FTRACE_SYSCALLS extern const unsigned long sys_call_table[]; -#endif /* CONFIG_FTRACE_SYSCALLS */ +extern const unsigned long compat_sys_call_table[]; static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h deleted file mode 100644 index 01b5171ea189..000000000000 --- a/arch/powerpc/include/asm/systbl.h +++ /dev/null @@ -1,396 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * List of powerpc syscalls. For the meaning of the _SPU suffix see - * arch/powerpc/platforms/cell/spu_callbacks.c - */ - -SYSCALL(restart_syscall) -SYSCALL(exit) -PPC_SYS(fork) -SYSCALL_SPU(read) -SYSCALL_SPU(write) -COMPAT_SYS_SPU(open) -SYSCALL_SPU(close) -SYSCALL_SPU(waitpid) -SYSCALL_SPU(creat) -SYSCALL_SPU(link) -SYSCALL_SPU(unlink) -COMPAT_SYS(execve) -SYSCALL_SPU(chdir) -COMPAT_SYS_SPU(time) -SYSCALL_SPU(mknod) -SYSCALL_SPU(chmod) -SYSCALL_SPU(lchown) -SYSCALL(ni_syscall) -OLDSYS(stat) -COMPAT_SYS_SPU(lseek) -SYSCALL_SPU(getpid) -COMPAT_SYS(mount) -SYSX(sys_ni_syscall,sys_oldumount,sys_oldumount) -SYSCALL_SPU(setuid) -SYSCALL_SPU(getuid) -COMPAT_SYS_SPU(stime) -COMPAT_SYS(ptrace) -SYSCALL_SPU(alarm) -OLDSYS(fstat) -SYSCALL(pause) -COMPAT_SYS(utime) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL_SPU(access) -SYSCALL_SPU(nice) -SYSCALL(ni_syscall) -SYSCALL_SPU(sync) -SYSCALL_SPU(kill) -SYSCALL_SPU(rename) -SYSCALL_SPU(mkdir) -SYSCALL_SPU(rmdir) -SYSCALL_SPU(dup) -SYSCALL_SPU(pipe) -COMPAT_SYS_SPU(times) -SYSCALL(ni_syscall) -SYSCALL_SPU(brk) -SYSCALL_SPU(setgid) -SYSCALL_SPU(getgid) -SYSCALL(signal) -SYSCALL_SPU(geteuid) -SYSCALL_SPU(getegid) -SYSCALL(acct) -SYSCALL(umount) -SYSCALL(ni_syscall) -COMPAT_SYS_SPU(ioctl) -COMPAT_SYS_SPU(fcntl) -SYSCALL(ni_syscall) -SYSCALL_SPU(setpgid) -SYSCALL(ni_syscall) -SYSX(sys_ni_syscall,sys_olduname,sys_olduname) -SYSCALL_SPU(umask) -SYSCALL_SPU(chroot) -COMPAT_SYS(ustat) -SYSCALL_SPU(dup2) -SYSCALL_SPU(getppid) -SYSCALL_SPU(getpgrp) -SYSCALL_SPU(setsid) -SYS32ONLY(sigaction) -SYSCALL_SPU(sgetmask) -SYSCALL_SPU(ssetmask) -SYSCALL_SPU(setreuid) -SYSCALL_SPU(setregid) -#define compat_sys_sigsuspend sys_sigsuspend -SYS32ONLY(sigsuspend) -SYSX(sys_ni_syscall,compat_sys_sigpending,sys_sigpending) -SYSCALL_SPU(sethostname) -COMPAT_SYS_SPU(setrlimit) -SYSX(sys_ni_syscall,compat_sys_old_getrlimit,sys_old_getrlimit) -COMPAT_SYS_SPU(getrusage) -COMPAT_SYS_SPU(gettimeofday) -COMPAT_SYS_SPU(settimeofday) -SYSCALL_SPU(getgroups) -SYSCALL_SPU(setgroups) -SYSX(sys_ni_syscall,sys_ni_syscall,ppc_select) -SYSCALL_SPU(symlink) -OLDSYS(lstat) -SYSCALL_SPU(readlink) -SYSCALL(uselib) -SYSCALL(swapon) -SYSCALL(reboot) -SYSX(sys_ni_syscall,compat_sys_old_readdir,sys_old_readdir) -SYSCALL_SPU(mmap) -SYSCALL_SPU(munmap) -COMPAT_SYS_SPU(truncate) -COMPAT_SYS_SPU(ftruncate) -SYSCALL_SPU(fchmod) -SYSCALL_SPU(fchown) -SYSCALL_SPU(getpriority) -SYSCALL_SPU(setpriority) -SYSCALL(ni_syscall) -COMPAT_SYS(statfs) -COMPAT_SYS(fstatfs) -SYSCALL(ni_syscall) -COMPAT_SYS_SPU(socketcall) -SYSCALL_SPU(syslog) -COMPAT_SYS_SPU(setitimer) -COMPAT_SYS_SPU(getitimer) -COMPAT_SYS_SPU(newstat) -COMPAT_SYS_SPU(newlstat) -COMPAT_SYS_SPU(newfstat) -SYSX(sys_ni_syscall,sys_uname,sys_uname) -SYSCALL(ni_syscall) -SYSCALL_SPU(vhangup) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -COMPAT_SYS_SPU(wait4) -SYSCALL(swapoff) -COMPAT_SYS_SPU(sysinfo) -COMPAT_SYS(ipc) -SYSCALL_SPU(fsync) -SYS32ONLY(sigreturn) -PPC_SYS(clone) -SYSCALL_SPU(setdomainname) -SYSCALL_SPU(newuname) -SYSCALL(ni_syscall) -COMPAT_SYS_SPU(adjtimex) -SYSCALL_SPU(mprotect) -SYSX(sys_ni_syscall,compat_sys_sigprocmask,sys_sigprocmask) -SYSCALL(ni_syscall) -SYSCALL(init_module) -SYSCALL(delete_module) -SYSCALL(ni_syscall) -SYSCALL(quotactl) -SYSCALL_SPU(getpgid) -SYSCALL_SPU(fchdir) -SYSCALL_SPU(bdflush) -SYSCALL_SPU(sysfs) -SYSX_SPU(ppc64_personality,ppc64_personality,sys_personality) -SYSCALL(ni_syscall) -SYSCALL_SPU(setfsuid) -SYSCALL_SPU(setfsgid) -SYSCALL_SPU(llseek) -COMPAT_SYS_SPU(getdents) -COMPAT_SPU_NEW(select) -SYSCALL_SPU(flock) -SYSCALL_SPU(msync) -COMPAT_SYS_SPU(readv) -COMPAT_SYS_SPU(writev) -SYSCALL_SPU(getsid) -SYSCALL_SPU(fdatasync) -COMPAT_SYS(sysctl) -SYSCALL_SPU(mlock) -SYSCALL_SPU(munlock) -SYSCALL_SPU(mlockall) -SYSCALL_SPU(munlockall) -SYSCALL_SPU(sched_setparam) -SYSCALL_SPU(sched_getparam) -SYSCALL_SPU(sched_setscheduler) -SYSCALL_SPU(sched_getscheduler) -SYSCALL_SPU(sched_yield) -SYSCALL_SPU(sched_get_priority_max) -SYSCALL_SPU(sched_get_priority_min) -COMPAT_SYS_SPU(sched_rr_get_interval) -COMPAT_SYS_SPU(nanosleep) -SYSCALL_SPU(mremap) -SYSCALL_SPU(setresuid) -SYSCALL_SPU(getresuid) -SYSCALL(ni_syscall) -SYSCALL_SPU(poll) -SYSCALL(ni_syscall) -SYSCALL_SPU(setresgid) -SYSCALL_SPU(getresgid) -SYSCALL_SPU(prctl) -COMPAT_SYS(rt_sigreturn) -COMPAT_SYS(rt_sigaction) -COMPAT_SYS(rt_sigprocmask) -COMPAT_SYS(rt_sigpending) -COMPAT_SYS(rt_sigtimedwait) -COMPAT_SYS(rt_sigqueueinfo) -COMPAT_SYS(rt_sigsuspend) -COMPAT_SYS_SPU(pread64) -COMPAT_SYS_SPU(pwrite64) -SYSCALL_SPU(chown) -SYSCALL_SPU(getcwd) -SYSCALL_SPU(capget) -SYSCALL_SPU(capset) -COMPAT_SYS(sigaltstack) -SYSX_SPU(sys_sendfile64,compat_sys_sendfile,sys_sendfile) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -PPC_SYS(vfork) -COMPAT_SYS_SPU(getrlimit) -COMPAT_SYS_SPU(readahead) -SYS32ONLY(mmap2) -SYS32ONLY(truncate64) -SYS32ONLY(ftruncate64) -SYSX(sys_ni_syscall,sys_stat64,sys_stat64) -SYSX(sys_ni_syscall,sys_lstat64,sys_lstat64) -SYSX(sys_ni_syscall,sys_fstat64,sys_fstat64) -SYSCALL(pciconfig_read) -SYSCALL(pciconfig_write) -SYSCALL(pciconfig_iobase) -SYSCALL(ni_syscall) -SYSCALL_SPU(getdents64) -SYSCALL_SPU(pivot_root) -SYSX(sys_ni_syscall,compat_sys_fcntl64,sys_fcntl64) -SYSCALL_SPU(madvise) -SYSCALL_SPU(mincore) -SYSCALL_SPU(gettid) -SYSCALL_SPU(tkill) -SYSCALL_SPU(setxattr) -SYSCALL_SPU(lsetxattr) -SYSCALL_SPU(fsetxattr) -SYSCALL_SPU(getxattr) -SYSCALL_SPU(lgetxattr) -SYSCALL_SPU(fgetxattr) -SYSCALL_SPU(listxattr) -SYSCALL_SPU(llistxattr) -SYSCALL_SPU(flistxattr) -SYSCALL_SPU(removexattr) -SYSCALL_SPU(lremovexattr) -SYSCALL_SPU(fremovexattr) -COMPAT_SYS_SPU(futex) -COMPAT_SYS_SPU(sched_setaffinity) -COMPAT_SYS_SPU(sched_getaffinity) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYS32ONLY(sendfile64) -COMPAT_SYS_SPU(io_setup) -SYSCALL_SPU(io_destroy) -COMPAT_SYS_SPU(io_getevents) -COMPAT_SYS_SPU(io_submit) -SYSCALL_SPU(io_cancel) -SYSCALL(set_tid_address) -SYSX_SPU(sys_fadvise64,ppc32_fadvise64,sys_fadvise64) -SYSCALL(exit_group) -COMPAT_SYS(lookup_dcookie) -SYSCALL_SPU(epoll_create) -SYSCALL_SPU(epoll_ctl) -SYSCALL_SPU(epoll_wait) -SYSCALL_SPU(remap_file_pages) -COMPAT_SYS_SPU(timer_create) -COMPAT_SYS_SPU(timer_settime) -COMPAT_SYS_SPU(timer_gettime) -SYSCALL_SPU(timer_getoverrun) -SYSCALL_SPU(timer_delete) -COMPAT_SYS_SPU(clock_settime) -COMPAT_SYS_SPU(clock_gettime) -COMPAT_SYS_SPU(clock_getres) -COMPAT_SYS_SPU(clock_nanosleep) -SYSX(ppc64_swapcontext,ppc32_swapcontext,ppc_swapcontext) -SYSCALL_SPU(tgkill) -COMPAT_SYS_SPU(utimes) -COMPAT_SYS_SPU(statfs64) -COMPAT_SYS_SPU(fstatfs64) -SYSX(sys_ni_syscall,ppc_fadvise64_64,ppc_fadvise64_64) -SYSCALL_SPU(rtas) -OLDSYS(debug_setcontext) -SYSCALL(ni_syscall) -COMPAT_SYS(migrate_pages) -COMPAT_SYS(mbind) -COMPAT_SYS(get_mempolicy) -COMPAT_SYS(set_mempolicy) -COMPAT_SYS(mq_open) -SYSCALL(mq_unlink) -COMPAT_SYS(mq_timedsend) -COMPAT_SYS(mq_timedreceive) -COMPAT_SYS(mq_notify) -COMPAT_SYS(mq_getsetattr) -COMPAT_SYS(kexec_load) -SYSCALL(add_key) -SYSCALL(request_key) -COMPAT_SYS(keyctl) -COMPAT_SYS(waitid) -SYSCALL(ioprio_set) -SYSCALL(ioprio_get) -SYSCALL(inotify_init) -SYSCALL(inotify_add_watch) -SYSCALL(inotify_rm_watch) -SYSCALL(spu_run) -SYSCALL(spu_create) -COMPAT_SYS(pselect6) -COMPAT_SYS(ppoll) -SYSCALL_SPU(unshare) -SYSCALL_SPU(splice) -SYSCALL_SPU(tee) -COMPAT_SYS_SPU(vmsplice) -COMPAT_SYS_SPU(openat) -SYSCALL_SPU(mkdirat) -SYSCALL_SPU(mknodat) -SYSCALL_SPU(fchownat) -COMPAT_SYS_SPU(futimesat) -SYSX_SPU(sys_newfstatat,sys_fstatat64,sys_fstatat64) -SYSCALL_SPU(unlinkat) -SYSCALL_SPU(renameat) -SYSCALL_SPU(linkat) -SYSCALL_SPU(symlinkat) -SYSCALL_SPU(readlinkat) -SYSCALL_SPU(fchmodat) -SYSCALL_SPU(faccessat) -COMPAT_SYS_SPU(get_robust_list) -COMPAT_SYS_SPU(set_robust_list) -COMPAT_SYS_SPU(move_pages) -SYSCALL_SPU(getcpu) -COMPAT_SYS(epoll_pwait) -COMPAT_SYS_SPU(utimensat) -COMPAT_SYS_SPU(signalfd) -SYSCALL_SPU(timerfd_create) -SYSCALL_SPU(eventfd) -COMPAT_SYS_SPU(sync_file_range2) -COMPAT_SYS(fallocate) -SYSCALL(subpage_prot) -COMPAT_SYS_SPU(timerfd_settime) -COMPAT_SYS_SPU(timerfd_gettime) -COMPAT_SYS_SPU(signalfd4) -SYSCALL_SPU(eventfd2) -SYSCALL_SPU(epoll_create1) -SYSCALL_SPU(dup3) -SYSCALL_SPU(pipe2) -SYSCALL(inotify_init1) -SYSCALL_SPU(perf_event_open) -COMPAT_SYS_SPU(preadv) -COMPAT_SYS_SPU(pwritev) -COMPAT_SYS(rt_tgsigqueueinfo) -SYSCALL(fanotify_init) -COMPAT_SYS(fanotify_mark) -SYSCALL_SPU(prlimit64) -SYSCALL_SPU(socket) -SYSCALL_SPU(bind) -SYSCALL_SPU(connect) -SYSCALL_SPU(listen) -SYSCALL_SPU(accept) -SYSCALL_SPU(getsockname) -SYSCALL_SPU(getpeername) -SYSCALL_SPU(socketpair) -SYSCALL_SPU(send) -SYSCALL_SPU(sendto) -COMPAT_SYS_SPU(recv) -COMPAT_SYS_SPU(recvfrom) -SYSCALL_SPU(shutdown) -COMPAT_SYS_SPU(setsockopt) -COMPAT_SYS_SPU(getsockopt) -COMPAT_SYS_SPU(sendmsg) -COMPAT_SYS_SPU(recvmsg) -COMPAT_SYS_SPU(recvmmsg) -SYSCALL_SPU(accept4) -SYSCALL_SPU(name_to_handle_at) -COMPAT_SYS_SPU(open_by_handle_at) -COMPAT_SYS_SPU(clock_adjtime) -SYSCALL_SPU(syncfs) -COMPAT_SYS_SPU(sendmmsg) -SYSCALL_SPU(setns) -COMPAT_SYS(process_vm_readv) -COMPAT_SYS(process_vm_writev) -SYSCALL(finit_module) -SYSCALL(kcmp) /* sys_kcmp */ -SYSCALL_SPU(sched_setattr) -SYSCALL_SPU(sched_getattr) -SYSCALL_SPU(renameat2) -SYSCALL_SPU(seccomp) -SYSCALL_SPU(getrandom) -SYSCALL_SPU(memfd_create) -SYSCALL_SPU(bpf) -COMPAT_SYS(execveat) -PPC64ONLY(switch_endian) -SYSCALL_SPU(userfaultfd) -SYSCALL_SPU(membarrier) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYSCALL(mlock2) -SYSCALL(copy_file_range) -COMPAT_SYS_SPU(preadv2) -COMPAT_SYS_SPU(pwritev2) -SYSCALL(kexec_file_load) -SYSCALL(statx) -SYSCALL(pkey_alloc) -SYSCALL(pkey_free) -SYSCALL(pkey_mprotect) -SYSCALL(rseq) -COMPAT_SYS(io_pgetevents) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index b0de85b477e1..a3c35e6d6ffb 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -11,8 +11,7 @@ #include <uapi/asm/unistd.h> - -#define NR_syscalls 389 +#define NR_syscalls __NR_syscalls #define __NR__exit __NR_exit diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index 3712152206f3..8ab8ba1b71bc 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -1,6 +1,8 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generated-y += unistd_32.h +generated-y += unistd_64.h generic-y += param.h generic-y += poll.h generic-y += resource.h diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h index 985534d0b448..5f84e3dc98d0 100644 --- a/arch/powerpc/include/uapi/asm/unistd.h +++ b/arch/powerpc/include/uapi/asm/unistd.h @@ -10,395 +10,10 @@ #ifndef _UAPI_ASM_POWERPC_UNISTD_H_ #define _UAPI_ASM_POWERPC_UNISTD_H_ - -#define __NR_restart_syscall 0 -#define __NR_exit 1 -#define __NR_fork 2 -#define __NR_read 3 -#define __NR_write 4 -#define __NR_open 5 -#define __NR_close 6 -#define __NR_waitpid 7 -#define __NR_creat 8 -#define __NR_link 9 -#define __NR_unlink 10 -#define __NR_execve 11 -#define __NR_chdir 12 -#define __NR_time 13 -#define __NR_mknod 14 -#define __NR_chmod 15 -#define __NR_lchown 16 -#define __NR_break 17 -#define __NR_oldstat 18 -#define __NR_lseek 19 -#define __NR_getpid 20 -#define __NR_mount 21 -#define __NR_umount 22 -#define __NR_setuid 23 -#define __NR_getuid 24 -#define __NR_stime 25 -#define __NR_ptrace 26 -#define __NR_alarm 27 -#define __NR_oldfstat 28 -#define __NR_pause 29 -#define __NR_utime 30 -#define __NR_stty 31 -#define __NR_gtty 32 -#define __NR_access 33 -#define __NR_nice 34 -#define __NR_ftime 35 -#define __NR_sync 36 -#define __NR_kill 37 -#define __NR_rename 38 -#define __NR_mkdir 39 -#define __NR_rmdir 40 -#define __NR_dup 41 -#define __NR_pipe 42 -#define __NR_times 43 -#define __NR_prof 44 -#define __NR_brk 45 -#define __NR_setgid 46 -#define __NR_getgid 47 -#define __NR_signal 48 -#define __NR_geteuid 49 -#define __NR_getegid 50 -#define __NR_acct 51 -#define __NR_umount2 52 -#define __NR_lock 53 -#define __NR_ioctl 54 -#define __NR_fcntl 55 -#define __NR_mpx 56 -#define __NR_setpgid 57 -#define __NR_ulimit 58 -#define __NR_oldolduname 59 -#define __NR_umask 60 -#define __NR_chroot 61 -#define __NR_ustat 62 -#define __NR_dup2 63 -#define __NR_getppid 64 -#define __NR_getpgrp 65 -#define __NR_setsid 66 -#define __NR_sigaction 67 -#define __NR_sgetmask 68 -#define __NR_ssetmask 69 -#define __NR_setreuid 70 -#define __NR_setregid 71 -#define __NR_sigsuspend 72 -#define __NR_sigpending 73 -#define __NR_sethostname 74 -#define __NR_setrlimit 75 -#define __NR_getrlimit 76 -#define __NR_getrusage 77 -#define __NR_gettimeofday 78 -#define __NR_settimeofday 79 -#define __NR_getgroups 80 -#define __NR_setgroups 81 -#define __NR_select 82 -#define __NR_symlink 83 -#define __NR_oldlstat 84 -#define __NR_readlink 85 -#define __NR_uselib 86 -#define __NR_swapon 87 -#define __NR_reboot 88 -#define __NR_readdir 89 -#define __NR_mmap 90 -#define __NR_munmap 91 -#define __NR_truncate 92 -#define __NR_ftruncate 93 -#define __NR_fchmod 94 -#define __NR_fchown 95 -#define __NR_getpriority 96 -#define __NR_setpriority 97 -#define __NR_profil 98 -#define __NR_statfs 99 -#define __NR_fstatfs 100 -#define __NR_ioperm 101 -#define __NR_socketcall 102 -#define __NR_syslog 103 -#define __NR_setitimer 104 -#define __NR_getitimer 105 -#define __NR_stat 106 -#define __NR_lstat 107 -#define __NR_fstat 108 -#define __NR_olduname 109 -#define __NR_iopl 110 -#define __NR_vhangup 111 -#define __NR_idle 112 -#define __NR_vm86 113 -#define __NR_wait4 114 -#define __NR_swapoff 115 -#define __NR_sysinfo 116 -#define __NR_ipc 117 -#define __NR_fsync 118 -#define __NR_sigreturn 119 -#define __NR_clone 120 -#define __NR_setdomainname 121 -#define __NR_uname 122 -#define __NR_modify_ldt 123 -#define __NR_adjtimex 124 -#define __NR_mprotect 125 -#define __NR_sigprocmask 126 -#define __NR_create_module 127 -#define __NR_init_module 128 -#define __NR_delete_module 129 -#define __NR_get_kernel_syms 130 -#define __NR_quotactl 131 -#define __NR_getpgid 132 -#define __NR_fchdir 133 -#define __NR_bdflush 134 -#define __NR_sysfs 135 -#define __NR_personality 136 -#define __NR_afs_syscall 137 /* Syscall for Andrew File System */ -#define __NR_setfsuid 138 -#define __NR_setfsgid 139 -#define __NR__llseek 140 -#define __NR_getdents 141 -#define __NR__newselect 142 -#define __NR_flock 143 -#define __NR_msync 144 -#define __NR_readv 145 -#define __NR_writev 146 -#define __NR_getsid 147 -#define __NR_fdatasync 148 -#define __NR__sysctl 149 -#define __NR_mlock 150 -#define __NR_munlock 151 -#define __NR_mlockall 152 -#define __NR_munlockall 153 -#define __NR_sched_setparam 154 -#define __NR_sched_getparam 155 -#define __NR_sched_setscheduler 156 -#define __NR_sched_getscheduler 157 -#define __NR_sched_yield 158 -#define __NR_sched_get_priority_max 159 -#define __NR_sched_get_priority_min 160 -#define __NR_sched_rr_get_interval 161 -#define __NR_nanosleep 162 -#define __NR_mremap 163 -#define __NR_setresuid 164 -#define __NR_getresuid 165 -#define __NR_query_module 166 -#define __NR_poll 167 -#define __NR_nfsservctl 168 -#define __NR_setresgid 169 -#define __NR_getresgid 170 -#define __NR_prctl 171 -#define __NR_rt_sigreturn 172 -#define __NR_rt_sigaction 173 -#define __NR_rt_sigprocmask 174 -#define __NR_rt_sigpending 175 -#define __NR_rt_sigtimedwait 176 -#define __NR_rt_sigqueueinfo 177 -#define __NR_rt_sigsuspend 178 -#define __NR_pread64 179 -#define __NR_pwrite64 180 -#define __NR_chown 181 -#define __NR_getcwd 182 -#define __NR_capget 183 -#define __NR_capset 184 -#define __NR_sigaltstack 185 -#define __NR_sendfile 186 -#define __NR_getpmsg 187 /* some people actually want streams */ -#define __NR_putpmsg 188 /* some people actually want streams */ -#define __NR_vfork 189 -#define __NR_ugetrlimit 190 /* SuS compliant getrlimit */ -#define __NR_readahead 191 -#ifndef __powerpc64__ /* these are 32-bit only */ -#define __NR_mmap2 192 -#define __NR_truncate64 193 -#define __NR_ftruncate64 194 -#define __NR_stat64 195 -#define __NR_lstat64 196 -#define __NR_fstat64 197 -#endif -#define __NR_pciconfig_read 198 -#define __NR_pciconfig_write 199 -#define __NR_pciconfig_iobase 200 -#define __NR_multiplexer 201 -#define __NR_getdents64 202 -#define __NR_pivot_root 203 -#ifndef __powerpc64__ -#define __NR_fcntl64 204 -#endif -#define __NR_madvise 205 -#define __NR_mincore 206 -#define __NR_gettid 207 -#define __NR_tkill 208 -#define __NR_setxattr 209 -#define __NR_lsetxattr 210 -#define __NR_fsetxattr 211 -#define __NR_getxattr 212 -#define __NR_lgetxattr 213 -#define __NR_fgetxattr 214 -#define __NR_listxattr 215 -#define __NR_llistxattr 216 -#define __NR_flistxattr 217 -#define __NR_removexattr 218 -#define __NR_lremovexattr 219 -#define __NR_fremovexattr 220 -#define __NR_futex 221 -#define __NR_sched_setaffinity 222 -#define __NR_sched_getaffinity 223 -/* 224 currently unused */ -#define __NR_tuxcall 225 #ifndef __powerpc64__ -#define __NR_sendfile64 226 -#endif -#define __NR_io_setup 227 -#define __NR_io_destroy 228 -#define __NR_io_getevents 229 -#define __NR_io_submit 230 -#define __NR_io_cancel 231 -#define __NR_set_tid_address 232 -#define __NR_fadvise64 233 -#define __NR_exit_group 234 -#define __NR_lookup_dcookie 235 -#define __NR_epoll_create 236 -#define __NR_epoll_ctl 237 -#define __NR_epoll_wait 238 -#define __NR_remap_file_pages 239 -#define __NR_timer_create 240 -#define __NR_timer_settime 241 -#define __NR_timer_gettime 242 -#define __NR_timer_getoverrun 243 -#define __NR_timer_delete 244 -#define __NR_clock_settime 245 -#define __NR_clock_gettime 246 -#define __NR_clock_getres 247 -#define __NR_clock_nanosleep 248 -#define __NR_swapcontext 249 -#define __NR_tgkill 250 -#define __NR_utimes 251 -#define __NR_statfs64 252 -#define __NR_fstatfs64 253 -#ifndef __powerpc64__ -#define __NR_fadvise64_64 254 -#endif -#define __NR_rtas 255 -#define __NR_sys_debug_setcontext 256 -/* Number 257 is reserved for vserver */ -#define __NR_migrate_pages 258 -#define __NR_mbind 259 -#define __NR_get_mempolicy 260 -#define __NR_set_mempolicy 261 -#define __NR_mq_open 262 -#define __NR_mq_unlink 263 -#define __NR_mq_timedsend 264 -#define __NR_mq_timedreceive 265 -#define __NR_mq_notify 266 -#define __NR_mq_getsetattr 267 -#define __NR_kexec_load 268 -#define __NR_add_key 269 -#define __NR_request_key 270 -#define __NR_keyctl 271 -#define __NR_waitid 272 -#define __NR_ioprio_set 273 -#define __NR_ioprio_get 274 -#define __NR_inotify_init 275 -#define __NR_inotify_add_watch 276 -#define __NR_inotify_rm_watch 277 -#define __NR_spu_run 278 -#define __NR_spu_create 279 -#define __NR_pselect6 280 -#define __NR_ppoll 281 -#define __NR_unshare 282 -#define __NR_splice 283 -#define __NR_tee 284 -#define __NR_vmsplice 285 -#define __NR_openat 286 -#define __NR_mkdirat 287 -#define __NR_mknodat 288 -#define __NR_fchownat 289 -#define __NR_futimesat 290 -#ifdef __powerpc64__ -#define __NR_newfstatat 291 +#include <asm/unistd_32.h> #else -#define __NR_fstatat64 291 +#include <asm/unistd_64.h> #endif -#define __NR_unlinkat 292 -#define __NR_renameat 293 -#define __NR_linkat 294 -#define __NR_symlinkat 295 -#define __NR_readlinkat 296 -#define __NR_fchmodat 297 -#define __NR_faccessat 298 -#define __NR_get_robust_list 299 -#define __NR_set_robust_list 300 -#define __NR_move_pages 301 -#define __NR_getcpu 302 -#define __NR_epoll_pwait 303 -#define __NR_utimensat 304 -#define __NR_signalfd 305 -#define __NR_timerfd_create 306 -#define __NR_eventfd 307 -#define __NR_sync_file_range2 308 -#define __NR_fallocate 309 -#define __NR_subpage_prot 310 -#define __NR_timerfd_settime 311 -#define __NR_timerfd_gettime 312 -#define __NR_signalfd4 313 -#define __NR_eventfd2 314 -#define __NR_epoll_create1 315 -#define __NR_dup3 316 -#define __NR_pipe2 317 -#define __NR_inotify_init1 318 -#define __NR_perf_event_open 319 -#define __NR_preadv 320 -#define __NR_pwritev 321 -#define __NR_rt_tgsigqueueinfo 322 -#define __NR_fanotify_init 323 -#define __NR_fanotify_mark 324 -#define __NR_prlimit64 325 -#define __NR_socket 326 -#define __NR_bind 327 -#define __NR_connect 328 -#define __NR_listen 329 -#define __NR_accept 330 -#define __NR_getsockname 331 -#define __NR_getpeername 332 -#define __NR_socketpair 333 -#define __NR_send 334 -#define __NR_sendto 335 -#define __NR_recv 336 -#define __NR_recvfrom 337 -#define __NR_shutdown 338 -#define __NR_setsockopt 339 -#define __NR_getsockopt 340 -#define __NR_sendmsg 341 -#define __NR_recvmsg 342 -#define __NR_recvmmsg 343 -#define __NR_accept4 344 -#define __NR_name_to_handle_at 345 -#define __NR_open_by_handle_at 346 -#define __NR_clock_adjtime 347 -#define __NR_syncfs 348 -#define __NR_sendmmsg 349 -#define __NR_setns 350 -#define __NR_process_vm_readv 351 -#define __NR_process_vm_writev 352 -#define __NR_finit_module 353 -#define __NR_kcmp 354 -#define __NR_sched_setattr 355 -#define __NR_sched_getattr 356 -#define __NR_renameat2 357 -#define __NR_seccomp 358 -#define __NR_getrandom 359 -#define __NR_memfd_create 360 -#define __NR_bpf 361 -#define __NR_execveat 362 -#define __NR_switch_endian 363 -#define __NR_userfaultfd 364 -#define __NR_membarrier 365 -#define __NR_mlock2 378 -#define __NR_copy_file_range 379 -#define __NR_preadv2 380 -#define __NR_pwritev2 381 -#define __NR_kexec_file_load 382 -#define __NR_statx 383 -#define __NR_pkey_alloc 384 -#define __NR_pkey_free 385 -#define __NR_pkey_mprotect 386 -#define __NR_rseq 387 -#define __NR_io_pgetevents 388 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index a5a6a243f3cf..cb7f0bb9ee71 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -160,16 +160,6 @@ extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o -extra-y += systbl_chk.i -$(obj)/systbl.o: systbl_chk - -quiet_cmd_systbl_chk = CALL $< - cmd_systbl_chk = $(CONFIG_SHELL) $< $(obj)/systbl_chk.i - -PHONY += systbl_chk -systbl_chk: $(src)/systbl_chk.sh $(obj)/systbl_chk.i - $(call cmd,systbl_chk) - ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE $(obj)/built-in.a: prom_init_check diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 7c2032e83ce7..435927f549c4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -54,6 +54,9 @@ SYS_CALL_TABLE: .tc sys_call_table[TC],sys_call_table +COMPAT_SYS_CALL_TABLE: + .tc compat_sys_call_table[TC],compat_sys_call_table + /* This value is used to mark exception frames on the stack. */ exception_marker: .tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER @@ -178,7 +181,7 @@ system_call: /* label this so stack traces look sane */ ld r11,SYS_CALL_TABLE@toc(2) andis. r10,r10,_TIF_32BIT@h beq 15f - addi r11,r11,8 /* use 32-bit syscall entries */ + ld r11,COMPAT_SYS_CALL_TABLE@toc(2) clrldi r3,r3,32 clrldi r4,r4,32 clrldi r5,r5,32 @@ -186,7 +189,7 @@ system_call: /* label this so stack traces look sane */ clrldi r7,r7,32 clrldi r8,r8,32 15: - slwi r0,r0,4 + slwi r0,r0,3 barrier_nospec_asm /* @@ -291,6 +294,10 @@ BEGIN_FTR_SECTION HMT_MEDIUM_LOW END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + std r8, PACATMSCRATCH(r13) +#endif + ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ ld r2,GPR2(r1) ld r1,GPR1(r1) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 89d32bb79d5e..ed3d6fa8be53 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1031,7 +1031,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early) EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD - BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */ + BRANCH_LINK_TO_FAR(DOTSYM(hmi_exception_realmode)) /* Function call ABI */ cmpdi cr0,r3,0 /* Windup the stack. */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 7a0da83bf883..45a8d0be1c96 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -35,6 +35,7 @@ #include <linux/kobject.h> #include <linux/sysfs.h> #include <linux/slab.h> +#include <linux/cma.h> #include <asm/debugfs.h> #include <asm/page.h> @@ -46,6 +47,9 @@ static struct fw_dump fw_dump; static struct fadump_mem_struct fdm; static const struct fadump_mem_struct *fdm_active; +#ifdef CONFIG_CMA +static struct cma *fadump_cma; +#endif static DEFINE_MUTEX(fadump_mutex); struct fad_crash_memory_ranges *crash_memory_ranges; @@ -53,6 +57,67 @@ int crash_memory_ranges_size; int crash_mem_ranges; int max_crash_mem_ranges; +#ifdef CONFIG_CMA +/* + * fadump_cma_init() - Initialize CMA area from a fadump reserved memory + * + * This function initializes CMA area from fadump reserved memory. + * The total size of fadump reserved memory covers for boot memory size + * + cpu data size + hpte size and metadata. + * Initialize only the area equivalent to boot memory size for CMA use. + * The reamining portion of fadump reserved memory will be not given + * to CMA and pages for thoes will stay reserved. boot memory size is + * aligned per CMA requirement to satisy cma_init_reserved_mem() call. + * But for some reason even if it fails we still have the memory reservation + * with us and we can still continue doing fadump. + */ +int __init fadump_cma_init(void) +{ + unsigned long long base, size; + int rc; + + if (!fw_dump.fadump_enabled) + return 0; + + /* + * Do not use CMA if user has provided fadump=nocma kernel parameter. + * Return 1 to continue with fadump old behaviour. + */ + if (fw_dump.nocma) + return 1; + + base = fw_dump.reserve_dump_area_start; + size = fw_dump.boot_memory_size; + + if (!size) + return 0; + + rc = cma_init_reserved_mem(base, size, 0, "fadump_cma", &fadump_cma); + if (rc) { + pr_err("Failed to init cma area for firmware-assisted dump,%d\n", rc); + /* + * Though the CMA init has failed we still have memory + * reservation with us. The reserved memory will be + * blocked from production system usage. Hence return 1, + * so that we can continue with fadump. + */ + return 1; + } + + /* + * So we now have successfully initialized cma area for fadump. + */ + pr_info("Initialized 0x%lx bytes cma area at %ldMB from 0x%lx " + "bytes of memory reserved for firmware-assisted dump\n", + cma_get_size(fadump_cma), + (unsigned long)cma_get_base(fadump_cma) >> 20, + fw_dump.reserve_dump_area_size); + return 1; +} +#else +static int __init fadump_cma_init(void) { return 1; } +#endif /* CONFIG_CMA */ + /* Scan the Firmware Assisted dump configuration details. */ int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) @@ -118,13 +183,19 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, /* * If fadump is registered, check if the memory provided - * falls within boot memory area. + * falls within boot memory area and reserved memory area. */ -int is_fadump_boot_memory_area(u64 addr, ulong size) +int is_fadump_memory_area(u64 addr, ulong size) { + u64 d_start = fw_dump.reserve_dump_area_start; + u64 d_end = d_start + fw_dump.reserve_dump_area_size; + if (!fw_dump.dump_registered) return 0; + if (((addr + size) > d_start) && (addr <= d_end)) + return 1; + return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; } @@ -172,6 +243,35 @@ static int is_boot_memory_area_contiguous(void) return ret; } +/* + * Returns true, if there are no holes in reserved memory area, + * false otherwise. + */ +static bool is_reserved_memory_area_contiguous(void) +{ + struct memblock_region *reg; + unsigned long start, end; + unsigned long d_start = fw_dump.reserve_dump_area_start; + unsigned long d_end = d_start + fw_dump.reserve_dump_area_size; + + for_each_memblock(memory, reg) { + start = max(d_start, (unsigned long)reg->base); + end = min(d_end, (unsigned long)(reg->base + reg->size)); + if (d_start < end) { + /* Memory hole from d_start to start */ + if (start > d_start) + break; + + if (end == d_end) + return true; + + d_start = end + 1; + } + } + + return false; +} + /* Print firmware assisted dump configurations for debugging purpose. */ static void fadump_show_config(void) { @@ -378,8 +478,15 @@ int __init fadump_reserve_mem(void) */ if (fdm_active) fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len); - else + else { fw_dump.boot_memory_size = fadump_calculate_reserve_size(); +#ifdef CONFIG_CMA + if (!fw_dump.nocma) + fw_dump.boot_memory_size = + ALIGN(fw_dump.boot_memory_size, + FADUMP_CMA_ALIGNMENT); +#endif + } /* * Calculate the memory boundary. @@ -426,8 +533,9 @@ int __init fadump_reserve_mem(void) fw_dump.fadumphdr_addr = be64_to_cpu(fdm_active->rmr_region.destination_address) + be64_to_cpu(fdm_active->rmr_region.source_len); - pr_debug("fadumphdr_addr = %p\n", - (void *) fw_dump.fadumphdr_addr); + pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr); + fw_dump.reserve_dump_area_start = base; + fw_dump.reserve_dump_area_size = size; } else { size = get_fadump_area_size(); @@ -455,10 +563,11 @@ int __init fadump_reserve_mem(void) (unsigned long)(size >> 20), (unsigned long)(base >> 20), (unsigned long)(memblock_phys_mem_size() >> 20)); - } - fw_dump.reserve_dump_area_start = base; - fw_dump.reserve_dump_area_size = size; + fw_dump.reserve_dump_area_start = base; + fw_dump.reserve_dump_area_size = size; + return fadump_cma_init(); + } return 1; } @@ -477,6 +586,10 @@ static int __init early_fadump_param(char *p) fw_dump.fadump_enabled = 1; else if (strncmp(p, "off", 3) == 0) fw_dump.fadump_enabled = 0; + else if (strncmp(p, "nocma", 5) == 0) { + fw_dump.fadump_enabled = 1; + fw_dump.nocma = 1; + } return 0; } @@ -525,8 +638,10 @@ static int register_fw_dump(struct fadump_mem_struct *fdm) break; case -3: if (!is_boot_memory_area_contiguous()) - pr_err("Can't have holes in boot memory area while " - "registering fadump\n"); + pr_err("Can't have holes in boot memory area while registering fadump\n"); + else if (!is_reserved_memory_area_contiguous()) + pr_err("Can't have holes in reserved memory area while" + " registering fadump\n"); printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Parameter Error(%d).\n", rc); @@ -1229,7 +1344,7 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm) return 0; } -static int fadump_invalidate_dump(struct fadump_mem_struct *fdm) +static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm) { int rc = 0; unsigned int wait_time; @@ -1260,9 +1375,8 @@ void fadump_cleanup(void) { /* Invalidate the registration only if dump is active. */ if (fw_dump.dump_active) { - init_fadump_mem_struct(&fdm, - be64_to_cpu(fdm_active->cpu_state_data.destination_address)); - fadump_invalidate_dump(&fdm); + /* pass the same memory dump structure provided by platform */ + fadump_invalidate_dump(fdm_active); } else if (fw_dump.dump_registered) { /* Un-register Firmware-assisted dump if it was registered. */ fadump_unregister_dump(&fdm); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index f0dc680e659a..9d5d109f15c0 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -47,6 +47,7 @@ #include <asm/fadump.h> #include <asm/vio.h> #include <asm/tce.h> +#include <asm/mmu_context.h> #define DBG(...) @@ -993,15 +994,19 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) } EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); -long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, - unsigned long *hpa, enum dma_data_direction *direction) +long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction) { long ret; + unsigned long size = 0; ret = tbl->it_ops->exchange(tbl, entry, hpa, direction); if (!ret && ((*direction == DMA_FROM_DEVICE) || - (*direction == DMA_BIDIRECTIONAL))) + (*direction == DMA_BIDIRECTIONAL)) && + !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, + &size)) SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); /* if (unlikely(ret)) @@ -1073,11 +1078,8 @@ void iommu_release_ownership(struct iommu_table *tbl) } EXPORT_SYMBOL_GPL(iommu_release_ownership); -int iommu_add_device(struct device *dev) +int iommu_add_device(struct iommu_table_group *table_group, struct device *dev) { - struct iommu_table *tbl; - struct iommu_table_group_link *tgl; - /* * The sysfs entries should be populated before * binding IOMMU group. If sysfs entries isn't @@ -1093,32 +1095,10 @@ int iommu_add_device(struct device *dev) return -EBUSY; } - tbl = get_iommu_table_base(dev); - if (!tbl) { - pr_debug("%s: Skipping device %s with no tbl\n", - __func__, dev_name(dev)); - return 0; - } - - tgl = list_first_entry_or_null(&tbl->it_group_list, - struct iommu_table_group_link, next); - if (!tgl) { - pr_debug("%s: Skipping device %s with no group\n", - __func__, dev_name(dev)); - return 0; - } pr_debug("%s: Adding %s to iommu group %d\n", - __func__, dev_name(dev), - iommu_group_id(tgl->table_group->group)); - - if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) { - pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n", - __func__, IOMMU_PAGE_SIZE(tbl), - PAGE_SIZE, dev_name(dev)); - return -EINVAL; - } + __func__, dev_name(dev), iommu_group_id(table_group->group)); - return iommu_group_add_device(tgl->table_group->group, dev); + return iommu_group_add_device(table_group->group, dev); } EXPORT_SYMBOL_GPL(iommu_add_device); @@ -1138,31 +1118,4 @@ void iommu_del_device(struct device *dev) iommu_group_remove_device(dev); } EXPORT_SYMBOL_GPL(iommu_del_device); - -static int tce_iommu_bus_notifier(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct device *dev = data; - - switch (action) { - case BUS_NOTIFY_ADD_DEVICE: - return iommu_add_device(dev); - case BUS_NOTIFY_DEL_DEVICE: - if (dev->iommu_group) - iommu_del_device(dev); - return 0; - default: - return 0; - } -} - -static struct notifier_block tce_iommu_bus_nb = { - .notifier_call = tce_iommu_bus_notifier, -}; - -int __init tce_iommu_bus_notifier_init(void) -{ - bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb); - return 0; -} #endif /* CONFIG_IOMMU_API */ diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index fde632335a97..7cea5978f21f 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -400,8 +400,7 @@ void __init find_legacy_serial_ports(void) /* Next, fill our array with ISA ports */ for_each_node_by_type(np, "serial") { struct device_node *isa = of_get_parent(np); - if (isa && (!strcmp(isa->name, "isa") || - !strcmp(isa->name, "lpc"))) { + if (of_node_name_eq(isa, "isa") || of_node_name_eq(isa, "lpc")) { if (of_device_is_available(np)) { index = add_legacy_isa_port(np, isa); if (index >= 0 && np == stdout) @@ -415,11 +414,11 @@ void __init find_legacy_serial_ports(void) /* Next, try to locate PCI ports */ for (np = NULL; (np = of_find_all_nodes(np));) { struct device_node *pci, *parent = of_get_parent(np); - if (parent && !strcmp(parent->name, "isa")) { + if (of_node_name_eq(parent, "isa")) { of_node_put(parent); continue; } - if (strcmp(np->name, "serial") && + if (!of_node_name_eq(np, "serial") && !of_node_is_type(np, "serial")) { of_node_put(parent); continue; diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 9d39e0eb03ff..2d47cc79e5b3 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -848,7 +848,23 @@ static long restore_tm_user_regs(struct pt_regs *regs, /* If TM bits are set to the reserved value, it's an invalid context */ if (MSR_TM_RESV(msr_hi)) return 1; - /* Pull in the MSR TM bits from the user context */ + + /* + * Disabling preemption, since it is unsafe to be preempted + * with MSR[TS] set without recheckpointing. + */ + preempt_disable(); + + /* + * CAUTION: + * After regs->MSR[TS] being updated, make sure that get_user(), + * put_user() or similar functions are *not* called. These + * functions can generate page faults which will cause the process + * to be de-scheduled with MSR[TS] set but without calling + * tm_recheckpoint(). This can cause a bug. + * + * Pull in the MSR TM bits from the user context + */ regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr_hi & MSR_TS_MASK); /* Now, recheckpoint. This loads up all of the checkpointed (older) * registers, including FP and V[S]Rs. After recheckpointing, the @@ -873,6 +889,8 @@ static long restore_tm_user_regs(struct pt_regs *regs, } #endif + preempt_enable(); + return 0; } #endif @@ -1140,11 +1158,11 @@ SYSCALL_DEFINE0(rt_sigreturn) { struct rt_sigframe __user *rt_sf; struct pt_regs *regs = current_pt_regs(); + int tm_restore = 0; #ifdef CONFIG_PPC_TRANSACTIONAL_MEM struct ucontext __user *uc_transact; unsigned long msr_hi; unsigned long tmp; - int tm_restore = 0; #endif /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; @@ -1192,11 +1210,19 @@ SYSCALL_DEFINE0(rt_sigreturn) goto bad; } } - if (!tm_restore) - /* Fall through, for non-TM restore */ + if (!tm_restore) { + /* + * Unset regs->msr because ucontext MSR TS is not + * set, and recheckpoint was not called. This avoid + * hitting a TM Bad thing at RFID + */ + regs->msr &= ~MSR_TS_MASK; + } + /* Fall through, for non-TM restore */ #endif - if (do_setcontext(&rt_sf->uc, regs, 1)) - goto bad; + if (!tm_restore) + if (do_setcontext(&rt_sf->uc, regs, 1)) + goto bad; /* * It's not clear whether or why it is desirable to save the diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index e53ad11be385..0935fe6c282a 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -467,20 +467,6 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, if (MSR_TM_RESV(msr)) return -EINVAL; - /* pull in MSR TS bits from user context */ - regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); - - /* - * Ensure that TM is enabled in regs->msr before we leave the signal - * handler. It could be the case that (a) user disabled the TM bit - * through the manipulation of the MSR bits in uc_mcontext or (b) the - * TM bit was disabled because a sufficient number of context switches - * happened whilst in the signal handler and load_tm overflowed, - * disabling the TM bit. In either case we can end up with an illegal - * TM state leading to a TM Bad Thing when we return to userspace. - */ - regs->msr |= MSR_TM; - /* pull in MSR LE from user context */ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); @@ -572,6 +558,34 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, tm_enable(); /* Make sure the transaction is marked as failed */ tsk->thread.tm_texasr |= TEXASR_FS; + + /* + * Disabling preemption, since it is unsafe to be preempted + * with MSR[TS] set without recheckpointing. + */ + preempt_disable(); + + /* pull in MSR TS bits from user context */ + regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + + /* + * Ensure that TM is enabled in regs->msr before we leave the signal + * handler. It could be the case that (a) user disabled the TM bit + * through the manipulation of the MSR bits in uc_mcontext or (b) the + * TM bit was disabled because a sufficient number of context switches + * happened whilst in the signal handler and load_tm overflowed, + * disabling the TM bit. In either case we can end up with an illegal + * TM state leading to a TM Bad Thing when we return to userspace. + * + * CAUTION: + * After regs->MSR[TS] being updated, make sure that get_user(), + * put_user() or similar functions are *not* called. These + * functions can generate page faults which will cause the process + * to be de-scheduled with MSR[TS] set but without calling + * tm_recheckpoint(). This can cause a bug. + */ + regs->msr |= MSR_TM; + /* This loads the checkpointed FP/VEC state, if used */ tm_recheckpoint(&tsk->thread); @@ -585,6 +599,8 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, regs->msr |= MSR_VEC; } + preempt_enable(); + return err; } #endif @@ -741,11 +757,23 @@ SYSCALL_DEFINE0(rt_sigreturn) &uc_transact->uc_mcontext)) goto badframe; } - else - /* Fall through, for non-TM restore */ #endif - if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext)) - goto badframe; + /* Fall through, for non-TM restore */ + if (!MSR_TM_ACTIVE(msr)) { + /* + * Unset MSR[TS] on the thread regs since MSR from user + * context does not have MSR active, and recheckpoint was + * not called since restore_tm_sigcontexts() was not called + * also. + * + * If not unsetting it, the code can RFID to userspace with + * MSR[TS] set, but without CPU in the proper state, + * causing a TM bad thing. + */ + current->thread.regs->msr &= ~MSR_TS_MASK; + if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext)) + goto badframe; + } if (restore_altstack(&uc->uc_stack)) goto badframe; diff --git a/arch/powerpc/kernel/syscalls/Makefile b/arch/powerpc/kernel/syscalls/Makefile new file mode 100644 index 000000000000..27b48954808d --- /dev/null +++ b/arch/powerpc/kernel/syscalls/Makefile @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: GPL-2.0 +kapi := arch/$(SRCARCH)/include/generated/asm +uapi := arch/$(SRCARCH)/include/generated/uapi/asm + +_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ + $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') + +syscall := $(srctree)/$(src)/syscall.tbl +syshdr := $(srctree)/$(src)/syscallhdr.sh +systbl := $(srctree)/$(src)/syscalltbl.sh + +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ + '$(syshdr_abis_$(basetarget))' \ + '$(syshdr_pfx_$(basetarget))' \ + '$(syshdr_offset_$(basetarget))' + +quiet_cmd_systbl = SYSTBL $@ + cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \ + '$(systbl_abis_$(basetarget))' \ + '$(systbl_abi_$(basetarget))' \ + '$(systbl_offset_$(basetarget))' + +syshdr_abis_unistd_32 := common,nospu,32 +$(uapi)/unistd_32.h: $(syscall) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abis_unistd_64 := common,nospu,64 +$(uapi)/unistd_64.h: $(syscall) $(syshdr) + $(call if_changed,syshdr) + +systbl_abis_syscall_table_32 := common,nospu,32 +systbl_abi_syscall_table_32 := 32 +$(kapi)/syscall_table_32.h: $(syscall) $(systbl) + $(call if_changed,systbl) + +systbl_abis_syscall_table_64 := common,nospu,64 +systbl_abi_syscall_table_64 := 64 +$(kapi)/syscall_table_64.h: $(syscall) $(systbl) + $(call if_changed,systbl) + +systbl_abis_syscall_table_c32 := common,nospu,32 +systbl_abi_syscall_table_c32 := c32 +$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) + $(call if_changed,systbl) + +systbl_abis_syscall_table_spu := common,spu +systbl_abi_syscall_table_spu := spu +$(kapi)/syscall_table_spu.h: $(syscall) $(systbl) + $(call if_changed,systbl) + +uapisyshdr-y += unistd_32.h unistd_64.h +kapisyshdr-y += syscall_table_32.h \ + syscall_table_64.h \ + syscall_table_c32.h \ + syscall_table_spu.h + +targets += $(uapisyshdr-y) $(kapisyshdr-y) + +PHONY += all +all: $(addprefix $(uapi)/,$(uapisyshdr-y)) +all: $(addprefix $(kapi)/,$(kapisyshdr-y)) + @: diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl new file mode 100644 index 000000000000..db3bbb8744af --- /dev/null +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -0,0 +1,427 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# +# system call numbers and entry vectors for powerpc +# +# The format is: +# <number> <abi> <name> <entry point> <compat entry point> +# +# The <abi> can be common, spu, nospu, 64, or 32 for this file. +# +0 nospu restart_syscall sys_restart_syscall +1 nospu exit sys_exit +2 nospu fork ppc_fork +3 common read sys_read +4 common write sys_write +5 common open sys_open compat_sys_open +6 common close sys_close +7 common waitpid sys_waitpid +8 common creat sys_creat +9 common link sys_link +10 common unlink sys_unlink +11 nospu execve sys_execve compat_sys_execve +12 common chdir sys_chdir +13 common time sys_time compat_sys_time +14 common mknod sys_mknod +15 common chmod sys_chmod +16 common lchown sys_lchown +17 common break sys_ni_syscall +18 32 oldstat sys_stat sys_ni_syscall +18 64 oldstat sys_ni_syscall +18 spu oldstat sys_ni_syscall +19 common lseek sys_lseek compat_sys_lseek +20 common getpid sys_getpid +21 nospu mount sys_mount compat_sys_mount +22 32 umount sys_oldumount +22 64 umount sys_ni_syscall +22 spu umount sys_ni_syscall +23 common setuid sys_setuid +24 common getuid sys_getuid +25 common stime sys_stime compat_sys_stime +26 nospu ptrace sys_ptrace compat_sys_ptrace +27 common alarm sys_alarm +28 32 oldfstat sys_fstat sys_ni_syscall +28 64 oldfstat sys_ni_syscall +28 spu oldfstat sys_ni_syscall +29 nospu pause sys_pause +30 nospu utime sys_utime compat_sys_utime +31 common stty sys_ni_syscall +32 common gtty sys_ni_syscall +33 common access sys_access +34 common nice sys_nice +35 common ftime sys_ni_syscall +36 common sync sys_sync +37 common kill sys_kill +38 common rename sys_rename +39 common mkdir sys_mkdir +40 common rmdir sys_rmdir +41 common dup sys_dup +42 common pipe sys_pipe +43 common times sys_times compat_sys_times +44 common prof sys_ni_syscall +45 common brk sys_brk +46 common setgid sys_setgid +47 common getgid sys_getgid +48 nospu signal sys_signal +49 common geteuid sys_geteuid +50 common getegid sys_getegid +51 nospu acct sys_acct +52 nospu umount2 sys_umount +53 common lock sys_ni_syscall +54 common ioctl sys_ioctl compat_sys_ioctl +55 common fcntl sys_fcntl compat_sys_fcntl +56 common mpx sys_ni_syscall +57 common setpgid sys_setpgid +58 common ulimit sys_ni_syscall +59 32 oldolduname sys_olduname +59 64 oldolduname sys_ni_syscall +59 spu oldolduname sys_ni_syscall +60 common umask sys_umask +61 common chroot sys_chroot +62 nospu ustat sys_ustat compat_sys_ustat +63 common dup2 sys_dup2 +64 common getppid sys_getppid +65 common getpgrp sys_getpgrp +66 common setsid sys_setsid +67 32 sigaction sys_sigaction compat_sys_sigaction +67 64 sigaction sys_ni_syscall +67 spu sigaction sys_ni_syscall +68 common sgetmask sys_sgetmask +69 common ssetmask sys_ssetmask +70 common setreuid sys_setreuid +71 common setregid sys_setregid +72 32 sigsuspend sys_sigsuspend +72 64 sigsuspend sys_ni_syscall +72 spu sigsuspend sys_ni_syscall +73 32 sigpending sys_sigpending compat_sys_sigpending +73 64 sigpending sys_ni_syscall +73 spu sigpending sys_ni_syscall +74 common sethostname sys_sethostname +75 common setrlimit sys_setrlimit compat_sys_setrlimit +76 32 getrlimit sys_old_getrlimit compat_sys_old_getrlimit +76 64 getrlimit sys_ni_syscall +76 spu getrlimit sys_ni_syscall +77 common getrusage sys_getrusage compat_sys_getrusage +78 common gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 common settimeofday sys_settimeofday compat_sys_settimeofday +80 common getgroups sys_getgroups +81 common setgroups sys_setgroups +82 32 select ppc_select sys_ni_syscall +82 64 select sys_ni_syscall +82 spu select sys_ni_syscall +83 common symlink sys_symlink +84 32 oldlstat sys_lstat sys_ni_syscall +84 64 oldlstat sys_ni_syscall +84 spu oldlstat sys_ni_syscall +85 common readlink sys_readlink +86 nospu uselib sys_uselib +87 nospu swapon sys_swapon +88 nospu reboot sys_reboot +89 32 readdir sys_old_readdir compat_sys_old_readdir +89 64 readdir sys_ni_syscall +89 spu readdir sys_ni_syscall +90 common mmap sys_mmap +91 common munmap sys_munmap +92 common truncate sys_truncate compat_sys_truncate +93 common ftruncate sys_ftruncate compat_sys_ftruncate +94 common fchmod sys_fchmod +95 common fchown sys_fchown +96 common getpriority sys_getpriority +97 common setpriority sys_setpriority +98 common profil sys_ni_syscall +99 nospu statfs sys_statfs compat_sys_statfs +100 nospu fstatfs sys_fstatfs compat_sys_fstatfs +101 common ioperm sys_ni_syscall +102 common socketcall sys_socketcall compat_sys_socketcall +103 common syslog sys_syslog +104 common setitimer sys_setitimer compat_sys_setitimer +105 common getitimer sys_getitimer compat_sys_getitimer +106 common stat sys_newstat compat_sys_newstat +107 common lstat sys_newlstat compat_sys_newlstat +108 common fstat sys_newfstat compat_sys_newfstat +109 32 olduname sys_uname +109 64 olduname sys_ni_syscall +109 spu olduname sys_ni_syscall +110 common iopl sys_ni_syscall +111 common vhangup sys_vhangup +112 common idle sys_ni_syscall +113 common vm86 sys_ni_syscall +114 common wait4 sys_wait4 compat_sys_wait4 +115 nospu swapoff sys_swapoff +116 common sysinfo sys_sysinfo compat_sys_sysinfo +117 nospu ipc sys_ipc compat_sys_ipc +118 common fsync sys_fsync +119 32 sigreturn sys_sigreturn compat_sys_sigreturn +119 64 sigreturn sys_ni_syscall +119 spu sigreturn sys_ni_syscall +120 nospu clone ppc_clone +121 common setdomainname sys_setdomainname +122 common uname sys_newuname +123 common modify_ldt sys_ni_syscall +124 common adjtimex sys_adjtimex compat_sys_adjtimex +125 common mprotect sys_mprotect +126 32 sigprocmask sys_sigprocmask compat_sys_sigprocmask +126 64 sigprocmask sys_ni_syscall +126 spu sigprocmask sys_ni_syscall +127 common create_module sys_ni_syscall +128 nospu init_module sys_init_module +129 nospu delete_module sys_delete_module +130 common get_kernel_syms sys_ni_syscall +131 nospu quotactl sys_quotactl +132 common getpgid sys_getpgid +133 common fchdir sys_fchdir +134 common bdflush sys_bdflush +135 common sysfs sys_sysfs +136 32 personality sys_personality ppc64_personality +136 64 personality ppc64_personality +136 spu personality ppc64_personality +137 common afs_syscall sys_ni_syscall +138 common setfsuid sys_setfsuid +139 common setfsgid sys_setfsgid +140 common _llseek sys_llseek +141 common getdents sys_getdents compat_sys_getdents +142 common _newselect sys_select compat_sys_select +143 common flock sys_flock +144 common msync sys_msync +145 common readv sys_readv compat_sys_readv +146 common writev sys_writev compat_sys_writev +147 common getsid sys_getsid +148 common fdatasync sys_fdatasync +149 nospu _sysctl sys_sysctl compat_sys_sysctl +150 common mlock sys_mlock +151 common munlock sys_munlock +152 common mlockall sys_mlockall +153 common munlockall sys_munlockall +154 common sched_setparam sys_sched_setparam +155 common sched_getparam sys_sched_getparam +156 common sched_setscheduler sys_sched_setscheduler +157 common sched_getscheduler sys_sched_getscheduler +158 common sched_yield sys_sched_yield +159 common sched_get_priority_max sys_sched_get_priority_max +160 common sched_get_priority_min sys_sched_get_priority_min +161 common sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval +162 common nanosleep sys_nanosleep compat_sys_nanosleep +163 common mremap sys_mremap +164 common setresuid sys_setresuid +165 common getresuid sys_getresuid +166 common query_module sys_ni_syscall +167 common poll sys_poll +168 common nfsservctl sys_ni_syscall +169 common setresgid sys_setresgid +170 common getresgid sys_getresgid +171 common prctl sys_prctl +172 nospu rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn +173 nospu rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction +174 nospu rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask +175 nospu rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending +176 nospu rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +177 nospu rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo +178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend +179 common pread64 sys_pread64 compat_sys_pread64 +180 common pwrite64 sys_pwrite64 compat_sys_pwrite64 +181 common chown sys_chown +182 common getcwd sys_getcwd +183 common capget sys_capget +184 common capset sys_capset +185 nospu sigaltstack sys_sigaltstack compat_sys_sigaltstack +186 32 sendfile sys_sendfile compat_sys_sendfile +186 64 sendfile sys_sendfile64 +186 spu sendfile sys_sendfile64 +187 common getpmsg sys_ni_syscall +188 common putpmsg sys_ni_syscall +189 nospu vfork ppc_vfork +190 common ugetrlimit sys_getrlimit compat_sys_getrlimit +191 common readahead sys_readahead compat_sys_readahead +192 32 mmap2 sys_mmap2 compat_sys_mmap2 +193 32 truncate64 sys_truncate64 compat_sys_truncate64 +194 32 ftruncate64 sys_ftruncate64 compat_sys_ftruncate64 +195 32 stat64 sys_stat64 +196 32 lstat64 sys_lstat64 +197 32 fstat64 sys_fstat64 +198 nospu pciconfig_read sys_pciconfig_read +199 nospu pciconfig_write sys_pciconfig_write +200 nospu pciconfig_iobase sys_pciconfig_iobase +201 common multiplexer sys_ni_syscall +202 common getdents64 sys_getdents64 +203 common pivot_root sys_pivot_root +204 32 fcntl64 sys_fcntl64 compat_sys_fcntl64 +205 common madvise sys_madvise +206 common mincore sys_mincore +207 common gettid sys_gettid +208 common tkill sys_tkill +209 common setxattr sys_setxattr +210 common lsetxattr sys_lsetxattr +211 common fsetxattr sys_fsetxattr +212 common getxattr sys_getxattr +213 common lgetxattr sys_lgetxattr +214 common fgetxattr sys_fgetxattr +215 common listxattr sys_listxattr +216 common llistxattr sys_llistxattr +217 common flistxattr sys_flistxattr +218 common removexattr sys_removexattr +219 common lremovexattr sys_lremovexattr +220 common fremovexattr sys_fremovexattr +221 common futex sys_futex compat_sys_futex +222 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +223 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +# 224 unused +225 common tuxcall sys_ni_syscall +226 32 sendfile64 sys_sendfile64 compat_sys_sendfile64 +227 common io_setup sys_io_setup compat_sys_io_setup +228 common io_destroy sys_io_destroy +229 common io_getevents sys_io_getevents compat_sys_io_getevents +230 common io_submit sys_io_submit compat_sys_io_submit +231 common io_cancel sys_io_cancel +232 nospu set_tid_address sys_set_tid_address +233 common fadvise64 sys_fadvise64 ppc32_fadvise64 +234 nospu exit_group sys_exit_group +235 nospu lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +236 common epoll_create sys_epoll_create +237 common epoll_ctl sys_epoll_ctl +238 common epoll_wait sys_epoll_wait +239 common remap_file_pages sys_remap_file_pages +240 common timer_create sys_timer_create compat_sys_timer_create +241 common timer_settime sys_timer_settime compat_sys_timer_settime +242 common timer_gettime sys_timer_gettime compat_sys_timer_gettime +243 common timer_getoverrun sys_timer_getoverrun +244 common timer_delete sys_timer_delete +245 common clock_settime sys_clock_settime compat_sys_clock_settime +246 common clock_gettime sys_clock_gettime compat_sys_clock_gettime +247 common clock_getres sys_clock_getres compat_sys_clock_getres +248 common clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +249 32 swapcontext ppc_swapcontext ppc32_swapcontext +249 64 swapcontext ppc64_swapcontext +249 spu swapcontext sys_ni_syscall +250 common tgkill sys_tgkill +251 common utimes sys_utimes compat_sys_utimes +252 common statfs64 sys_statfs64 compat_sys_statfs64 +253 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +254 32 fadvise64_64 ppc_fadvise64_64 +254 spu fadvise64_64 sys_ni_syscall +255 common rtas sys_rtas +256 32 sys_debug_setcontext sys_debug_setcontext sys_ni_syscall +256 64 sys_debug_setcontext sys_ni_syscall +256 spu sys_debug_setcontext sys_ni_syscall +# 257 reserved for vserver +258 nospu migrate_pages sys_migrate_pages compat_sys_migrate_pages +259 nospu mbind sys_mbind compat_sys_mbind +260 nospu get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +261 nospu set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +262 nospu mq_open sys_mq_open compat_sys_mq_open +263 nospu mq_unlink sys_mq_unlink +264 nospu mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend +265 nospu mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +266 nospu mq_notify sys_mq_notify compat_sys_mq_notify +267 nospu mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr +268 nospu kexec_load sys_kexec_load compat_sys_kexec_load +269 nospu add_key sys_add_key +270 nospu request_key sys_request_key +271 nospu keyctl sys_keyctl compat_sys_keyctl +272 nospu waitid sys_waitid compat_sys_waitid +273 nospu ioprio_set sys_ioprio_set +274 nospu ioprio_get sys_ioprio_get +275 nospu inotify_init sys_inotify_init +276 nospu inotify_add_watch sys_inotify_add_watch +277 nospu inotify_rm_watch sys_inotify_rm_watch +278 nospu spu_run sys_spu_run +279 nospu spu_create sys_spu_create +280 nospu pselect6 sys_pselect6 compat_sys_pselect6 +281 nospu ppoll sys_ppoll compat_sys_ppoll +282 common unshare sys_unshare +283 common splice sys_splice +284 common tee sys_tee +285 common vmsplice sys_vmsplice compat_sys_vmsplice +286 common openat sys_openat compat_sys_openat +287 common mkdirat sys_mkdirat +288 common mknodat sys_mknodat +289 common fchownat sys_fchownat +290 common futimesat sys_futimesat compat_sys_futimesat +291 32 fstatat64 sys_fstatat64 +291 64 newfstatat sys_newfstatat +291 spu newfstatat sys_newfstatat +292 common unlinkat sys_unlinkat +293 common renameat sys_renameat +294 common linkat sys_linkat +295 common symlinkat sys_symlinkat +296 common readlinkat sys_readlinkat +297 common fchmodat sys_fchmodat +298 common faccessat sys_faccessat +299 common get_robust_list sys_get_robust_list compat_sys_get_robust_list +300 common set_robust_list sys_set_robust_list compat_sys_set_robust_list +301 common move_pages sys_move_pages compat_sys_move_pages +302 common getcpu sys_getcpu +303 nospu epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait +304 common utimensat sys_utimensat compat_sys_utimensat +305 common signalfd sys_signalfd compat_sys_signalfd +306 common timerfd_create sys_timerfd_create +307 common eventfd sys_eventfd +308 common sync_file_range2 sys_sync_file_range2 compat_sys_sync_file_range2 +309 nospu fallocate sys_fallocate compat_sys_fallocate +310 nospu subpage_prot sys_subpage_prot +311 common timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime +312 common timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +313 common signalfd4 sys_signalfd4 compat_sys_signalfd4 +314 common eventfd2 sys_eventfd2 +315 common epoll_create1 sys_epoll_create1 +316 common dup3 sys_dup3 +317 common pipe2 sys_pipe2 +318 nospu inotify_init1 sys_inotify_init1 +319 common perf_event_open sys_perf_event_open +320 common preadv sys_preadv compat_sys_preadv +321 common pwritev sys_pwritev compat_sys_pwritev +322 nospu rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +323 nospu fanotify_init sys_fanotify_init +324 nospu fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark +325 common prlimit64 sys_prlimit64 +326 common socket sys_socket +327 common bind sys_bind +328 common connect sys_connect +329 common listen sys_listen +330 common accept sys_accept +331 common getsockname sys_getsockname +332 common getpeername sys_getpeername +333 common socketpair sys_socketpair +334 common send sys_send +335 common sendto sys_sendto +336 common recv sys_recv compat_sys_recv +337 common recvfrom sys_recvfrom compat_sys_recvfrom +338 common shutdown sys_shutdown +339 common setsockopt sys_setsockopt compat_sys_setsockopt +340 common getsockopt sys_getsockopt compat_sys_getsockopt +341 common sendmsg sys_sendmsg compat_sys_sendmsg +342 common recvmsg sys_recvmsg compat_sys_recvmsg +343 common recvmmsg sys_recvmmsg compat_sys_recvmmsg +344 common accept4 sys_accept4 +345 common name_to_handle_at sys_name_to_handle_at +346 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +347 common clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +348 common syncfs sys_syncfs +349 common sendmmsg sys_sendmmsg compat_sys_sendmmsg +350 common setns sys_setns +351 nospu process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv +352 nospu process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +353 nospu finit_module sys_finit_module +354 nospu kcmp sys_kcmp +355 common sched_setattr sys_sched_setattr +356 common sched_getattr sys_sched_getattr +357 common renameat2 sys_renameat2 +358 common seccomp sys_seccomp +359 common getrandom sys_getrandom +360 common memfd_create sys_memfd_create +361 common bpf sys_bpf +362 nospu execveat sys_execveat compat_sys_execveat +363 32 switch_endian sys_ni_syscall +363 64 switch_endian ppc_switch_endian +363 spu switch_endian sys_ni_syscall +364 common userfaultfd sys_userfaultfd +365 common membarrier sys_membarrier +378 nospu mlock2 sys_mlock2 +379 nospu copy_file_range sys_copy_file_range +380 common preadv2 sys_preadv2 compat_sys_preadv2 +381 common pwritev2 sys_pwritev2 compat_sys_pwritev2 +382 nospu kexec_file_load sys_kexec_file_load +383 nospu statx sys_statx +384 nospu pkey_alloc sys_pkey_alloc +385 nospu pkey_free sys_pkey_free +386 nospu pkey_mprotect sys_pkey_mprotect +387 nospu rseq sys_rseq +388 nospu io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents diff --git a/arch/powerpc/kernel/syscalls/syscallhdr.sh b/arch/powerpc/kernel/syscalls/syscallhdr.sh new file mode 100644 index 000000000000..c0a9a32937f1 --- /dev/null +++ b/arch/powerpc/kernel/syscalls/syscallhdr.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +in="$1" +out="$2" +my_abis=`echo "($3)" | tr ',' '|'` +prefix="$4" +offset="$5" + +fileguard=_UAPI_ASM_POWERPC_`basename "$out" | sed \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ + -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` +grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( + printf "#ifndef %s\n" "${fileguard}" + printf "#define %s\n" "${fileguard}" + printf "\n" + + nxt=0 + while read nr abi name entry compat ; do + if [ -z "$offset" ]; then + printf "#define __NR_%s%s\t%s\n" \ + "${prefix}" "${name}" "${nr}" + else + printf "#define __NR_%s%s\t(%s + %s)\n" \ + "${prefix}" "${name}" "${offset}" "${nr}" + fi + nxt=$((nr+1)) + done + + printf "\n" + printf "#ifdef __KERNEL__\n" + printf "#define __NR_syscalls\t%s\n" "${nxt}" + printf "#endif\n" + printf "\n" + printf "#endif /* %s */" "${fileguard}" + printf "\n" +) > "$out" diff --git a/arch/powerpc/kernel/syscalls/syscalltbl.sh b/arch/powerpc/kernel/syscalls/syscalltbl.sh new file mode 100644 index 000000000000..fd620490a542 --- /dev/null +++ b/arch/powerpc/kernel/syscalls/syscalltbl.sh @@ -0,0 +1,36 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +in="$1" +out="$2" +my_abis=`echo "($3)" | tr ',' '|'` +my_abi="$4" +offset="$5" + +emit() { + t_nxt="$1" + t_nr="$2" + t_entry="$3" + + while [ $t_nxt -lt $t_nr ]; do + printf "__SYSCALL(%s,sys_ni_syscall, )\n" "${t_nxt}" + t_nxt=$((t_nxt+1)) + done + printf "__SYSCALL(%s,%s, )\n" "${t_nxt}" "${t_entry}" +} + +grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( + nxt=0 + if [ -z "$offset" ]; then + offset=0 + fi + + while read nr abi name entry compat ; do + if [ "$my_abi" = "c32" ] && [ ! -z "$compat" ]; then + emit $((nxt+offset)) $((nr+offset)) $compat + else + emit $((nxt+offset)) $((nr+offset)) $entry + fi + nxt=$((nr+1)) + done +) > "$out" diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 919a32746ede..23265a28740b 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -16,28 +16,6 @@ #include <asm/ppc_asm.h> -#ifdef CONFIG_PPC64 -#define SYSCALL(func) .8byte DOTSYM(sys_##func),DOTSYM(sys_##func) -#define COMPAT_SYS(func) .8byte DOTSYM(sys_##func),DOTSYM(compat_sys_##func) -#define PPC_SYS(func) .8byte DOTSYM(ppc_##func),DOTSYM(ppc_##func) -#define OLDSYS(func) .8byte DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall) -#define SYS32ONLY(func) .8byte DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func) -#define PPC64ONLY(func) .8byte DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall) -#define SYSX(f, f3264, f32) .8byte DOTSYM(f),DOTSYM(f3264) -#else -#define SYSCALL(func) .long sys_##func -#define COMPAT_SYS(func) .long sys_##func -#define PPC_SYS(func) .long ppc_##func -#define OLDSYS(func) .long sys_##func -#define SYS32ONLY(func) .long sys_##func -#define PPC64ONLY(func) .long sys_ni_syscall -#define SYSX(f, f3264, f32) .long f32 -#endif -#define SYSCALL_SPU(func) SYSCALL(func) -#define COMPAT_SYS_SPU(func) COMPAT_SYS(func) -#define COMPAT_SPU_NEW(func) COMPAT_SYS(func) -#define SYSX_SPU(f, f3264, f32) SYSX(f, f3264, f32) - .section .rodata,"a" #ifdef CONFIG_PPC64 @@ -46,5 +24,21 @@ .globl sys_call_table sys_call_table: +#ifdef CONFIG_PPC64 +#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry) +#include <asm/syscall_table_64.h> +#undef __SYSCALL +#else +#define __SYSCALL(nr, entry, nargs) .long entry +#include <asm/syscall_table_32.h> +#undef __SYSCALL +#endif -#include <asm/systbl.h> +#ifdef CONFIG_COMPAT +.globl compat_sys_call_table +compat_sys_call_table: +#define compat_sys_sigsuspend sys_sigsuspend +#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry) +#include <asm/syscall_table_c32.h> +#undef __SYSCALL +#endif diff --git a/arch/powerpc/kernel/systbl_chk.c b/arch/powerpc/kernel/systbl_chk.c deleted file mode 100644 index 4653258722ac..000000000000 --- a/arch/powerpc/kernel/systbl_chk.c +++ /dev/null @@ -1,60 +0,0 @@ -/* - * This file, when run through CPP produces a list of syscall numbers - * in the order of systbl.h. That way we can check for gaps and syscalls - * that are out of order. - * - * Unfortunately, we cannot check for the correct ordering of entries - * using SYSX(). - * - * Copyright © IBM Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include <asm/unistd.h> - -#define SYSCALL(func) __NR_##func -#define COMPAT_SYS(func) __NR_##func -#define PPC_SYS(func) __NR_##func -#ifdef CONFIG_PPC64 -#define OLDSYS(func) -1 -#define SYS32ONLY(func) -1 -#define PPC64ONLY(func) __NR_##func -#else -#define OLDSYS(func) __NR_old##func -#define SYS32ONLY(func) __NR_##func -#define PPC64ONLY(func) -1 -#endif -#define SYSX(f, f3264, f32) -1 - -#define SYSCALL_SPU(func) SYSCALL(func) -#define COMPAT_SYS_SPU(func) COMPAT_SYS(func) -#define COMPAT_SPU_NEW(func) COMPAT_SYS(_new##func) -#define SYSX_SPU(f, f3264, f32) SYSX(f, f3264, f32) - -/* Just insert a marker for ni_syscalls */ -#define __NR_ni_syscall -1 - -/* - * These are the known exceptions. - * Hopefully, there will be no more. - */ -#define __NR_llseek __NR__llseek -#undef __NR_umount -#define __NR_umount __NR_umount2 -#define __NR_old_getrlimit __NR_getrlimit -#define __NR_newstat __NR_stat -#define __NR_newlstat __NR_lstat -#define __NR_newfstat __NR_fstat -#define __NR_newuname __NR_uname -#define __NR_sysctl __NR__sysctl -#define __NR_olddebug_setcontext __NR_sys_debug_setcontext - -/* We call sys_ugetrlimit for syscall number __NR_getrlimit */ -#define getrlimit ugetrlimit - -START_TABLE -#include <asm/systbl.h> -END_TABLE NR_syscalls diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 9a86572db1ef..00af2c4febf4 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1434,7 +1434,8 @@ void program_check_exception(struct pt_regs *regs) goto bail; } else { printk(KERN_EMERG "Unexpected TM Bad Thing exception " - "at %lx (msr 0x%lx)\n", regs->nip, regs->msr); + "at %lx (msr 0x%lx) tm_scratch=%llx\n", + regs->nip, regs->msr, get_paca()->tm_scratch); die("Unrecoverable exception", regs, SIGABRT); } } diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 65b3bdb99f0b..7725a9714736 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -671,15 +671,18 @@ static void __init vdso_setup_syscall_map(void) { unsigned int i; extern unsigned long *sys_call_table; +#ifdef CONFIG_PPC64 + extern unsigned long *compat_sys_call_table; +#endif extern unsigned long sys_ni_syscall; for (i = 0; i < NR_syscalls; i++) { #ifdef CONFIG_PPC64 - if (sys_call_table[i*2] != sys_ni_syscall) + if (sys_call_table[i] != sys_ni_syscall) vdso_data->syscall_map_64[i >> 5] |= 0x80000000UL >> (i & 0x1f); - if (sys_call_table[i*2+1] != sys_ni_syscall) + if (compat_sys_call_table[i] != sys_ni_syscall) vdso_data->syscall_map_32[i >> 5] |= 0x80000000UL >> (i & 0x1f); #else /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 62a8d03ba7e9..532ab79734c7 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -397,12 +397,13 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, return H_SUCCESS; } -static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) +static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, + unsigned long entry) { unsigned long hpa = 0; enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg(tbl, entry, &hpa, &dir); + iommu_tce_xchg(mm, tbl, entry, &hpa, &dir); } static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, @@ -433,7 +434,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm, unsigned long hpa = 0; long ret; - if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) + if (WARN_ON_ONCE(iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir))) return H_TOO_HARD; if (dir == DMA_NONE) @@ -441,7 +442,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm, ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); if (ret != H_SUCCESS) - iommu_tce_xchg(tbl, entry, &hpa, &dir); + iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir); return ret; } @@ -487,7 +488,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (mm_iommu_mapped_inc(mem)) return H_TOO_HARD; - ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); + ret = iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir); if (WARN_ON_ONCE(ret)) { mm_iommu_mapped_dec(mem); return H_TOO_HARD; @@ -566,7 +567,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, entry, ua, dir); if (ret != H_SUCCESS) { - kvmppc_clear_tce(stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); goto unlock_exit; } } @@ -655,7 +656,8 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, iommu_tce_direction(tce)); if (ret != H_SUCCESS) { - kvmppc_clear_tce(stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, + entry); goto unlock_exit; } } @@ -704,7 +706,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, return ret; WARN_ON_ONCE(1); - kvmppc_clear_tce(stit->tbl, entry); + kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); } } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index c866d9a710fe..056b750071bb 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -226,7 +226,9 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, static bool bad_kernel_fault(bool is_exec, unsigned long error_code, unsigned long address) { - if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT))) { + /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */ + if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT | + DSISR_PROTFAULT))) { printk_ratelimited(KERN_CRIT "kernel tried to execute" " exec-protected page (%lx) -" "exploit attempt? (uid: %d)\n", diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 56c2234cc6ae..a712a650a8b6 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -36,6 +36,8 @@ struct mm_iommu_table_group_mem_t { u64 ua; /* userspace address */ u64 entries; /* number of entries in hpas[] */ u64 *hpas; /* vmalloc'ed */ +#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1) + u64 dev_hpa; /* Device memory base address */ }; static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, @@ -126,7 +128,8 @@ static int mm_iommu_move_page_from_cma(struct page *page) return 0; } -long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, +static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, struct mm_iommu_table_group_mem_t **pmem) { struct mm_iommu_table_group_mem_t *mem; @@ -140,12 +143,6 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { - if ((mem->ua == ua) && (mem->entries == entries)) { - ++mem->used; - *pmem = mem; - goto unlock_exit; - } - /* Overlap? */ if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && (ua < (mem->ua + @@ -156,11 +153,13 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, } - ret = mm_iommu_adjust_locked_vm(mm, entries, true); - if (ret) - goto unlock_exit; + if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { + ret = mm_iommu_adjust_locked_vm(mm, entries, true); + if (ret) + goto unlock_exit; - locked_entries = entries; + locked_entries = entries; + } mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) { @@ -168,6 +167,13 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, goto unlock_exit; } + if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) { + mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT)); + mem->dev_hpa = dev_hpa; + goto good_exit; + } + mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA; + /* * For a starting point for a maximum page size calculation * we use @ua and @entries natural alignment to allow IOMMU pages @@ -236,6 +242,7 @@ populate: mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; } +good_exit: atomic64_set(&mem->mapped, 1); mem->used = 1; mem->ua = ua; @@ -252,13 +259,31 @@ unlock_exit: return ret; } -EXPORT_SYMBOL_GPL(mm_iommu_get); + +long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, + struct mm_iommu_table_group_mem_t **pmem) +{ + return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA, + pmem); +} +EXPORT_SYMBOL_GPL(mm_iommu_new); + +long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem) +{ + return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem); +} +EXPORT_SYMBOL_GPL(mm_iommu_newdev); static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) { long i; struct page *page = NULL; + if (!mem->hpas) + return; + for (i = 0; i < mem->entries; ++i) { if (!mem->hpas[i]) continue; @@ -300,6 +325,7 @@ static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) { long ret = 0; + unsigned long entries, dev_hpa; mutex_lock(&mem_list_mutex); @@ -321,9 +347,12 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) } /* @mapped became 0 so now mappings are disabled, release the region */ + entries = mem->entries; + dev_hpa = mem->dev_hpa; mm_iommu_release(mem); - mm_iommu_adjust_locked_vm(mm, mem->entries, false); + if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + mm_iommu_adjust_locked_vm(mm, entries, false); unlock_exit: mutex_unlock(&mem_list_mutex); @@ -368,27 +397,32 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, return ret; } -struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, +struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries) { struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + mutex_lock(&mem_list_mutex); + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { if ((mem->ua == ua) && (mem->entries == entries)) { ret = mem; + ++mem->used; break; } } + mutex_unlock(&mem_list_mutex); + return ret; } -EXPORT_SYMBOL_GPL(mm_iommu_find); +EXPORT_SYMBOL_GPL(mm_iommu_get); long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa) { const long entry = (ua - mem->ua) >> PAGE_SHIFT; - u64 *va = &mem->hpas[entry]; + u64 *va; if (entry >= mem->entries) return -EFAULT; @@ -396,6 +430,12 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, if (pageshift > mem->pageshift) return -EFAULT; + if (!mem->hpas) { + *hpa = mem->dev_hpa + (ua - mem->ua); + return 0; + } + + va = &mem->hpas[entry]; *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); return 0; @@ -406,7 +446,6 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa) { const long entry = (ua - mem->ua) >> PAGE_SHIFT; - void *va = &mem->hpas[entry]; unsigned long *pa; if (entry >= mem->entries) @@ -415,7 +454,12 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, if (pageshift > mem->pageshift) return -EFAULT; - pa = (void *) vmalloc_to_phys(va); + if (!mem->hpas) { + *hpa = mem->dev_hpa + (ua - mem->ua); + return 0; + } + + pa = (void *) vmalloc_to_phys(&mem->hpas[entry]); if (!pa) return -EFAULT; @@ -435,6 +479,9 @@ extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) if (!mem) return; + if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) + return; + entry = (ua - mem->ua) >> PAGE_SHIFT; va = &mem->hpas[entry]; @@ -445,6 +492,33 @@ extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; } +bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, + unsigned int pageshift, unsigned long *size) +{ + struct mm_iommu_table_group_mem_t *mem; + unsigned long end; + + list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) { + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + continue; + + end = mem->dev_hpa + (mem->entries << PAGE_SHIFT); + if ((mem->dev_hpa <= hpa) && (hpa < end)) { + /* + * Since the IOMMU page size might be bigger than + * PAGE_SIZE, the amount of preregistered memory + * starting from @hpa might be smaller than 1<<pageshift + * and the caller needs to distinguish this situation. + */ + *size = min(1UL << pageshift, end - hpa); + return true; + } + } + + return false; +} +EXPORT_SYMBOL_GPL(mm_iommu_is_devmem); + long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) { if (atomic64_inc_not_zero(&mem->mapped)) diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c index 04b60a8f6e69..587807763737 100644 --- a/arch/powerpc/mm/pkeys.c +++ b/arch/powerpc/mm/pkeys.c @@ -415,3 +415,13 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, return pkey_access_permitted(vma_pkey(vma), write, execute); } + +void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) +{ + if (static_branch_likely(&pkey_disabled)) + return; + + /* Duplicate the oldmm pkey state in mm: */ + mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm); + mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; +} diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 383cc3640ac6..b0723002a396 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -10,6 +10,7 @@ */ #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/clock.h> #include <linux/perf_event.h> #include <linux/percpu.h> #include <linux/hardirq.h> @@ -2166,7 +2167,7 @@ static bool pmc_overflow(unsigned long val) /* * Performance monitor interrupt stuff */ -static void perf_event_interrupt(struct pt_regs *regs) +static void __perf_event_interrupt(struct pt_regs *regs) { int i, j; struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); @@ -2250,6 +2251,14 @@ static void perf_event_interrupt(struct pt_regs *regs) irq_exit(); } +static void perf_event_interrupt(struct pt_regs *regs) +{ + u64 start_clock = sched_clock(); + + __perf_event_interrupt(regs); + perf_sample_event_took(sched_clock() - start_clock); +} + static int power_pmu_prepare_cpu(unsigned int cpu) { struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index 7e4f8ca19ce8..f467247fd1c4 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -179,9 +179,9 @@ static int pika_setup_leds(void) } for_each_child_of_node(np, child) - if (strcmp(child->name, "green") == 0) + if (of_node_name_eq(child, "green")) green_led = of_get_gpio(child, 0); - else if (strcmp(child->name, "red") == 0) + else if (of_node_name_eq(child, "red")) red_led = of_get_gpio(child, 0); of_node_put(np); diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c index f5bbd4563342..f2610a02844a 100644 --- a/arch/powerpc/platforms/4xx/ocm.c +++ b/arch/powerpc/platforms/4xx/ocm.c @@ -223,8 +223,6 @@ static void __init ocm_init_node(int count, struct device_node *node) INIT_LIST_HEAD(&ocm->c.list); ocm->ready = 1; - - return; } static int ocm_debugfs_show(struct seq_file *m, void *v) @@ -242,9 +240,7 @@ static int ocm_debugfs_show(struct seq_file *m, void *v) seq_printf(m, "PhysAddr : 0x%llx\n", ocm->phys); seq_printf(m, "MemTotal : %d Bytes\n", ocm->memtotal); seq_printf(m, "MemTotal(NC) : %d Bytes\n", ocm->nc.memtotal); - seq_printf(m, "MemTotal(C) : %d Bytes\n", ocm->c.memtotal); - - seq_printf(m, "\n"); + seq_printf(m, "MemTotal(C) : %d Bytes\n\n", ocm->c.memtotal); seq_printf(m, "NC.PhysAddr : 0x%llx\n", ocm->nc.phys); seq_printf(m, "NC.VirtAddr : 0x%p\n", ocm->nc.virt); @@ -256,9 +252,7 @@ static int ocm_debugfs_show(struct seq_file *m, void *v) blk->size, blk->owner); } - seq_printf(m, "\n"); - - seq_printf(m, "C.PhysAddr : 0x%llx\n", ocm->c.phys); + seq_printf(m, "\nC.PhysAddr : 0x%llx\n", ocm->c.phys); seq_printf(m, "C.VirtAddr : 0x%p\n", ocm->c.virt); seq_printf(m, "C.MemTotal : %d Bytes\n", ocm->c.memtotal); seq_printf(m, "C.MemFree : %d Bytes\n", ocm->c.memfree); @@ -268,7 +262,7 @@ static int ocm_debugfs_show(struct seq_file *m, void *v) blk->size, blk->owner); } - seq_printf(m, "\n"); + seq_putc(m, '\n'); } return 0; @@ -338,7 +332,6 @@ void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align, ocm_blk = kzalloc(sizeof(*ocm_blk), GFP_KERNEL); if (!ocm_blk) { - printk(KERN_ERR "PPC4XX OCM: could not allocate ocm block"); rh_free(ocm_reg->rh, offset); break; } @@ -392,10 +385,8 @@ static int __init ppc4xx_ocm_init(void) return 0; ocm_nodes = kzalloc((count * sizeof(struct ocm_info)), GFP_KERNEL); - if (!ocm_nodes) { - printk(KERN_ERR "PPC4XX OCM: failed to allocate OCM nodes!\n"); + if (!ocm_nodes) return -ENOMEM; - } ocm_count = count; count = 0; diff --git a/arch/powerpc/platforms/4xx/pci.c b/arch/powerpc/platforms/4xx/pci.c index d0825f5c5b1b..e6e2adcc7b64 100644 --- a/arch/powerpc/platforms/4xx/pci.c +++ b/arch/powerpc/platforms/4xx/pci.c @@ -1399,7 +1399,6 @@ static void __init ppc_476fpe_pciex_check_link(struct ppc4xx_pciex_port *port) printk(KERN_WARNING "PCIE%d: Link up failed\n", port->index); iounmap(mbase); - return; } static struct ppc4xx_pciex_hwops ppc_476fpe_pcie_hwops __initdata = diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c index 1ecbf176d35a..61538869e88a 100644 --- a/arch/powerpc/platforms/52xx/efika.c +++ b/arch/powerpc/platforms/52xx/efika.c @@ -82,11 +82,9 @@ static void __init efika_pcisetup(void) return; } - for (pcictrl = NULL;;) { - pcictrl = of_get_next_child(root, pcictrl); - if ((pcictrl == NULL) || (strcmp(pcictrl->name, "pci") == 0)) + for_each_child_of_node(root, pcictrl) + if (of_node_name_eq(pcictrl, "pci")) break; - } of_node_put(root); diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index 902d527d043f..e2e1371a71e2 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -131,7 +131,7 @@ static int cell_setup_phb(struct pci_controller *phb) np = phb->dn; model = of_get_property(np, "model", NULL); - if (model == NULL || strcmp(np->name, "pci")) + if (model == NULL || !of_node_name_eq(np, "pci")) return 0; /* Setup workarounds for spider */ diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c index 8ae86200ef6c..125f2a5f02de 100644 --- a/arch/powerpc/platforms/cell/spu_callbacks.c +++ b/arch/powerpc/platforms/cell/spu_callbacks.c @@ -34,20 +34,9 @@ */ static void *spu_syscall_table[] = { -#define SYSCALL(func) sys_ni_syscall, -#define COMPAT_SYS(func) sys_ni_syscall, -#define PPC_SYS(func) sys_ni_syscall, -#define OLDSYS(func) sys_ni_syscall, -#define SYS32ONLY(func) sys_ni_syscall, -#define PPC64ONLY(func) sys_ni_syscall, -#define SYSX(f, f3264, f32) sys_ni_syscall, - -#define SYSCALL_SPU(func) sys_##func, -#define COMPAT_SYS_SPU(func) sys_##func, -#define COMPAT_SPU_NEW(func) sys_##func, -#define SYSX_SPU(f, f3264, f32) f, - -#include <asm/systbl.h> +#define __SYSCALL(nr, entry, nargs) entry, +#include <asm/syscall_table_spu.h> +#undef __SYSCALL }; long spu_sys_callback(struct spu_syscall_block *s) diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c index f7e36373f6e0..bed935c51ec2 100644 --- a/arch/powerpc/platforms/cell/spu_manage.c +++ b/arch/powerpc/platforms/cell/spu_manage.c @@ -458,7 +458,6 @@ static void init_affinity_node(int cbe) struct device_node *vic_dn, *last_spu_dn; phandle avoid_ph; const phandle *vic_handles; - const char *name; int lenp, i, added; last_spu = list_first_entry(&cbe_spu_info[cbe].spus, struct spu, @@ -480,12 +479,7 @@ static void init_affinity_node(int cbe) if (!vic_dn) continue; - /* a neighbour might be spe, mic-tm, or bif0 */ - name = of_get_property(vic_dn, "name", NULL); - if (!name) - continue; - - if (strcmp(name, "spe") == 0) { + if (of_node_name_eq(vic_dn, "spe") ) { spu = devnode_spu(cbe, vic_dn); avoid_ph = last_spu_dn->phandle; } else { @@ -498,7 +492,7 @@ static void init_affinity_node(int cbe) spu = neighbour_spu(cbe, vic_dn, last_spu_dn); if (!spu) continue; - if (!strcmp(name, "mic-tm")) { + if (of_node_name_eq(vic_dn, "mic-tm")) { last_spu->has_mem_affinity = 1; spu->has_mem_affinity = 1; } diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index 3c21f046e20c..e66644e0fb40 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -287,10 +287,7 @@ static __init void chrp_init(void) * or /pci@80000000/isa@C/serial@i2F8 * The optional graphics card has also type 'serial' in VGA mode. */ - property = of_get_property(node, "name", NULL); - if (!property) - goto out_put; - if (!strcmp(property, "failsafe") || !strcmp(property, "serial")) + if (of_node_name_eq(node, "failsafe") || of_node_name_eq(node, "serial")) add_preferred_console("ttyS", 0, NULL); out_put: of_node_put(node); diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index bdb3316337b6..c3e5ee8b5175 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -173,9 +173,9 @@ static long ohare_htw_scc_enable(struct device_node *node, long param, macio = macio_find(node, 0); if (!macio) return -ENODEV; - if (!strcmp(node->name, "ch-a")) + if (of_node_name_eq(node, "ch-a")) chan_mask = MACIO_FLAG_SCCA_ON; - else if (!strcmp(node->name, "ch-b")) + else if (of_node_name_eq(node, "ch-b")) chan_mask = MACIO_FLAG_SCCB_ON; else return -ENODEV; @@ -610,9 +610,9 @@ static long core99_scc_enable(struct device_node *node, long param, long value) macio = macio_find(node, 0); if (!macio) return -ENODEV; - if (!strcmp(node->name, "ch-a")) + if (of_node_name_eq(node, "ch-a")) chan_mask = MACIO_FLAG_SCCA_ON; - else if (!strcmp(node->name, "ch-b")) + else if (of_node_name_eq(node, "ch-b")) chan_mask = MACIO_FLAG_SCCB_ON; else return -ENODEV; @@ -1392,8 +1392,7 @@ static long g5_mpic_enable(struct device_node *node, long param, long value) if (parent == NULL) return 0; - is_u3 = strcmp(parent->name, "u3") == 0 || - strcmp(parent->name, "u4") == 0; + is_u3 = of_node_name_eq(parent, "u3") || of_node_name_eq(parent, "u4"); of_node_put(parent); if (!is_u3) return 0; diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index 84bace3b546a..4de058a20d2b 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -617,7 +617,7 @@ static void __init kw_i2c_probe(void) * but not for now */ child = of_get_next_child(np, NULL); - multibus = !child || strcmp(child->name, "i2c-bus"); + multibus = !of_node_name_eq(child, "i2c-bus"); of_node_put(child); /* For a multibus setup, we get the bus count based on the @@ -1205,7 +1205,7 @@ static void pmac_i2c_devscan(void (*callback)(struct device_node *dev, if (bus != pmac_i2c_find_bus(np)) continue; for (p = whitelist; p->name != NULL; p++) { - if (strcmp(np->name, p->name)) + if (!of_node_name_eq(np, p->name)) continue; if (p->compatible && !of_device_is_compatible(np, p->compatible)) diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c index 04527d13d5a4..3d7420503c37 100644 --- a/arch/powerpc/platforms/powermac/pci.c +++ b/arch/powerpc/platforms/powermac/pci.c @@ -501,9 +501,7 @@ static void __init init_p2pbridge(void) /* XXX it would be better here to identify the specific PCI-PCI bridge chip we have. */ p2pbridge = of_find_node_by_name(NULL, "pci-bridge"); - if (p2pbridge == NULL - || p2pbridge->parent == NULL - || strcmp(p2pbridge->parent->name, "pci") != 0) + if (p2pbridge == NULL || !of_node_name_eq(p2pbridge->parent, "pci")) goto done; if (pci_device_from_OF_node(p2pbridge, &bus, &devfn) < 0) { DBG("Can't find PCI infos for PCI<->PCI bridge\n"); @@ -828,14 +826,14 @@ static int __init pmac_add_bridge(struct device_node *dev) if (of_device_is_compatible(dev, "uni-north")) { primary = setup_uninorth(hose, &rsrc); disp_name = "UniNorth"; - } else if (strcmp(dev->name, "pci") == 0) { + } else if (of_node_name_eq(dev, "pci")) { /* XXX assume this is a mpc106 (grackle) */ setup_grackle(hose); disp_name = "Grackle (MPC106)"; - } else if (strcmp(dev->name, "bandit") == 0) { + } else if (of_node_name_eq(dev, "bandit")) { setup_bandit(hose, &rsrc); disp_name = "Bandit"; - } else if (strcmp(dev->name, "chaos") == 0) { + } else if (of_node_name_eq(dev, "chaos")) { setup_chaos(hose, &rsrc); disp_name = "Chaos"; primary = 0; @@ -914,16 +912,14 @@ void __init pmac_pci_init(void) "of device tree\n"); return; } - for (np = NULL; (np = of_get_next_child(root, np)) != NULL;) { - if (np->name == NULL) - continue; - if (strcmp(np->name, "bandit") == 0 - || strcmp(np->name, "chaos") == 0 - || strcmp(np->name, "pci") == 0) { + for_each_child_of_node(root, np) { + if (of_node_name_eq(np, "bandit") + || of_node_name_eq(np, "chaos") + || of_node_name_eq(np, "pci")) { if (pmac_add_bridge(np) == 0) of_node_get(np); } - if (strcmp(np->name, "ht") == 0) { + if (of_node_name_eq(np, "ht")) { of_node_get(np); ht = np; } @@ -983,7 +979,7 @@ static bool pmac_pci_enable_device_hook(struct pci_dev *dev) /* Firewire & GMAC were disabled after PCI probe, the driver is * claiming them, we must re-enable them now. */ - if (uninorth_child && !strcmp(node->name, "firewire") && + if (uninorth_child && of_node_name_eq(node, "firewire") && (of_device_is_compatible(node, "pci106b,18") || of_device_is_compatible(node, "pci106b,30") || of_device_is_compatible(node, "pci11c1,5811"))) { @@ -991,7 +987,7 @@ static bool pmac_pci_enable_device_hook(struct pci_dev *dev) pmac_call_feature(PMAC_FTR_1394_ENABLE, node, 0, 1); updatecfg = 1; } - if (uninorth_child && !strcmp(node->name, "ethernet") && + if (uninorth_child && of_node_name_eq(node, "ethernet") && of_device_is_compatible(node, "gmac")) { pmac_call_feature(PMAC_FTR_GMAC_ENABLE, node, 0, 1); updatecfg = 1; @@ -1262,4 +1258,3 @@ struct pci_controller_ops pmac_pci_controller_ops = { .enable_device_hook = pmac_pci_enable_device_hook, #endif }; - diff --git a/arch/powerpc/platforms/powermac/pfunc_base.c b/arch/powerpc/platforms/powermac/pfunc_base.c index fd2e210559c8..62311e84a423 100644 --- a/arch/powerpc/platforms/powermac/pfunc_base.c +++ b/arch/powerpc/platforms/powermac/pfunc_base.c @@ -101,9 +101,8 @@ static void macio_gpio_init_one(struct macio_chip *macio) * Find the "gpio" parent node */ - for (gparent = NULL; - (gparent = of_get_next_child(macio->of_node, gparent)) != NULL;) - if (strcmp(gparent->name, "gpio") == 0) + for_each_child_of_node(macio->of_node, gparent) + if (of_node_name_eq(gparent, "gpio")) break; if (gparent == NULL) return; @@ -313,7 +312,7 @@ static void uninorth_install_pfunc(void) * Install handlers for the hwclock child if any */ for (np = NULL; (np = of_get_next_child(uninorth_node, np)) != NULL;) - if (strcmp(np->name, "hw-clock") == 0) { + if (of_node_name_eq(np, "hw-clock")) { unin_hwclock = np; break; } diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 5a71395e535e..c292ffac2ed4 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -553,13 +553,13 @@ void __init pmac_pic_init(void) for_each_node_with_property(np, "interrupt-controller") { /* Skip /chosen/interrupt-controller */ - if (strcmp(np->name, "chosen") == 0) + if (of_node_name_eq(np, "chosen")) continue; /* It seems like at least one person wants * to use BootX on a machine with an AppleKiwi * controller which happens to pretend to be an * interrupt controller too. */ - if (strcmp(np->name, "AppleKiwi") == 0) + if (of_node_name_eq(np, "AppleKiwi")) continue; /* I think we found one ! */ of_irq_dflt_pic = np; diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 2f00e3daafb0..2e8221e20ee8 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -560,15 +560,9 @@ static int __init check_pmac_serial_console(void) } pr_debug("stdout is %pOF\n", prom_stdout); - name = of_get_property(prom_stdout, "name", NULL); - if (!name) { - pr_debug(" stdout package has no name !\n"); - goto not_found; - } - - if (strcmp(name, "ch-a") == 0) + if (of_node_name_eq(prom_stdout, "ch-a")) offset = 0; - else if (strcmp(name, "ch-b") == 0) + else if (of_node_name_eq(prom_stdout, "ch-b")) offset = 1; else goto not_found; diff --git a/arch/powerpc/platforms/powermac/udbg_scc.c b/arch/powerpc/platforms/powermac/udbg_scc.c index 8901973ed683..415b74d7c253 100644 --- a/arch/powerpc/platforms/powermac/udbg_scc.c +++ b/arch/powerpc/platforms/powermac/udbg_scc.c @@ -87,7 +87,7 @@ void udbg_scc_init(int force_scc) for (ch = NULL; (ch = of_get_next_child(escc, ch)) != NULL;) { if (ch == stdout) ch_def = of_node_get(ch); - if (strcmp(ch->name, "ch-a") == 0) + if (of_node_name_eq(ch, "ch-a")) ch_a = of_node_get(ch); } if (ch_def == NULL && !force_scc) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index abc0be7507c8..f38078976c5d 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -564,8 +564,8 @@ static void pnv_eeh_get_phb_diag(struct eeh_pe *pe) static int pnv_eeh_get_phb_state(struct eeh_pe *pe) { struct pnv_phb *phb = pe->phb->private_data; - u8 fstate; - __be16 pcierr; + u8 fstate = 0; + __be16 pcierr = 0; s64 rc; int result = 0; @@ -603,8 +603,8 @@ static int pnv_eeh_get_phb_state(struct eeh_pe *pe) static int pnv_eeh_get_pe_state(struct eeh_pe *pe) { struct pnv_phb *phb = pe->phb->private_data; - u8 fstate; - __be16 pcierr; + u8 fstate = 0; + __be16 pcierr = 0; s64 rc; int result; diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 75b935252981..d7f742ed48ba 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -9,32 +9,19 @@ * License as published by the Free Software Foundation. */ -#include <linux/slab.h> #include <linux/mmu_notifier.h> #include <linux/mmu_context.h> #include <linux/of.h> -#include <linux/export.h> #include <linux/pci.h> #include <linux/memblock.h> -#include <linux/iommu.h> #include <linux/sizes.h> #include <asm/debugfs.h> -#include <asm/tlb.h> #include <asm/powernv.h> -#include <asm/reg.h> -#include <asm/opal.h> -#include <asm/io.h> -#include <asm/iommu.h> -#include <asm/pnv-pci.h> -#include <asm/msi_bitmap.h> #include <asm/opal.h> -#include "powernv.h" #include "pci.h" -#define npu_to_phb(x) container_of(x, struct pnv_phb, npu) - /* * spinlock to protect initialisation of an npu_context for a particular * mm_struct. @@ -133,15 +120,25 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, return pe; } -long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, +static long pnv_npu_unset_window(struct iommu_table_group *table_group, + int num); + +static long pnv_npu_set_window(struct iommu_table_group *table_group, int num, struct iommu_table *tbl) { + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, + table_group); struct pnv_phb *phb = npe->phb; int64_t rc; const unsigned long size = tbl->it_indirect_levels ? tbl->it_level_size : tbl->it_size; const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; const __u64 win_size = tbl->it_size << tbl->it_page_shift; + int num2 = (num == 0) ? 1 : 0; + + /* NPU has just one TVE so if there is another table, remove it first */ + if (npe->table_group.tables[num2]) + pnv_npu_unset_window(&npe->table_group, num2); pe_info(npe, "Setting up window %llx..%llx pg=%lx\n", start_addr, start_addr + win_size - 1, @@ -167,11 +164,16 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, return 0; } -long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num) +static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num) { + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, + table_group); struct pnv_phb *phb = npe->phb; int64_t rc; + if (!npe->table_group.tables[num]) + return 0; + pe_info(npe, "Removing DMA window\n"); rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, @@ -210,7 +212,8 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) if (!gpe) return; - rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]); + rc = pnv_npu_set_window(&npe->table_group, 0, + gpe->table_group.tables[0]); /* * NVLink devices use the same TCE table configuration as @@ -235,7 +238,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev) return -EINVAL; - rc = pnv_npu_unset_window(npe, 0); + rc = pnv_npu_unset_window(&npe->table_group, 0); if (rc != OPAL_SUCCESS) return rc; @@ -288,11 +291,15 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) } } +#ifdef CONFIG_IOMMU_API /* Switch ownership from platform code to external user (e.g. VFIO) */ -void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) +static void pnv_npu_take_ownership(struct iommu_table_group *table_group) { + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, + table_group); struct pnv_phb *phb = npe->phb; int64_t rc; + struct pci_dev *gpdev = NULL; /* * Note: NPU has just a single TVE in the hardware which means that @@ -301,7 +308,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) * if it was enabled at the moment of ownership change. */ if (npe->table_group.tables[0]) { - pnv_npu_unset_window(npe, 0); + pnv_npu_unset_window(&npe->table_group, 0); return; } @@ -314,30 +321,315 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) return; } pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); + + get_gpu_pci_dev_and_pe(npe, &gpdev); + if (gpdev) + pnv_npu2_unmap_lpar_dev(gpdev); } -struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) +static void pnv_npu_release_ownership(struct iommu_table_group *table_group) { - struct pnv_phb *phb = npe->phb; - struct pci_bus *pbus = phb->hose->bus; - struct pci_dev *npdev, *gpdev = NULL, *gptmp; - struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, + table_group); + struct pci_dev *gpdev = NULL; + + get_gpu_pci_dev_and_pe(npe, &gpdev); + if (gpdev) + pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV); +} + +static struct iommu_table_group_ops pnv_pci_npu_ops = { + .set_window = pnv_npu_set_window, + .unset_window = pnv_npu_unset_window, + .take_ownership = pnv_npu_take_ownership, + .release_ownership = pnv_npu_release_ownership, +}; +#endif /* !CONFIG_IOMMU_API */ + +/* + * NPU2 ATS + */ +/* Maximum possible number of ATSD MMIO registers per NPU */ +#define NV_NMMU_ATSD_REGS 8 +#define NV_NPU_MAX_PE_NUM 16 + +/* + * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or + * up to 3 x (GPU + 2xNPUs) (POWER9). + */ +struct npu_comp { + struct iommu_table_group table_group; + int pe_num; + struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM]; +}; + +/* An NPU descriptor, valid for POWER9 only */ +struct npu { + int index; + __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS]; + unsigned int mmio_atsd_count; + + /* Bitmask for MMIO register usage */ + unsigned long mmio_atsd_usage; + + /* Do we need to explicitly flush the nest mmu? */ + bool nmmu_flush; + + struct npu_comp npucomp; +}; + +#ifdef CONFIG_IOMMU_API +static long pnv_npu_peers_create_table_userspace( + struct iommu_table_group *table_group, + int num, __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) +{ + struct npu_comp *npucomp = container_of(table_group, struct npu_comp, + table_group); + + if (!npucomp->pe_num || !npucomp->pe[0] || + !npucomp->pe[0]->table_group.ops || + !npucomp->pe[0]->table_group.ops->create_table) + return -EFAULT; + + return npucomp->pe[0]->table_group.ops->create_table( + &npucomp->pe[0]->table_group, num, page_shift, + window_size, levels, ptbl); +} + +static long pnv_npu_peers_set_window(struct iommu_table_group *table_group, + int num, struct iommu_table *tbl) +{ + int i, j; + long ret = 0; + struct npu_comp *npucomp = container_of(table_group, struct npu_comp, + table_group); + + for (i = 0; i < npucomp->pe_num; ++i) { + struct pnv_ioda_pe *pe = npucomp->pe[i]; + + if (!pe->table_group.ops->set_window) + continue; + + ret = pe->table_group.ops->set_window(&pe->table_group, + num, tbl); + if (ret) + break; + } + + if (ret) { + for (j = 0; j < i; ++j) { + struct pnv_ioda_pe *pe = npucomp->pe[j]; + + if (!pe->table_group.ops->unset_window) + continue; + + ret = pe->table_group.ops->unset_window( + &pe->table_group, num); + if (ret) + break; + } + } else { + table_group->tables[num] = iommu_tce_table_get(tbl); + } + + return ret; +} - if (!gpe || !gpdev) +static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group, + int num) +{ + int i, j; + long ret = 0; + struct npu_comp *npucomp = container_of(table_group, struct npu_comp, + table_group); + + for (i = 0; i < npucomp->pe_num; ++i) { + struct pnv_ioda_pe *pe = npucomp->pe[i]; + + WARN_ON(npucomp->table_group.tables[num] != + table_group->tables[num]); + if (!npucomp->table_group.tables[num]) + continue; + + if (!pe->table_group.ops->unset_window) + continue; + + ret = pe->table_group.ops->unset_window(&pe->table_group, num); + if (ret) + break; + } + + if (ret) { + for (j = 0; j < i; ++j) { + struct pnv_ioda_pe *pe = npucomp->pe[j]; + + if (!npucomp->table_group.tables[num]) + continue; + + if (!pe->table_group.ops->set_window) + continue; + + ret = pe->table_group.ops->set_window(&pe->table_group, + num, table_group->tables[num]); + if (ret) + break; + } + } else if (table_group->tables[num]) { + iommu_tce_table_put(table_group->tables[num]); + table_group->tables[num] = NULL; + } + + return ret; +} + +static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group) +{ + int i; + struct npu_comp *npucomp = container_of(table_group, struct npu_comp, + table_group); + + for (i = 0; i < npucomp->pe_num; ++i) { + struct pnv_ioda_pe *pe = npucomp->pe[i]; + + if (!pe->table_group.ops->take_ownership) + continue; + pe->table_group.ops->take_ownership(&pe->table_group); + } +} + +static void pnv_npu_peers_release_ownership( + struct iommu_table_group *table_group) +{ + int i; + struct npu_comp *npucomp = container_of(table_group, struct npu_comp, + table_group); + + for (i = 0; i < npucomp->pe_num; ++i) { + struct pnv_ioda_pe *pe = npucomp->pe[i]; + + if (!pe->table_group.ops->release_ownership) + continue; + pe->table_group.ops->release_ownership(&pe->table_group); + } +} + +static struct iommu_table_group_ops pnv_npu_peers_ops = { + .get_table_size = pnv_pci_ioda2_get_table_size, + .create_table = pnv_npu_peers_create_table_userspace, + .set_window = pnv_npu_peers_set_window, + .unset_window = pnv_npu_peers_unset_window, + .take_ownership = pnv_npu_peers_take_ownership, + .release_ownership = pnv_npu_peers_release_ownership, +}; + +static void pnv_comp_attach_table_group(struct npu_comp *npucomp, + struct pnv_ioda_pe *pe) +{ + if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM)) + return; + + npucomp->pe[npucomp->pe_num] = pe; + ++npucomp->pe_num; +} + +struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) +{ + struct iommu_table_group *table_group; + struct npu_comp *npucomp; + struct pci_dev *gpdev = NULL; + struct pci_controller *hose; + struct pci_dev *npdev = NULL; + + list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) { + npdev = pnv_pci_get_npu_dev(gpdev, 0); + if (npdev) + break; + } + + if (!npdev) + /* It is not an NPU attached device, skip */ + return NULL; + + hose = pci_bus_to_host(npdev->bus); + + if (hose->npu) { + table_group = &hose->npu->npucomp.table_group; + + if (!table_group->group) { + table_group->ops = &pnv_npu_peers_ops; + iommu_register_group(table_group, + hose->global_number, + pe->pe_number); + } + } else { + /* Create a group for 1 GPU and attached NPUs for POWER8 */ + pe->npucomp = kzalloc(sizeof(pe->npucomp), GFP_KERNEL); + table_group = &pe->npucomp->table_group; + table_group->ops = &pnv_npu_peers_ops; + iommu_register_group(table_group, hose->global_number, + pe->pe_number); + } + + /* Steal capabilities from a GPU PE */ + table_group->max_dynamic_windows_supported = + pe->table_group.max_dynamic_windows_supported; + table_group->tce32_start = pe->table_group.tce32_start; + table_group->tce32_size = pe->table_group.tce32_size; + table_group->max_levels = pe->table_group.max_levels; + if (!table_group->pgsizes) + table_group->pgsizes = pe->table_group.pgsizes; + + npucomp = container_of(table_group, struct npu_comp, table_group); + pnv_comp_attach_table_group(npucomp, pe); + + return table_group; +} + +struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) +{ + struct iommu_table_group *table_group; + struct npu_comp *npucomp; + struct pci_dev *gpdev = NULL; + struct pci_dev *npdev; + struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev); + + WARN_ON(!(pe->flags & PNV_IODA_PE_DEV)); + if (!gpe) return NULL; - list_for_each_entry(npdev, &pbus->devices, bus_list) { - gptmp = pnv_pci_get_gpu_dev(npdev); + /* + * IODA2 bridges get this set up from pci_controller_ops::setup_bridge + * but NPU bridges do not have this hook defined so we do it here. + * We do not setup other table group parameters as they won't be used + * anyway - NVLink bridges are subordinate PEs. + */ + pe->table_group.ops = &pnv_pci_npu_ops; + + table_group = iommu_group_get_iommudata( + iommu_group_get(&gpdev->dev)); + + /* + * On P9 NPU PHB and PCI PHB support different page sizes, + * keep only matching. We expect here that NVLink bridge PE pgsizes is + * initialized by the caller. + */ + table_group->pgsizes &= pe->table_group.pgsizes; + npucomp = container_of(table_group, struct npu_comp, table_group); + pnv_comp_attach_table_group(npucomp, pe); + + list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) { + struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev); - if (gptmp != gpdev) + if (gpdevtmp != gpdev) continue; - pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev)); - iommu_group_add_device(gpe->table_group.group, &npdev->dev); + iommu_add_device(table_group, &npdev->dev); } - return gpe; + return table_group; } +#endif /* CONFIG_IOMMU_API */ /* Maximum number of nvlinks per npu */ #define NV_MAX_LINKS 6 @@ -490,7 +782,6 @@ static void acquire_atsd_reg(struct npu_context *npu_context, int i, j; struct npu *npu; struct pci_dev *npdev; - struct pnv_phb *nphb; for (i = 0; i <= max_npu2_index; i++) { mmio_atsd_reg[i].reg = -1; @@ -505,8 +796,10 @@ static void acquire_atsd_reg(struct npu_context *npu_context, if (!npdev) continue; - nphb = pci_bus_to_host(npdev->bus)->private_data; - npu = &nphb->npu; + npu = pci_bus_to_host(npdev->bus)->npu; + if (!npu) + continue; + mmio_atsd_reg[i].npu = npu; mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); while (mmio_atsd_reg[i].reg < 0) { @@ -671,9 +964,9 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, u32 nvlink_index; struct device_node *nvlink_dn; struct mm_struct *mm = current->mm; - struct pnv_phb *nphb; struct npu *npu; struct npu_context *npu_context; + struct pci_controller *hose; /* * At present we don't support GPUs connected to multiple NPUs and I'm @@ -681,13 +974,14 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, */ struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); - if (!firmware_has_feature(FW_FEATURE_OPAL)) - return ERR_PTR(-ENODEV); - if (!npdev) /* No nvlink associated with this GPU device */ return ERR_PTR(-ENODEV); + /* We only support DR/PR/HV in pnv_npu2_map_lpar_dev() */ + if (flags & ~(MSR_DR | MSR_PR | MSR_HV)) + return ERR_PTR(-EINVAL); + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", &nvlink_index))) @@ -701,20 +995,10 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, return ERR_PTR(-EINVAL); } - nphb = pci_bus_to_host(npdev->bus)->private_data; - npu = &nphb->npu; - - /* - * Setup the NPU context table for a particular GPU. These need to be - * per-GPU as we need the tables to filter ATSDs when there are no - * active contexts on a particular GPU. It is safe for these to be - * called concurrently with destroy as the OPAL call takes appropriate - * locks and refcounts on init/destroy. - */ - rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags, - PCI_DEVID(gpdev->bus->number, gpdev->devfn)); - if (rc < 0) - return ERR_PTR(-ENOSPC); + hose = pci_bus_to_host(npdev->bus); + npu = hose->npu; + if (!npu) + return ERR_PTR(-ENODEV); /* * We store the npu pci device so we can more easily get at the @@ -726,9 +1010,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, if (npu_context->release_cb != cb || npu_context->priv != priv) { spin_unlock(&npu_context_lock); - opal_npu_destroy_context(nphb->opal_id, mm->context.id, - PCI_DEVID(gpdev->bus->number, - gpdev->devfn)); return ERR_PTR(-EINVAL); } @@ -754,9 +1035,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, if (rc) { kfree(npu_context); - opal_npu_destroy_context(nphb->opal_id, mm->context.id, - PCI_DEVID(gpdev->bus->number, - gpdev->devfn)); return ERR_PTR(rc); } @@ -776,7 +1054,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, */ WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev); - if (!nphb->npu.nmmu_flush) { + if (!npu->nmmu_flush) { /* * If we're not explicitly flushing ourselves we need to mark * the thread for global flushes @@ -809,27 +1087,24 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context, struct pci_dev *gpdev) { int removed; - struct pnv_phb *nphb; struct npu *npu; struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); struct device_node *nvlink_dn; u32 nvlink_index; + struct pci_controller *hose; if (WARN_ON(!npdev)) return; - if (!firmware_has_feature(FW_FEATURE_OPAL)) + hose = pci_bus_to_host(npdev->bus); + npu = hose->npu; + if (!npu) return; - - nphb = pci_bus_to_host(npdev->bus)->private_data; - npu = &nphb->npu; nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", &nvlink_index))) return; WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL); - opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id, - PCI_DEVID(gpdev->bus->number, gpdev->devfn)); spin_lock(&npu_context_lock); removed = kref_put(&npu_context->kref, pnv_npu2_release_context); spin_unlock(&npu_context_lock); @@ -857,13 +1132,12 @@ int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, u64 rc = 0, result = 0; int i, is_write; struct page *page[1]; + const char __user *u; + char c; /* mmap_sem should be held so the struct_mm must be present */ struct mm_struct *mm = context->mm; - if (!firmware_has_feature(FW_FEATURE_OPAL)) - return -ENODEV; - WARN_ON(!rwsem_is_locked(&mm->mmap_sem)); for (i = 0; i < count; i++) { @@ -872,18 +1146,17 @@ int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, is_write ? FOLL_WRITE : 0, page, NULL, NULL); - /* - * To support virtualised environments we will have to do an - * access to the page to ensure it gets faulted into the - * hypervisor. For the moment virtualisation is not supported in - * other areas so leave the access out. - */ if (rc != 1) { status[i] = rc; result = -EFAULT; continue; } + /* Make sure partition scoped tree gets a pte */ + u = page_address(page[0]); + if (__get_user(c, u)) + result = -EFAULT; + status[i] = 0; put_page(page[0]); } @@ -892,42 +1165,127 @@ int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, } EXPORT_SYMBOL(pnv_npu2_handle_fault); -int pnv_npu2_init(struct pnv_phb *phb) +int pnv_npu2_init(struct pci_controller *hose) { unsigned int i; u64 mmio_atsd; - struct device_node *dn; - struct pci_dev *gpdev; static int npu_index; - uint64_t rc = 0; - - phb->npu.nmmu_flush = - of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); - for_each_child_of_node(phb->hose->dn, dn) { - gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); - if (gpdev) { - rc = opal_npu_map_lpar(phb->opal_id, - PCI_DEVID(gpdev->bus->number, gpdev->devfn), - 0, 0); - if (rc) - dev_err(&gpdev->dev, - "Error %lld mapping device to LPAR\n", - rc); - } - } + struct npu *npu; + int ret; + + npu = kzalloc(sizeof(*npu), GFP_KERNEL); + if (!npu) + return -ENOMEM; - for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd", - i, &mmio_atsd); i++) - phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32); + npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush"); - pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i); - phb->npu.mmio_atsd_count = i; - phb->npu.mmio_atsd_usage = 0; + for (i = 0; i < ARRAY_SIZE(npu->mmio_atsd_regs) && + !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", + i, &mmio_atsd); i++) + npu->mmio_atsd_regs[i] = ioremap(mmio_atsd, 32); + + pr_info("NPU%d: Found %d MMIO ATSD registers", hose->global_number, i); + npu->mmio_atsd_count = i; + npu->mmio_atsd_usage = 0; npu_index++; - if (WARN_ON(npu_index >= NV_MAX_NPUS)) - return -ENOSPC; + if (WARN_ON(npu_index >= NV_MAX_NPUS)) { + ret = -ENOSPC; + goto fail_exit; + } max_npu2_index = npu_index; - phb->npu.index = npu_index; + npu->index = npu_index; + hose->npu = npu; + + return 0; + +fail_exit: + for (i = 0; i < npu->mmio_atsd_count; ++i) + iounmap(npu->mmio_atsd_regs[i]); + + kfree(npu); + + return ret; +} + +int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, + unsigned long msr) +{ + int ret; + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + struct pci_controller *hose; + struct pnv_phb *nphb; + + if (!npdev) + return -ENODEV; + + hose = pci_bus_to_host(npdev->bus); + nphb = hose->private_data; + + dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n", + nphb->opal_id, lparid); + /* + * Currently we only support radix and non-zero LPCR only makes sense + * for hash tables so skiboot expects the LPCR parameter to be a zero. + */ + ret = opal_npu_map_lpar(nphb->opal_id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn), lparid, + 0 /* LPCR bits */); + if (ret) { + dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret); + return ret; + } + + dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n", + nphb->opal_id, msr); + ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + if (ret < 0) + dev_err(&gpdev->dev, "Failed to init context: %d\n", ret); + else + ret = 0; return 0; } +EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev); + +void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr) +{ + struct pci_dev *gpdev; + + list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list) + pnv_npu2_map_lpar_dev(gpdev, 0, msr); +} + +int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev) +{ + int ret; + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + struct pci_controller *hose; + struct pnv_phb *nphb; + + if (!npdev) + return -ENODEV; + + hose = pci_bus_to_host(npdev->bus); + nphb = hose->private_data; + + dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n", + nphb->opal_id); + ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + if (ret < 0) { + dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret); + return ret; + } + + /* Set LPID to 0 anyway, just to be safe */ + dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id); + ret = opal_npu_map_lpar(nphb->opal_id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn), 0 /*LPID*/, + 0 /* LPCR bits */); + if (ret) + dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret); + + return ret; +} +EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev); diff --git a/arch/powerpc/platforms/powernv/opal-power.c b/arch/powerpc/platforms/powernv/opal-power.c index 58dc3308237f..89ab1da57657 100644 --- a/arch/powerpc/platforms/powernv/opal-power.c +++ b/arch/powerpc/platforms/powernv/opal-power.c @@ -138,7 +138,7 @@ static struct notifier_block opal_power_control_nb = { .priority = 0, }; -static int __init opal_power_control_init(void) +int __init opal_power_control_init(void) { int ret, supported = 0; struct device_node *np; @@ -176,4 +176,3 @@ static int __init opal_power_control_init(void) return 0; } -machine_subsys_initcall(powernv, opal_power_control_init); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index beed86f4224b..79586f127521 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -877,7 +877,7 @@ static int __init opal_init(void) consoles = of_find_node_by_path("/ibm,opal/consoles"); if (consoles) { for_each_child_of_node(consoles, np) { - if (strcmp(np->name, "serial")) + if (!of_node_name_eq(np, "serial")) continue; of_platform_device_create(np, NULL, NULL); } @@ -960,6 +960,9 @@ static int __init opal_init(void) /* Initialise OPAL sensor groups */ opal_sensor_groups_init(); + /* Initialise OPAL Power control interface */ + opal_power_control_init(); + return 0; } machine_subsys_initcall(powernv, opal_init); diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index fe9691040f54..697449afb3f7 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -299,7 +299,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (alloc_userspace_copy) { offset = 0; uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - levels, tce_table_size, &offset, + tmplevels, tce_table_size, &offset, &total_allocated_uas); if (!uas) goto free_tces_exit; @@ -368,6 +368,7 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, found = false; for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { if (table_group->tables[i] == tbl) { + iommu_tce_table_put(tbl); table_group->tables[i] = NULL; found = true; break; @@ -393,7 +394,7 @@ long pnv_pci_link_table_and_group(int node, int num, tgl->table_group = table_group; list_add_rcu(&tgl->next, &tbl->it_group_list); - table_group->tables[num] = tbl; + table_group->tables[num] = iommu_tce_table_get(tbl); return 0; } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index dd807446801e..1d6406a051f1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -190,7 +190,8 @@ static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) unsigned int pe_num = pe->pe_number; WARN_ON(pe->pdev); - + WARN_ON(pe->npucomp); /* NPUs are not supposed to be freed */ + kfree(pe->npucomp); memset(pe, 0, sizeof(struct pnv_ioda_pe)); clear_bit(pe_num, phb->ioda.pe_alloc); } @@ -517,8 +518,6 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) phb->init_m64 = pnv_ioda1_init_m64; else phb->init_m64 = pnv_ioda2_init_m64; - phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe; - phb->pick_m64_pe = pnv_ioda_pick_m64_pe; } static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) @@ -604,8 +603,8 @@ static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt) static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) { struct pnv_ioda_pe *slave, *pe; - u8 fstate, state; - __be16 pcierr; + u8 fstate = 0, state; + __be16 pcierr = 0; s64 rc; /* Sanity check on PE number */ @@ -663,10 +662,6 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) return state; } -/* Currently those 2 are only used when MSIs are enabled, this will change - * but in the meantime, we need to protect them to avoid warnings - */ -#ifdef CONFIG_PCI_MSI struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); @@ -679,7 +674,6 @@ struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) return NULL; return &phb->ioda.pe_array[pdn->pe_number]; } -#endif /* CONFIG_PCI_MSI */ static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, struct pnv_ioda_pe *parent, @@ -1160,8 +1154,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx]; /* Check if PE is determined by M64 */ - if (!pe && phb->pick_m64_pe) - pe = phb->pick_m64_pe(bus, all); + if (!pe) + pe = pnv_ioda_pick_m64_pe(bus, all); /* The PE number isn't pinned by M64 */ if (!pe) @@ -1273,19 +1267,20 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus) static void pnv_pci_ioda_setup_PEs(void) { - struct pci_controller *hose, *tmp; + struct pci_controller *hose; struct pnv_phb *phb; struct pci_bus *bus; struct pci_dev *pdev; + struct pnv_ioda_pe *pe; - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + list_for_each_entry(hose, &hose_list, list_node) { phb = hose->private_data; if (phb->type == PNV_PHB_NPU_NVLINK) { /* PE#0 is needed for error reporting */ pnv_ioda_reserve_pe(phb, 0); pnv_ioda_setup_npu_PEs(hose->bus); if (phb->model == PNV_PHB_MODEL_NPU2) - pnv_npu2_init(phb); + WARN_ON_ONCE(pnv_npu2_init(hose)); } if (phb->type == PNV_PHB_NPU_OCAPI) { bus = hose->bus; @@ -1293,6 +1288,14 @@ static void pnv_pci_ioda_setup_PEs(void) pnv_ioda_setup_dev_PE(pdev); } } + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + if (phb->type != PNV_PHB_IODA2) + continue; + + list_for_each_entry(pe, &phb->ioda.pe_list, list) + pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV); + } } #ifdef CONFIG_PCI_IOV @@ -1531,6 +1534,11 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); +#ifdef CONFIG_IOMMU_API +static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe, + struct iommu_table_group *table_group, struct pci_bus *bus); + +#endif static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) { struct pci_bus *bus; @@ -1584,6 +1592,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) mutex_unlock(&phb->ioda.pe_list_mutex); pnv_pci_ioda2_setup_dma_pe(phb, pe); +#ifdef CONFIG_IOMMU_API + pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL); +#endif } } @@ -1923,21 +1934,16 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev) return mask; } -static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, - struct pci_bus *bus, - bool add_to_group) +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) { struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); set_dma_offset(&dev->dev, pe->tce_bypass_base); - if (add_to_group) - iommu_add_device(&dev->dev); if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_bus_dma(pe, dev->subordinate, - add_to_group); + pnv_ioda_setup_bus_dma(pe, dev->subordinate); } } @@ -2366,16 +2372,8 @@ found: pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; iommu_init_table(tbl, phb->hose->node); - if (pe->flags & PNV_IODA_PE_DEV) { - /* - * Setting table base here only for carrying iommu_group - * further down to let iommu_add_device() do the job. - * pnv_pci_ioda_dma_dev_setup will override it later anyway. - */ - set_iommu_table_base(&pe->pdev->dev, tbl); - iommu_add_device(&pe->pdev->dev); - } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) + pnv_ioda_setup_bus_dma(pe, pe->pbus); return; fail: @@ -2527,14 +2525,6 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) if (!pnv_iommu_bypass_disabled) pnv_pci_ioda2_set_bypass(pe, true); - /* - * Setting table base here only for carrying iommu_group - * further down to let iommu_add_device() do the job. - * pnv_pci_ioda_dma_dev_setup will override it later anyway. - */ - if (pe->flags & PNV_IODA_PE_DEV) - set_iommu_table_base(&pe->pdev->dev, tbl); - return 0; } @@ -2565,7 +2555,7 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, #endif #ifdef CONFIG_IOMMU_API -static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, +unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, __u64 window_size, __u32 levels) { unsigned long bytes = 0; @@ -2616,7 +2606,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) pnv_pci_ioda2_set_bypass(pe, false); pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (pe->pbus) - pnv_ioda_setup_bus_dma(pe, pe->pbus, false); + pnv_ioda_setup_bus_dma(pe, pe->pbus); iommu_tce_table_put(tbl); } @@ -2627,7 +2617,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) pnv_pci_ioda2_setup_default_config(pe); if (pe->pbus) - pnv_ioda_setup_bus_dma(pe, pe->pbus, false); + pnv_ioda_setup_bus_dma(pe, pe->pbus); } static struct iommu_table_group_ops pnv_pci_ioda2_ops = { @@ -2639,131 +2629,100 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .release_ownership = pnv_ioda2_release_ownership, }; -static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque) +static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe, + struct iommu_table_group *table_group, + struct pci_bus *bus) { - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe **ptmppe = opaque; - struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); - struct pci_dn *pdn = pci_get_pdn(pdev); - - if (!pdn || pdn->pe_number == IODA_INVALID_PE) - return 0; - - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - if (phb->type != PNV_PHB_NPU_NVLINK) - return 0; + struct pci_dev *dev; - *ptmppe = &phb->ioda.pe_array[pdn->pe_number]; + list_for_each_entry(dev, &bus->devices, bus_list) { + iommu_add_device(table_group, &dev->dev); - return 1; + if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) + pnv_ioda_setup_bus_iommu_group_add_devices(pe, + table_group, dev->subordinate); + } } -/* - * This returns PE of associated NPU. - * This assumes that NPU is in the same IOMMU group with GPU and there is - * no other PEs. - */ -static struct pnv_ioda_pe *gpe_table_group_to_npe( - struct iommu_table_group *table_group) +static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe, + struct iommu_table_group *table_group, struct pci_bus *bus) { - struct pnv_ioda_pe *npe = NULL; - int ret = iommu_group_for_each_dev(table_group->group, &npe, - gpe_table_group_to_npe_cb); - BUG_ON(!ret || !npe); + if (pe->flags & PNV_IODA_PE_DEV) + iommu_add_device(table_group, &pe->pdev->dev); - return npe; + if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus) + pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group, + bus); } -static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group, - int num, struct iommu_table *tbl) -{ - struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group); - int num2 = (num == 0) ? 1 : 0; - long ret = pnv_pci_ioda2_set_window(table_group, num, tbl); - - if (ret) - return ret; - - if (table_group->tables[num2]) - pnv_npu_unset_window(npe, num2); - - ret = pnv_npu_set_window(npe, num, tbl); - if (ret) { - pnv_pci_ioda2_unset_window(table_group, num); - if (table_group->tables[num2]) - pnv_npu_set_window(npe, num2, - table_group->tables[num2]); - } - - return ret; -} +static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); -static long pnv_pci_ioda2_npu_unset_window( - struct iommu_table_group *table_group, - int num) +static void pnv_pci_ioda_setup_iommu_api(void) { - struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group); - int num2 = (num == 0) ? 1 : 0; - long ret = pnv_pci_ioda2_unset_window(table_group, num); - - if (ret) - return ret; - - if (!npe->table_group.tables[num]) - return 0; - - ret = pnv_npu_unset_window(npe, num); - if (ret) - return ret; - - if (table_group->tables[num2]) - ret = pnv_npu_set_window(npe, num2, table_group->tables[num2]); - - return ret; -} + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; -static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) -{ /* - * Detach NPU first as pnv_ioda2_take_ownership() will destroy - * the iommu_table if 32bit DMA is enabled. + * There are 4 types of PEs: + * - PNV_IODA_PE_BUS: a downstream port with an adapter, + * created from pnv_pci_setup_bridge(); + * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it, + * created from pnv_pci_setup_bridge(); + * - PNV_IODA_PE_VF: a SRIOV virtual function, + * created from pnv_pcibios_sriov_enable(); + * - PNV_IODA_PE_DEV: an NPU or OCAPI device, + * created from pnv_pci_ioda_fixup(). + * + * Normally a PE is represented by an IOMMU group, however for + * devices with side channels the groups need to be more strict. */ - pnv_npu_take_ownership(gpe_table_group_to_npe(table_group)); - pnv_ioda2_take_ownership(table_group); -} + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; -static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { - .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_pci_ioda2_create_table_userspace, - .set_window = pnv_pci_ioda2_npu_set_window, - .unset_window = pnv_pci_ioda2_npu_unset_window, - .take_ownership = pnv_ioda2_npu_take_ownership, - .release_ownership = pnv_ioda2_release_ownership, -}; + if (phb->type == PNV_PHB_NPU_NVLINK) + continue; -static void pnv_pci_ioda_setup_iommu_api(void) -{ - struct pci_controller *hose, *tmp; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe, *gpe; + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + struct iommu_table_group *table_group; + + table_group = pnv_try_setup_npu_table_group(pe); + if (!table_group) { + if (!pnv_pci_ioda_pe_dma_weight(pe)) + continue; + + table_group = &pe->table_group; + iommu_register_group(&pe->table_group, + pe->phb->hose->global_number, + pe->pe_number); + } + pnv_ioda_setup_bus_iommu_group(pe, table_group, + pe->pbus); + } + } /* * Now we have all PHBs discovered, time to add NPU devices to * the corresponding IOMMU groups. */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + list_for_each_entry(hose, &hose_list, list_node) { + unsigned long pgsizes; + phb = hose->private_data; if (phb->type != PNV_PHB_NPU_NVLINK) continue; + pgsizes = pnv_ioda_parse_tce_sizes(phb); list_for_each_entry(pe, &phb->ioda.pe_list, list) { - gpe = pnv_pci_npu_setup_iommu(pe); - if (gpe) - gpe->table_group.ops = &pnv_pci_ioda2_npu_ops; + /* + * IODA2 bridges get this set up from + * pci_controller_ops::setup_bridge but NPU bridges + * do not have this hook defined so we do it here. + */ + pe->table_group.pgsizes = pgsizes; + pnv_npu_compound_attach(pe); } } } @@ -2810,9 +2769,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; - iommu_register_group(&pe->table_group, phb->hose->global_number, - pe->pe_number); - /* The PE will reserve all possible 32-bits space */ pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", phb->ioda.m32_pci_base); @@ -2833,10 +2789,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, return; if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + pnv_ioda_setup_bus_dma(pe, pe->pbus); } -#ifdef CONFIG_PCI_MSI int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) { struct pnv_phb *phb = container_of(chip, struct pnv_phb, @@ -2982,9 +2937,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); } -#else -static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } -#endif /* CONFIG_PCI_MSI */ #ifdef CONFIG_PCI_IOV static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) @@ -3402,8 +3354,7 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type) return; /* Reserve PEs according to used M64 resources */ - if (phb->reserve_m64_pe) - phb->reserve_m64_pe(bus, NULL, all); + pnv_ioda_reserve_m64_pe(bus, NULL, all); /* * Assign PE. We might run here because of partial hotplug. @@ -3687,6 +3638,15 @@ static void pnv_pci_release_device(struct pci_dev *pdev) pnv_ioda_release_pe(pe); } +static void pnv_npu_disable_device(struct pci_dev *pdev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); + struct eeh_pe *eehpe = edev ? edev->pe : NULL; + + if (eehpe && eeh_ops && eeh_ops->reset) + eeh_ops->reset(eehpe, EEH_RESET_HOT); +} + static void pnv_pci_ioda_shutdown(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; @@ -3698,10 +3658,8 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, .dma_bus_setup = pnv_pci_dma_bus_setup, -#ifdef CONFIG_PCI_MSI .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, -#endif .enable_device_hook = pnv_pci_enable_device_hook, .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, @@ -3722,15 +3680,14 @@ static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, -#ifdef CONFIG_PCI_MSI .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, -#endif .enable_device_hook = pnv_pci_enable_device_hook, .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus = pnv_pci_reset_secondary_bus, .dma_set_mask = pnv_npu_dma_set_mask, .shutdown = pnv_pci_ioda_shutdown, + .disable_device = pnv_npu_disable_device, }; static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = { diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 13aef2323bbc..45fb70b4bfa7 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -160,7 +160,6 @@ exit: } EXPORT_SYMBOL_GPL(pnv_pci_set_power_state); -#ifdef CONFIG_PCI_MSI int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); @@ -229,7 +228,6 @@ void pnv_teardown_msi_irqs(struct pci_dev *pdev) msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1); } } -#endif /* CONFIG_PCI_MSI */ /* Nicely print the contents of the PE State Tables (PEST). */ static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size) @@ -602,8 +600,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) static void pnv_pci_config_check_eeh(struct pci_dn *pdn) { struct pnv_phb *phb = pdn->phb->private_data; - u8 fstate; - __be16 pcierr; + u8 fstate = 0; + __be16 pcierr = 0; unsigned int pe_no; s64 rc; @@ -1127,4 +1125,45 @@ void __init pnv_pci_init(void) set_pci_dma_ops(&dma_iommu_ops); } -machine_subsys_initcall_sync(powernv, tce_iommu_bus_notifier_init); +static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + struct pci_dev *pdev; + struct pci_dn *pdn; + struct pnv_ioda_pe *pe; + struct pci_controller *hose; + struct pnv_phb *phb; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + pdev = to_pci_dev(dev); + pdn = pci_get_pdn(pdev); + hose = pci_bus_to_host(pdev->bus); + phb = hose->private_data; + + WARN_ON_ONCE(!phb); + if (!pdn || pdn->pe_number == IODA_INVALID_PE || !phb) + return 0; + + pe = &phb->ioda.pe_array[pdn->pe_number]; + iommu_add_device(&pe->table_group, dev); + return 0; + case BUS_NOTIFY_DEL_DEVICE: + iommu_del_device(dev); + return 0; + default: + return 0; + } +} + +static struct notifier_block pnv_tce_iommu_bus_nb = { + .notifier_call = pnv_tce_iommu_bus_notifier, +}; + +static int __init pnv_tce_iommu_bus_notifier_init(void) +{ + bus_register_notifier(&pci_bus_type, &pnv_tce_iommu_bus_nb); + return 0; +} +machine_subsys_initcall_sync(powernv, pnv_tce_iommu_bus_notifier_init); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 8b37b28e3831..8e36da379252 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -8,9 +8,6 @@ struct pci_dn; -/* Maximum possible number of ATSD MMIO registers per NPU */ -#define NV_NMMU_ATSD_REGS 8 - enum pnv_phb_type { PNV_PHB_IODA1 = 0, PNV_PHB_IODA2 = 1, @@ -65,6 +62,7 @@ struct pnv_ioda_pe { /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ struct iommu_table_group table_group; + struct npu_comp *npucomp; /* 64-bit TCE bypass region */ bool tce_bypass_enabled; @@ -106,20 +104,14 @@ struct pnv_phb { struct dentry *dbgfs; #endif -#ifdef CONFIG_PCI_MSI unsigned int msi_base; unsigned int msi32_support; struct msi_bitmap msi_bmp; -#endif int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, unsigned int hwirq, unsigned int virq, unsigned int is_64, struct msi_msg *msg); void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); - void (*fixup_phb)(struct pci_controller *hose); int (*init_m64)(struct pnv_phb *phb); - void (*reserve_m64_pe)(struct pci_bus *bus, - unsigned long *pe_bitmap, bool all); - struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt); @@ -180,19 +172,6 @@ struct pnv_phb { unsigned int diag_data_size; u8 *diag_data; - /* Nvlink2 data */ - struct npu { - int index; - __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS]; - unsigned int mmio_atsd_count; - - /* Bitmask for MMIO register usage */ - unsigned long mmio_atsd_usage; - - /* Do we need to explicitly flush the nest mmu? */ - bool nmmu_flush; - } npu; - int p2p_target_count; }; @@ -210,6 +189,7 @@ extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); extern void pnv_pci_init_npu_phb(struct device_node *np); extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np); +extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); @@ -220,6 +200,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, + __u64 window_size, __u32 levels); extern int pnv_eeh_post_init(void); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, @@ -235,12 +217,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); -extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, - struct iommu_table *tbl); -extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); -extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); -extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); -extern int pnv_npu2_init(struct pnv_phb *phb); +extern struct iommu_table_group *pnv_try_setup_npu_table_group( + struct pnv_ioda_pe *pe); +extern struct iommu_table_group *pnv_npu_compound_attach( + struct pnv_ioda_pe *pe); /* pci-ioda-tce.c */ #define POWERNV_IOMMU_DEFAULT_LEVELS 1 diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 3b881ac66d9a..d291b618a559 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -197,6 +197,7 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) found = find_aa_index(dr_node, ala_prop, lmb_assoc, &aa_index); + of_node_put(dr_node); dlpar_free_cc_nodes(lmb_node); if (!found) { @@ -353,8 +354,11 @@ static bool lmb_is_removable(struct drmem_lmb *lmb) phys_addr = lmb->base_addr; #ifdef CONFIG_FA_DUMP - /* Don't hot-remove memory that falls in fadump boot memory area */ - if (is_fadump_boot_memory_area(phys_addr, block_sz)) + /* + * Don't hot-remove memory that falls in fadump boot memory area + * and memory that is reserved for capturing old kernel memory. + */ + if (is_fadump_memory_area(phys_addr, block_sz)) return false; #endif diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 06f02960b439..8fc8fe0b9848 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -57,7 +57,6 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) { struct iommu_table_group *table_group; struct iommu_table *tbl; - struct iommu_table_group_link *tgl; table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, node); @@ -68,22 +67,13 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) if (!tbl) goto free_group; - tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, - node); - if (!tgl) - goto free_table; - INIT_LIST_HEAD_RCU(&tbl->it_group_list); kref_init(&tbl->it_kref); - tgl->table_group = table_group; - list_add_rcu(&tgl->next, &tbl->it_group_list); table_group->tables[0] = tbl; return table_group; -free_table: - kfree(tbl); free_group: kfree(table_group); return NULL; @@ -93,23 +83,12 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group, const char *node_name) { struct iommu_table *tbl; -#ifdef CONFIG_IOMMU_API - struct iommu_table_group_link *tgl; -#endif if (!table_group) return; tbl = table_group->tables[0]; #ifdef CONFIG_IOMMU_API - tgl = list_first_entry_or_null(&tbl->it_group_list, - struct iommu_table_group_link, next); - - WARN_ON_ONCE(!tgl); - if (tgl) { - list_del_rcu(&tgl->next); - kfree(tgl); - } if (table_group->group) { iommu_group_put(table_group->group); BUG_ON(table_group->group); @@ -645,7 +624,6 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci->phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; iommu_init_table(tbl, pci->phb->node); - iommu_register_group(pci->table_group, pci_domain_nr(bus), 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -756,10 +734,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) iommu_table_setparms(phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; iommu_init_table(tbl, phb->node); - iommu_register_group(PCI_DN(dn)->table_group, - pci_domain_nr(phb->bus), 0); set_iommu_table_base(&dev->dev, tbl); - iommu_add_device(&dev->dev); return; } @@ -770,11 +745,10 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL) dn = dn->parent; - if (dn && PCI_DN(dn)) { + if (dn && PCI_DN(dn)) set_iommu_table_base(&dev->dev, PCI_DN(dn)->table_group->tables[0]); - iommu_add_device(&dev->dev); - } else + else printk(KERN_WARNING "iommu: Device %s has no iommu table\n", pci_name(dev)); } @@ -964,6 +938,37 @@ struct failed_ddw_pdn { static LIST_HEAD(failed_ddw_pdn_list); +static phys_addr_t ddw_memory_hotplug_max(void) +{ + phys_addr_t max_addr = memory_hotplug_max(); + struct device_node *memory; + + for_each_node_by_type(memory, "memory") { + unsigned long start, size; + int ranges, n_mem_addr_cells, n_mem_size_cells, len; + const __be32 *memcell_buf; + + memcell_buf = of_get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + n_mem_addr_cells = of_n_addr_cells(memory); + n_mem_size_cells = of_n_size_cells(memory); + + /* ranges in cell */ + ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); + + start = of_read_number(memcell_buf, n_mem_addr_cells); + memcell_buf += n_mem_addr_cells; + size = of_read_number(memcell_buf, n_mem_size_cells); + memcell_buf += n_mem_size_cells; + + max_addr = max_t(phys_addr_t, max_addr, start + size); + } + + return max_addr; +} + /* * If the PE supports dynamic dma windows, and there is space for a table * that can map all pages in a linear offset, then setup such a table, @@ -1053,7 +1058,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) } /* verify the window * number of ptes will map the partition */ /* check largest block * page size > max memory hotplug addr */ - max_addr = memory_hotplug_max(); + max_addr = ddw_memory_hotplug_max(); if (query.largest_available_block < (max_addr >> page_shift)) { dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u " "%llu-sized pages\n", max_addr, query.largest_available_block, @@ -1190,7 +1195,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) } set_iommu_table_base(&dev->dev, pci->table_group->tables[0]); - iommu_add_device(&dev->dev); + iommu_add_device(pci->table_group, &dev->dev); } static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) @@ -1395,4 +1400,27 @@ static int __init disable_multitce(char *str) __setup("multitce=", disable_multitce); +static int tce_iommu_bus_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + + switch (action) { + case BUS_NOTIFY_DEL_DEVICE: + iommu_del_device(dev); + return 0; + default: + return 0; + } +} + +static struct notifier_block tce_iommu_bus_nb = { + .notifier_call = tce_iommu_bus_notifier, +}; + +static int __init tce_iommu_bus_notifier_init(void) +{ + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb); + return 0; +} machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init); diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 41d8a4d1d02e..7725825d887d 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -29,6 +29,7 @@ #include <asm/pci-bridge.h> #include <asm/prom.h> #include <asm/ppc-pci.h> +#include <asm/pci.h> #include "pseries.h" #if 0 @@ -237,6 +238,8 @@ static void __init pSeries_request_regions(void) void __init pSeries_final_fixup(void) { + struct pci_controller *hose; + pSeries_request_regions(); eeh_probe_devices(); @@ -246,6 +249,25 @@ void __init pSeries_final_fixup(void) ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable; ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable; #endif + list_for_each_entry(hose, &hose_list, list_node) { + struct device_node *dn = hose->dn, *nvdn; + + while (1) { + dn = of_find_all_nodes(dn); + if (!dn) + break; + nvdn = of_parse_phandle(dn, "ibm,nvlink", 0); + if (!nvdn) + continue; + if (!of_device_is_compatible(nvdn, "ibm,npu-link")) + continue; + if (!of_device_is_compatible(nvdn->parent, + "ibm,power9-npu")) + continue; + WARN_ON_ONCE(pnv_npu2_init(hose)); + break; + } + } } /* diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c index a27f40eb57b1..27f0a915c8a9 100644 --- a/arch/powerpc/platforms/pseries/pmem.c +++ b/arch/powerpc/platforms/pseries/pmem.c @@ -52,8 +52,8 @@ static ssize_t pmem_drc_add_node(u32 drc_index) /* NB: The of reconfig notifier creates platform device from the node */ rc = dlpar_attach_node(dn, pmem_node); if (rc) { - pr_err("Failed to attach node %s, rc: %d, drc index: %x\n", - dn->name, rc, drc_index); + pr_err("Failed to attach node %pOF, rc: %d, drc index: %x\n", + dn, rc, drc_index); if (dlpar_release_drc(drc_index)) dlpar_free_cc_nodes(dn); @@ -93,8 +93,8 @@ static ssize_t pmem_drc_remove_node(u32 drc_index) rc = dlpar_release_drc(drc_index); if (rc) { - pr_err("Failed to release drc (%x) for CPU %s, rc: %d\n", - drc_index, dn->name, rc); + pr_err("Failed to release drc (%x) for CPU %pOFn, rc: %d\n", + drc_index, dn, rc); dlpar_attach_node(dn, pmem_node); return rc; } diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 4078a05fa5c0..41f62ca27c63 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -190,7 +190,7 @@ static void __init pseries_setup_i8259_cascade(void) of_node_put(old); if (np == NULL) break; - if (strcmp(np->name, "pci") != 0) + if (!of_node_name_eq(np, "pci")) continue; addrp = of_get_property(np, "8259-interrupt-acknowledge", NULL); if (addrp == NULL) diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c index c5b902b86b44..ec46aa84d18d 100644 --- a/drivers/ide/pmac.c +++ b/drivers/ide/pmac.c @@ -1045,7 +1045,7 @@ static int pmac_ide_setup_device(pmac_ide_hwif_t *pmif, struct ide_hw *hw) d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA5; } else if (of_device_is_compatible(np, "keylargo-ata")) { - if (strcmp(np->name, "ata-4") == 0) { + if (of_node_name_eq(np, "ata-4")) { pmif->kind = controller_kl_ata4; d.port_ops = &pmac_ide_ata4_port_ops; d.udma_mask = ATA_UDMA4; diff --git a/drivers/macintosh/ans-lcd.c b/drivers/macintosh/ans-lcd.c index c8e078b911c7..ef0c2366cf59 100644 --- a/drivers/macintosh/ans-lcd.c +++ b/drivers/macintosh/ans-lcd.c @@ -160,7 +160,7 @@ anslcd_init(void) struct device_node* node; node = of_find_node_by_name(NULL, "lcd"); - if (!node || !node->parent || strcmp(node->parent->name, "gc")) { + if (!node || !of_node_name_eq(node->parent, "gc")) { of_node_put(node); return -ENODEV; } diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index bc8418801224..3543a82081de 100644 --- a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -190,11 +190,11 @@ static int macio_resource_quirks(struct device_node *np, struct resource *res, return 0; /* Grand Central has too large resource 0 on some machines */ - if (index == 0 && !strcmp(np->name, "gc")) + if (index == 0 && of_node_name_eq(np, "gc")) res->end = res->start + 0x1ffff; /* Airport has bogus resource 2 */ - if (index >= 2 && !strcmp(np->name, "radio")) + if (index >= 2 && of_node_name_eq(np, "radio")) return 1; #ifndef CONFIG_PPC64 @@ -207,20 +207,20 @@ static int macio_resource_quirks(struct device_node *np, struct resource *res, * level of hierarchy, but I don't really feel the need * for it */ - if (!strcmp(np->name, "escc")) + if (of_node_name_eq(np, "escc")) return 1; /* ESCC has bogus resources >= 3 */ - if (index >= 3 && !(strcmp(np->name, "ch-a") && - strcmp(np->name, "ch-b"))) + if (index >= 3 && (of_node_name_eq(np, "ch-a") || + of_node_name_eq(np, "ch-b"))) return 1; /* Media bay has too many resources, keep only first one */ - if (index > 0 && !strcmp(np->name, "media-bay")) + if (index > 0 && of_node_name_eq(np, "media-bay")) return 1; /* Some older IDE resources have bogus sizes */ - if (!strcmp(np->name, "IDE") || !strcmp(np->name, "ATA") || + if (of_node_name_eq(np, "IDE") || of_node_name_eq(np, "ATA") || of_node_is_type(np, "ide") || of_node_is_type(np, "ata")) { if (index == 0 && (res->end - res->start) > 0xfff) res->end = res->start + 0xfff; @@ -260,7 +260,7 @@ static void macio_add_missing_resources(struct macio_dev *dev) irq_base = 64; /* Fix SCC */ - if (strcmp(np->name, "ch-a") == 0) { + if (of_node_name_eq(np, "ch-a")) { macio_create_fixup_irq(dev, 0, 15 + irq_base); macio_create_fixup_irq(dev, 1, 4 + irq_base); macio_create_fixup_irq(dev, 2, 5 + irq_base); @@ -268,18 +268,18 @@ static void macio_add_missing_resources(struct macio_dev *dev) } /* Fix media-bay */ - if (strcmp(np->name, "media-bay") == 0) { + if (of_node_name_eq(np, "media-bay")) { macio_create_fixup_irq(dev, 0, 29 + irq_base); printk(KERN_INFO "macio: fixed media-bay irq on gatwick\n"); } /* Fix left media bay childs */ - if (dev->media_bay != NULL && strcmp(np->name, "floppy") == 0) { + if (dev->media_bay != NULL && of_node_name_eq(np, "floppy")) { macio_create_fixup_irq(dev, 0, 19 + irq_base); macio_create_fixup_irq(dev, 1, 1 + irq_base); printk(KERN_INFO "macio: fixed left floppy irqs\n"); } - if (dev->media_bay != NULL && strcasecmp(np->name, "ata4") == 0) { + if (dev->media_bay != NULL && of_node_name_eq(np, "ata4")) { macio_create_fixup_irq(dev, 0, 14 + irq_base); macio_create_fixup_irq(dev, 0, 3 + irq_base); printk(KERN_INFO "macio: fixed left ide irqs\n"); @@ -438,11 +438,8 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, static int macio_skip_device(struct device_node *np) { - if (strncmp(np->name, "battery", 7) == 0) - return 1; - if (strncmp(np->name, "escc-legacy", 11) == 0) - return 1; - return 0; + return of_node_name_prefix(np, "battery") || + of_node_name_prefix(np, "escc-legacy"); } /** @@ -489,9 +486,9 @@ static void macio_pci_add_devices(struct macio_chip *chip) root_res); if (mdev == NULL) of_node_put(np); - else if (strncmp(np->name, "media-bay", 9) == 0) + else if (of_node_name_prefix(np, "media-bay")) mbdev = mdev; - else if (strncmp(np->name, "escc", 4) == 0) + else if (of_node_name_prefix(np, "escc")) sdev = mdev; } diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c index 1f29d2413c74..3940e2a032f7 100644 --- a/drivers/macintosh/rack-meter.c +++ b/drivers/macintosh/rack-meter.c @@ -376,18 +376,19 @@ static int rackmeter_probe(struct macio_dev* mdev, pr_debug("rackmeter_probe()\n"); /* Get i2s-a node */ - while ((i2s = of_get_next_child(mdev->ofdev.dev.of_node, i2s)) != NULL) - if (strcmp(i2s->name, "i2s-a") == 0) - break; + for_each_child_of_node(mdev->ofdev.dev.of_node, i2s) + if (of_node_name_eq(i2s, "i2s-a")) + break; + if (i2s == NULL) { pr_debug(" i2s-a child not found\n"); goto bail; } /* Get lightshow or virtual sound */ - while ((np = of_get_next_child(i2s, np)) != NULL) { - if (strcmp(np->name, "lightshow") == 0) + for_each_child_of_node(i2s, np) { + if (of_node_name_eq(np, "lightshow")) break; - if ((strcmp(np->name, "sound") == 0) && + if (of_node_name_eq(np, "sound") && of_get_property(np, "virtual", NULL) != NULL) break; } diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 60f57e2abf21..ac0cf37d6239 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -318,8 +318,8 @@ int __init find_via_pmu(void) PMU_INT_ADB | PMU_INT_TICK; - if (vias->parent->name && ((strcmp(vias->parent->name, "ohare") == 0) - || of_device_is_compatible(vias->parent, "ohare"))) + if (of_node_name_eq(vias->parent, "ohare") || + of_device_is_compatible(vias->parent, "ohare")) pmu_kind = PMU_OHARE_BASED; else if (of_device_is_compatible(vias->parent, "paddington")) pmu_kind = PMU_PADDINGTON_BASED; diff --git a/drivers/macintosh/windfarm_lm87_sensor.c b/drivers/macintosh/windfarm_lm87_sensor.c index 35aa571d498a..09724acd70b6 100644 --- a/drivers/macintosh/windfarm_lm87_sensor.c +++ b/drivers/macintosh/windfarm_lm87_sensor.c @@ -110,8 +110,8 @@ static int wf_lm87_probe(struct i2c_client *client, * the Xserve G5 has several lm87's. However, for now we only * care about the internal temperature sensor */ - while ((np = of_get_next_child(client->dev.of_node, np)) != NULL) { - if (strcmp(np->name, "int-temp")) + for_each_child_of_node(client->dev.of_node, np) { + if (!of_node_name_eq(np, "int-temp")) continue; loc = of_get_property(np, "location", NULL); if (!loc) diff --git a/drivers/macintosh/windfarm_smu_controls.c b/drivers/macintosh/windfarm_smu_controls.c index 86d65462a61c..2cb9652a9998 100644 --- a/drivers/macintosh/windfarm_smu_controls.c +++ b/drivers/macintosh/windfarm_smu_controls.c @@ -267,7 +267,7 @@ static int __init smu_controls_init(void) /* Look for RPM fans */ for (fans = NULL; (fans = of_get_next_child(smu, fans)) != NULL;) - if (!strcmp(fans->name, "rpm-fans") || + if (of_node_name_eq(fans, "rpm-fans") || of_device_is_compatible(fans, "smu-rpm-fans")) break; for (fan = NULL; @@ -287,7 +287,7 @@ static int __init smu_controls_init(void) /* Look for PWM fans */ for (fans = NULL; (fans = of_get_next_child(smu, fans)) != NULL;) - if (!strcmp(fans->name, "pwm-fans")) + if (of_node_name_eq(fans, "pwm-fans")) break; for (fan = NULL; fans && (fan = of_get_next_child(fans, fan)) != NULL;) { diff --git a/drivers/macintosh/windfarm_smu_sensors.c b/drivers/macintosh/windfarm_smu_sensors.c index 1ba86de93f92..a58f6733381a 100644 --- a/drivers/macintosh/windfarm_smu_sensors.c +++ b/drivers/macintosh/windfarm_smu_sensors.c @@ -424,7 +424,7 @@ static int __init smu_sensors_init(void) /* Look for sensors subdir */ for (sensors = NULL; (sensors = of_get_next_child(smu, sensors)) != NULL;) - if (!strcmp(sensors->name, "sensors")) + if (of_node_name_eq(sensors, "sensors")) break; of_node_put(smu); diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c index 57a6bb1fd3c9..8f2c5d8bd2ee 100644 --- a/drivers/misc/ocxl/config.c +++ b/drivers/misc/ocxl/config.c @@ -318,7 +318,7 @@ static int read_afu_name(struct pci_dev *dev, struct ocxl_fn_config *fn, if (rc) return rc; ptr = (u32 *) &afu->name[i]; - *ptr = val; + *ptr = le32_to_cpu((__force __le32) val); } afu->name[OCXL_AFU_NAME_SZ - 1] = '\0'; /* play safe */ return 0; diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 42dc1d3d71cf..d0f8e4f5a039 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -38,3 +38,9 @@ config VFIO_PCI_IGD and LPC bridge config space. To enable Intel IGD assignment through vfio-pci, say Y. + +config VFIO_PCI_NVLINK2 + def_bool y + depends on VFIO_PCI && PPC_POWERNV + help + VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 76d8ec058edd..9662c063a6b1 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,5 +1,6 @@ vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o +vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/trace.h b/drivers/vfio/pci/trace.h new file mode 100644 index 000000000000..228ccdb8d1c8 --- /dev/null +++ b/drivers/vfio/pci/trace.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * VFIO PCI mmap/mmap_fault tracepoints + * + * Copyright (C) 2018 IBM Corp. All rights reserved. + * Author: Alexey Kardashevskiy <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vfio_pci + +#if !defined(_TRACE_VFIO_PCI_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VFIO_PCI_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(vfio_pci_nvgpu_mmap_fault, + TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua, + vm_fault_t ret), + TP_ARGS(pdev, hpa, ua, ret), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, hpa) + __field(unsigned long, ua) + __field(int, ret) + ), + + TP_fast_assign( + __entry->name = dev_name(&pdev->dev), + __entry->hpa = hpa; + __entry->ua = ua; + __entry->ret = ret; + ), + + TP_printk("%s: %lx -> %lx ret=%d", __entry->name, __entry->hpa, + __entry->ua, __entry->ret) +); + +TRACE_EVENT(vfio_pci_nvgpu_mmap, + TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua, + unsigned long size, int ret), + TP_ARGS(pdev, hpa, ua, size, ret), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, hpa) + __field(unsigned long, ua) + __field(unsigned long, size) + __field(int, ret) + ), + + TP_fast_assign( + __entry->name = dev_name(&pdev->dev), + __entry->hpa = hpa; + __entry->ua = ua; + __entry->size = size; + __entry->ret = ret; + ), + + TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa, + __entry->ua, __entry->size, __entry->ret) +); + +TRACE_EVENT(vfio_pci_npu2_mmap, + TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua, + unsigned long size, int ret), + TP_ARGS(pdev, hpa, ua, size, ret), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, hpa) + __field(unsigned long, ua) + __field(unsigned long, size) + __field(int, ret) + ), + + TP_fast_assign( + __entry->name = dev_name(&pdev->dev), + __entry->hpa = hpa; + __entry->ua = ua; + __entry->size = size; + __entry->ret = ret; + ), + + TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa, + __entry->ua, __entry->size, __entry->ret) +); + +#endif /* _TRACE_VFIO_PCI_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 50cdedfca9fe..a89fa5d4e877 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -289,14 +289,37 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) if (ret) { dev_warn(&vdev->pdev->dev, "Failed to setup Intel IGD regions\n"); - vfio_pci_disable(vdev); - return ret; + goto disable_exit; + } + } + + if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && + IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { + ret = vfio_pci_nvdia_v100_nvlink2_init(vdev); + if (ret && ret != -ENODEV) { + dev_warn(&vdev->pdev->dev, + "Failed to setup NVIDIA NV2 RAM region\n"); + goto disable_exit; + } + } + + if (pdev->vendor == PCI_VENDOR_ID_IBM && + IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { + ret = vfio_pci_ibm_npu2_init(vdev); + if (ret && ret != -ENODEV) { + dev_warn(&vdev->pdev->dev, + "Failed to setup NVIDIA NV2 ATSD region\n"); + goto disable_exit; } } vfio_pci_probe_mmaps(vdev); return 0; + +disable_exit: + vfio_pci_disable(vdev); + return ret; } static void vfio_pci_disable(struct vfio_pci_device *vdev) @@ -750,6 +773,12 @@ static long vfio_pci_ioctl(void *device_data, if (ret) return ret; + if (vdev->region[i].ops->add_capability) { + ret = vdev->region[i].ops->add_capability(vdev, + &vdev->region[i], &caps); + if (ret) + return ret; + } } } @@ -1117,6 +1146,15 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) return -EINVAL; if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL; + if (index >= VFIO_PCI_NUM_REGIONS) { + int regnum = index - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_region *region = vdev->region + regnum; + + if (region && region->ops && region->ops->mmap && + (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) + return region->ops->mmap(vdev, region, vma); + return -EINVAL; + } if (index >= VFIO_PCI_ROM_REGION_INDEX) return -EINVAL; if (!vdev->bar_mmap_supported[index]) diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c new file mode 100644 index 000000000000..054a2cf9dd8e --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_nvlink2.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2. + * + * Copyright (C) 2018 IBM Corp. All rights reserved. + * Author: Alexey Kardashevskiy <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Register an on-GPU RAM region for cacheable access. + * + * Derived from original vfio_pci_igd.c: + * Copyright (C) 2016 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <[email protected]> + */ + +#include <linux/io.h> +#include <linux/pci.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/sched/mm.h> +#include <linux/mmu_context.h> +#include <asm/kvm_ppc.h> +#include "vfio_pci_private.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault); +EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap); +EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap); + +struct vfio_pci_nvgpu_data { + unsigned long gpu_hpa; /* GPU RAM physical address */ + unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ + unsigned long useraddr; /* GPU RAM userspace address */ + unsigned long size; /* Size of the GPU RAM window (usually 128GB) */ + struct mm_struct *mm; + struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. */ + struct pci_dev *gpdev; + struct notifier_block group_notifier; +}; + +static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev, + char __user *buf, size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_nvgpu_data *data = vdev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK; + size_t sizealigned; + void __iomem *ptr; + + if (pos >= vdev->region[i].size) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[i].size - pos)); + + /* + * We map only a bit of GPU RAM for a short time instead of mapping it + * for the guest lifetime as: + * + * 1) we do not know GPU RAM size, only aperture which is 4-8 times + * bigger than actual RAM size (16/32GB RAM vs. 128GB aperture); + * 2) mapping GPU RAM allows CPU to prefetch and if this happens + * before NVLink bridge is reset (which fences GPU RAM), + * hardware management interrupts (HMI) might happen, this + * will freeze NVLink bridge. + * + * This is not fast path anyway. + */ + sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE); + ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned); + if (!ptr) + return -EFAULT; + + if (iswrite) { + if (copy_from_user(ptr + posoff, buf, count)) + count = -EFAULT; + else + *ppos += count; + } else { + if (copy_to_user(buf, ptr + posoff, count)) + count = -EFAULT; + else + *ppos += count; + } + + iounmap(ptr); + + return count; +} + +static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev, + struct vfio_pci_region *region) +{ + struct vfio_pci_nvgpu_data *data = region->data; + long ret; + + /* If there were any mappings at all... */ + if (data->mm) { + ret = mm_iommu_put(data->mm, data->mem); + WARN_ON(ret); + + mmdrop(data->mm); + } + + vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, + &data->group_notifier); + + pnv_npu2_unmap_lpar_dev(data->gpdev); + + kfree(data); +} + +static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf) +{ + vm_fault_t ret; + struct vm_area_struct *vma = vmf->vma; + struct vfio_pci_region *region = vma->vm_private_data; + struct vfio_pci_nvgpu_data *data = region->data; + unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT; + unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT; + unsigned long vm_pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + unsigned long pfn = nv2pg + vm_pgoff + vmf_off; + + ret = vmf_insert_pfn(vma, vmf->address, pfn); + trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT, + vmf->address, ret); + + return ret; +} + +static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = { + .fault = vfio_pci_nvgpu_mmap_fault, +}; + +static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, struct vm_area_struct *vma) +{ + int ret; + struct vfio_pci_nvgpu_data *data = region->data; + + if (data->useraddr) + return -EPERM; + + if (vma->vm_end - vma->vm_start > data->size) + return -EINVAL; + + vma->vm_private_data = region; + vma->vm_flags |= VM_PFNMAP; + vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops; + + /* + * Calling mm_iommu_newdev() here once as the region is not + * registered yet and therefore right initialization will happen now. + * Other places will use mm_iommu_find() which returns + * registered @mem and does not go gup(). + */ + data->useraddr = vma->vm_start; + data->mm = current->mm; + + atomic_inc(&data->mm->mm_count); + ret = (int) mm_iommu_newdev(data->mm, data->useraddr, + (vma->vm_end - vma->vm_start) >> PAGE_SHIFT, + data->gpu_hpa, &data->mem); + + trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr, + vma->vm_end - vma->vm_start, ret); + + return ret; +} + +static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, struct vfio_info_cap *caps) +{ + struct vfio_pci_nvgpu_data *data = region->data; + struct vfio_region_info_cap_nvlink2_ssatgt cap = { 0 }; + + cap.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT; + cap.header.version = 1; + cap.tgt = data->gpu_tgt; + + return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); +} + +static const struct vfio_pci_regops vfio_pci_nvgpu_regops = { + .rw = vfio_pci_nvgpu_rw, + .release = vfio_pci_nvgpu_release, + .mmap = vfio_pci_nvgpu_mmap, + .add_capability = vfio_pci_nvgpu_add_capability, +}; + +static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb, + unsigned long action, void *opaque) +{ + struct kvm *kvm = opaque; + struct vfio_pci_nvgpu_data *data = container_of(nb, + struct vfio_pci_nvgpu_data, + group_notifier); + + if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm && + pnv_npu2_map_lpar_dev(data->gpdev, + kvm->arch.lpid, MSR_DR | MSR_PR)) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) +{ + int ret; + u64 reg[2]; + u64 tgt = 0; + struct device_node *npu_node, *mem_node; + struct pci_dev *npu_dev; + struct vfio_pci_nvgpu_data *data; + uint32_t mem_phandle = 0; + unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM; + + /* + * PCI config space does not tell us about NVLink presense but + * platform does, use this. + */ + npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0); + if (!npu_dev) + return -ENODEV; + + npu_node = pci_device_to_OF_node(npu_dev); + if (!npu_node) + return -EINVAL; + + if (of_property_read_u32(npu_node, "memory-region", &mem_phandle)) + return -EINVAL; + + mem_node = of_find_node_by_phandle(mem_phandle); + if (!mem_node) + return -EINVAL; + + if (of_property_read_variable_u64_array(mem_node, "reg", reg, + ARRAY_SIZE(reg), ARRAY_SIZE(reg)) != + ARRAY_SIZE(reg)) + return -EINVAL; + + if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { + dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); + return -EFAULT; + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->gpu_hpa = reg[0]; + data->gpu_tgt = tgt; + data->size = reg[1]; + + dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa, + data->gpu_hpa + data->size - 1); + + data->gpdev = vdev->pdev; + data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier; + + ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, + &events, &data->group_notifier); + if (ret) + goto free_exit; + + /* + * We have just set KVM, we do not need the listener anymore. + * Also, keeping it registered means that if more than one GPU is + * assigned, we will get several similar notifiers notifying about + * the same device again which does not help with anything. + */ + vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, + &data->group_notifier); + + ret = vfio_pci_register_dev_region(vdev, + PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, + &vfio_pci_nvgpu_regops, + data->size, + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP, + data); + if (ret) + goto free_exit; + + return 0; +free_exit: + kfree(data); + + return ret; +} + +/* + * IBM NPU2 bridge + */ +struct vfio_pci_npu2_data { + void *base; /* ATSD register virtual address, for emulated access */ + unsigned long mmio_atsd; /* ATSD physical address */ + unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ + unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */ +}; + +static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev, + char __user *buf, size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_npu2_data *data = vdev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (pos >= vdev->region[i].size) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[i].size - pos)); + + if (iswrite) { + if (copy_from_user(data->base + pos, buf, count)) + return -EFAULT; + } else { + if (copy_to_user(buf, data->base + pos, count)) + return -EFAULT; + } + *ppos += count; + + return count; +} + +static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, struct vm_area_struct *vma) +{ + int ret; + struct vfio_pci_npu2_data *data = region->data; + unsigned long req_len = vma->vm_end - vma->vm_start; + + if (req_len != PAGE_SIZE) + return -EINVAL; + + vma->vm_flags |= VM_PFNMAP; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT, + req_len, vma->vm_page_prot); + trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start, + vma->vm_end - vma->vm_start, ret); + + return ret; +} + +static void vfio_pci_npu2_release(struct vfio_pci_device *vdev, + struct vfio_pci_region *region) +{ + struct vfio_pci_npu2_data *data = region->data; + + memunmap(data->base); + kfree(data); +} + +static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, struct vfio_info_cap *caps) +{ + struct vfio_pci_npu2_data *data = region->data; + struct vfio_region_info_cap_nvlink2_ssatgt captgt = { 0 }; + struct vfio_region_info_cap_nvlink2_lnkspd capspd = { 0 }; + int ret; + + captgt.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT; + captgt.header.version = 1; + captgt.tgt = data->gpu_tgt; + + capspd.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD; + capspd.header.version = 1; + capspd.link_speed = data->link_speed; + + ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt)); + if (ret) + return ret; + + return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd)); +} + +static const struct vfio_pci_regops vfio_pci_npu2_regops = { + .rw = vfio_pci_npu2_rw, + .mmap = vfio_pci_npu2_mmap, + .release = vfio_pci_npu2_release, + .add_capability = vfio_pci_npu2_add_capability, +}; + +int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) +{ + int ret; + struct vfio_pci_npu2_data *data; + struct device_node *nvlink_dn; + u32 nvlink_index = 0; + struct pci_dev *npdev = vdev->pdev; + struct device_node *npu_node = pci_device_to_OF_node(npdev); + struct pci_controller *hose = pci_bus_to_host(npdev->bus); + u64 mmio_atsd = 0; + u64 tgt = 0; + u32 link_speed = 0xff; + + /* + * PCI config space does not tell us about NVLink presense but + * platform does, use this. + */ + if (!pnv_pci_get_gpu_dev(vdev->pdev)) + return -ENODEV; + + /* + * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links + * so we can allocate one register per link, using nvlink index as + * a key. + * There is always at least one ATSD register so as long as at least + * NVLink bridge #0 is passed to the guest, ATSD will be available. + */ + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", + &nvlink_index))) + return -ENODEV; + + if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index, + &mmio_atsd)) { + dev_warn(&vdev->pdev->dev, "No available ATSD found\n"); + mmio_atsd = 0; + } + + if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { + dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); + return -EFAULT; + } + + if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) { + dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n"); + return -EFAULT; + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->mmio_atsd = mmio_atsd; + data->gpu_tgt = tgt; + data->link_speed = link_speed; + if (data->mmio_atsd) { + data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT); + if (!data->base) { + ret = -ENOMEM; + goto free_exit; + } + } + + /* + * We want to expose the capability even if this specific NVLink + * did not get its own ATSD register because capabilities + * belong to VFIO regions and normally there will be ATSD register + * assigned to the NVLink bridge. + */ + ret = vfio_pci_register_dev_region(vdev, + PCI_VENDOR_ID_IBM | + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, + &vfio_pci_npu2_regops, + data->mmio_atsd ? PAGE_SIZE : 0, + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP, + data); + if (ret) + goto free_exit; + + return 0; + +free_exit: + kfree(data); + + return ret; +} diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index cde3b5d3441a..127071b84dd7 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -59,6 +59,12 @@ struct vfio_pci_regops { size_t count, loff_t *ppos, bool iswrite); void (*release)(struct vfio_pci_device *vdev, struct vfio_pci_region *region); + int (*mmap)(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma); + int (*add_capability)(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, + struct vfio_info_cap *caps); }; struct vfio_pci_region { @@ -157,4 +163,18 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) return -ENODEV; } #endif +#ifdef CONFIG_VFIO_PCI_NVLINK2 +extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev); +extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev); +#else +static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) +{ + return -ENODEV; +} + +static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) +{ + return -ENODEV; +} +#endif #endif /* VFIO_PCI_PRIVATE_H */ diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index b30926e11d87..c424913324e3 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -152,11 +152,12 @@ static long tce_iommu_unregister_pages(struct tce_container *container, struct mm_iommu_table_group_mem_t *mem; struct tce_iommu_prereg *tcemem; bool found = false; + long ret; if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) return -EINVAL; - mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT); + mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT); if (!mem) return -ENOENT; @@ -168,9 +169,13 @@ static long tce_iommu_unregister_pages(struct tce_container *container, } if (!found) - return -ENOENT; + ret = -ENOENT; + else + ret = tce_iommu_prereg_free(container, tcemem); - return tce_iommu_prereg_free(container, tcemem); + mm_iommu_put(container->mm, mem); + + return ret; } static long tce_iommu_register_pages(struct tce_container *container, @@ -185,22 +190,24 @@ static long tce_iommu_register_pages(struct tce_container *container, ((vaddr + size) < vaddr)) return -EINVAL; - mem = mm_iommu_find(container->mm, vaddr, entries); + mem = mm_iommu_get(container->mm, vaddr, entries); if (mem) { list_for_each_entry(tcemem, &container->prereg_list, next) { - if (tcemem->mem == mem) - return -EBUSY; + if (tcemem->mem == mem) { + ret = -EBUSY; + goto put_exit; + } } + } else { + ret = mm_iommu_new(container->mm, vaddr, entries, &mem); + if (ret) + return ret; } - ret = mm_iommu_get(container->mm, vaddr, entries, &mem); - if (ret) - return ret; - tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL); if (!tcemem) { - mm_iommu_put(container->mm, mem); - return -ENOMEM; + ret = -ENOMEM; + goto put_exit; } tcemem->mem = mem; @@ -209,10 +216,22 @@ static long tce_iommu_register_pages(struct tce_container *container, container->enabled = true; return 0; + +put_exit: + mm_iommu_put(container->mm, mem); + return ret; } -static bool tce_page_is_contained(struct page *page, unsigned page_shift) +static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, + unsigned int page_shift) { + struct page *page; + unsigned long size = 0; + + if (mm_iommu_is_devmem(mm, hpa, page_shift, &size)) + return size == (1UL << page_shift); + + page = pfn_to_page(hpa >> PAGE_SHIFT); /* * Check that the TCE table granularity is not bigger than the size of * a page we just found. Otherwise the hardware can get access to @@ -371,6 +390,7 @@ static void tce_iommu_release(void *iommu_data) { struct tce_container *container = iommu_data; struct tce_iommu_group *tcegrp; + struct tce_iommu_prereg *tcemem, *tmtmp; long i; while (tce_groups_attached(container)) { @@ -393,13 +413,8 @@ static void tce_iommu_release(void *iommu_data) tce_iommu_free_table(container, tbl); } - while (!list_empty(&container->prereg_list)) { - struct tce_iommu_prereg *tcemem; - - tcemem = list_first_entry(&container->prereg_list, - struct tce_iommu_prereg, next); - WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem)); - } + list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next) + WARN_ON(tce_iommu_prereg_free(container, tcemem)); tce_iommu_disable(container); if (container->mm) @@ -492,7 +507,8 @@ static int tce_iommu_clear(struct tce_container *container, direction = DMA_NONE; oldhpa = 0; - ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction); + ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa, + &direction); if (ret) continue; @@ -530,7 +546,6 @@ static long tce_iommu_build(struct tce_container *container, enum dma_data_direction direction) { long i, ret = 0; - struct page *page; unsigned long hpa; enum dma_data_direction dirtmp; @@ -541,15 +556,16 @@ static long tce_iommu_build(struct tce_container *container, if (ret) break; - page = pfn_to_page(hpa >> PAGE_SHIFT); - if (!tce_page_is_contained(page, tbl->it_page_shift)) { + if (!tce_page_is_contained(container->mm, hpa, + tbl->it_page_shift)) { ret = -EPERM; break; } hpa |= offset; dirtmp = direction; - ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); + ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, + &dirtmp); if (ret) { tce_iommu_unuse_page(container, hpa); pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", @@ -576,7 +592,6 @@ static long tce_iommu_build_v2(struct tce_container *container, enum dma_data_direction direction) { long i, ret = 0; - struct page *page; unsigned long hpa; enum dma_data_direction dirtmp; @@ -589,8 +604,8 @@ static long tce_iommu_build_v2(struct tce_container *container, if (ret) break; - page = pfn_to_page(hpa >> PAGE_SHIFT); - if (!tce_page_is_contained(page, tbl->it_page_shift)) { + if (!tce_page_is_contained(container->mm, hpa, + tbl->it_page_shift)) { ret = -EPERM; break; } @@ -603,7 +618,8 @@ static long tce_iommu_build_v2(struct tce_container *container, if (mm_iommu_mapped_inc(mem)) break; - ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); + ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, + &dirtmp); if (ret) { /* dirtmp cannot be DMA_NONE here */ tce_iommu_unuse_page_v2(container, tbl, entry + i); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 813102810f53..02bb7ad6e986 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -354,6 +354,21 @@ struct vfio_region_gfx_edid { }; /* + * 10de vendor sub-type + * + * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space. + */ +#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1) + +/* + * 1014 vendor sub-type + * + * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU + * to do TLB invalidation on a GPU. + */ +#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) + +/* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped * which allows direct access to non-MSIX registers which happened to be within * the same system page. @@ -363,6 +378,33 @@ struct vfio_region_gfx_edid { */ #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3 +/* + * Capability with compressed real address (aka SSA - small system address) + * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing + * and by the userspace to associate a NVLink bridge with a GPU. + */ +#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT 4 + +struct vfio_region_info_cap_nvlink2_ssatgt { + struct vfio_info_cap_header header; + __u64 tgt; +}; + +/* + * Capability with an NVLink link speed. The value is read by + * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed" + * property in the device tree. The value is fixed in the hardware + * and failing to provide the correct value results in the link + * not working with no indication from the driver why. + */ +#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD 5 + +struct vfio_region_info_cap_nvlink2_lnkspd { + struct vfio_info_cap_header header; + __u32 link_speed; + __u32 __pad; +}; + /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, * struct vfio_irq_info) diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index c3ee8393dae8..208452a93e2c 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -11,6 +11,7 @@ tm-signal-context-chk-fpu tm-signal-context-chk-gpr tm-signal-context-chk-vmx tm-signal-context-chk-vsx +tm-signal-sigreturn-nt tm-vmx-unavail tm-unavailable tm-trap diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 9fc2cf6fbc92..75a685359129 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -4,7 +4,7 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \ tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \ - $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn + $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt top_srcdir = ../../../../.. include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c new file mode 100644 index 000000000000..56fbf9f6bbf3 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2018, Breno Leitao, Gustavo Romero, IBM Corp. + * + * A test case that creates a signal and starts a suspended transaction + * inside the signal handler. + * + * It returns from the signal handler with the CPU at suspended state, but + * without setting usercontext MSR Transaction State (TS) fields. + */ + +#define _GNU_SOURCE +#include <stdlib.h> +#include <signal.h> + +#include "utils.h" + +void trap_signal_handler(int signo, siginfo_t *si, void *uc) +{ + ucontext_t *ucp = (ucontext_t *) uc; + + asm("tbegin.; tsuspend.;"); + + /* Skip 'trap' instruction if it succeed */ + ucp->uc_mcontext.regs->nip += 4; +} + +int tm_signal_sigreturn_nt(void) +{ + struct sigaction trap_sa; + + trap_sa.sa_flags = SA_SIGINFO; + trap_sa.sa_sigaction = trap_signal_handler; + + sigaction(SIGTRAP, &trap_sa, NULL); + + raise(SIGTRAP); + + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + test_harness(tm_signal_sigreturn_nt, "tm_signal_sigreturn_nt"); +} + |