diff options
376 files changed, 5567 insertions, 3024 deletions
diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index a71b8095dbd8..2f66683500b8 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -68,3 +68,4 @@ stable kernels. | | | | | | Qualcomm Tech. | Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | | Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 | +| Qualcomm Tech. | QDF2400 ITS | E0065 | QCOM_QDF2400_ERRATUM_0065 | diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 3b8449f8ac7e..49d7c997fa1e 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -1142,16 +1142,17 @@ used by the kernel. pids.max - A read-write single value file which exists on non-root cgroups. The - default is "max". + A read-write single value file which exists on non-root + cgroups. The default is "max". - Hard limit of number of processes. + Hard limit of number of processes. pids.current - A read-only single value file which exists on all cgroups. + A read-only single value file which exists on all cgroups. - The number of processes currently in the cgroup and its descendants. + The number of processes currently in the cgroup and its + descendants. Organisational operations are not blocked by cgroup policies, so it is possible to have pids.current > pids.max. This can be done by either diff --git a/Documentation/devicetree/bindings/powerpc/4xx/emac.txt b/Documentation/devicetree/bindings/powerpc/4xx/emac.txt index 712baf6c3e24..44b842b6ca15 100644 --- a/Documentation/devicetree/bindings/powerpc/4xx/emac.txt +++ b/Documentation/devicetree/bindings/powerpc/4xx/emac.txt @@ -71,6 +71,9 @@ For Axon it can be absent, though my current driver doesn't handle phy-address yet so for now, keep 0x00ffffff in it. + - phy-handle : Used to describe configurations where a external PHY + is used. Please refer to: + Documentation/devicetree/bindings/net/ethernet.txt - rx-fifo-size-gige : 1 cell, Rx fifo size in bytes for 1000 Mb/sec operations (if absent the value is the same as rx-fifo-size). For Axon, either absent or 2048. @@ -81,8 +84,22 @@ offload, phandle of the TAH device node. - tah-channel : 1 cell, optional. If appropriate, channel used on the TAH engine. + - fixed-link : Fixed-link subnode describing a link to a non-MDIO + managed entity. See + Documentation/devicetree/bindings/net/fixed-link.txt + for details. + - mdio subnode : When the EMAC has a phy connected to its local + mdio, which us supported by the kernel's network + PHY library in drivers/net/phy, there must be device + tree subnode with the following required properties: + - #address-cells: Must be <1>. + - #size-cells: Must be <0>. - Example: + For PHY definitions: Please refer to + Documentation/devicetree/bindings/net/phy.txt and + Documentation/devicetree/bindings/net/ethernet.txt + + Examples: EMAC0: ethernet@40000800 { device_type = "network"; @@ -104,6 +121,48 @@ zmii-channel = <0>; }; + EMAC1: ethernet@ef600c00 { + device_type = "network"; + compatible = "ibm,emac-apm821xx", "ibm,emac4sync"; + interrupt-parent = <&EMAC1>; + interrupts = <0 1>; + #interrupt-cells = <1>; + #address-cells = <0>; + #size-cells = <0>; + interrupt-map = <0 &UIC2 0x10 IRQ_TYPE_LEVEL_HIGH /* Status */ + 1 &UIC2 0x14 IRQ_TYPE_LEVEL_HIGH /* Wake */>; + reg = <0xef600c00 0x000000c4>; + local-mac-address = [000000000000]; /* Filled in by U-Boot */ + mal-device = <&MAL0>; + mal-tx-channel = <0>; + mal-rx-channel = <0>; + cell-index = <0>; + max-frame-size = <9000>; + rx-fifo-size = <16384>; + tx-fifo-size = <2048>; + fifo-entry-size = <10>; + phy-mode = "rgmii"; + phy-handle = <&phy0>; + phy-map = <0x00000000>; + rgmii-device = <&RGMII0>; + rgmii-channel = <0>; + tah-device = <&TAH0>; + tah-channel = <0>; + has-inverted-stacr-oc; + has-new-stacr-staopc; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + phy0: ethernet-phy@0 { + compatible = "ethernet-phy-ieee802.3-c22"; + reg = <0>; + }; + }; + }; + + ii) McMAL node Required properties: @@ -145,4 +204,3 @@ - revision : as provided by the RGMII new version register if available. For Axon: 0x0000012a - diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index fc73eeb7b3b8..ab0230461377 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1006,7 +1006,8 @@ accept_redirects - BOOLEAN FALSE (router) forwarding - BOOLEAN - Enable IP forwarding on this interface. + Enable IP forwarding on this interface. This controls whether packets + received _on_ this interface can be forwarded. mc_forwarding - BOOLEAN Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 069450938b79..3c248f772ae6 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -951,6 +951,10 @@ This ioctl allows the user to create or modify a guest physical memory slot. When changing an existing slot, it may be moved in the guest physical memory space, or its flags may be modified. It may not be resized. Slots may not overlap in guest physical address space. +Bits 0-15 of "slot" specifies the slot id and this value should be +less than the maximum number of user memory slots supported per VM. +The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS, +if this capability is supported by the architecture. If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot" specifies the address space which is being modified. They must be diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 5724092db811..ee3f9c30957c 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -19,7 +19,7 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space ... unused hole ... ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space +ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable) ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole @@ -39,6 +39,9 @@ memory window (this size is arbitrary, it can be raised later if needed). The mappings are not part of any other kernel PGD and are only available during EFI runtime calls. +The module mapping space size changes based on the CONFIG requirements for the +following fixmap section. + Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all physical memory, vmalloc/ioremap space and virtual memory map are randomized. Their order is preserved but their base will be offset early at boot time. @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Fearless Coyote # *DOCUMENTATION* diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..c4d6833aacd9 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -700,6 +700,13 @@ config ARCH_MMAP_RND_COMPAT_BITS This value can be changed after boot using the /proc/sys/vm/mmap_rnd_compat_bits tunable +config HAVE_ARCH_COMPAT_MMAP_BASES + bool + help + This allows 64bit applications to invoke 32-bit mmap() syscall + and vice-versa 32-bit applications to call 64-bit mmap(). + Required for applications doing different bitness syscalls. + config HAVE_COPY_THREAD_TLS bool help diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0d4e71b42c77..454fadd077ad 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1666,7 +1666,7 @@ config ARCH_SELECT_MEMORY_MODEL config HAVE_ARCH_PFN_VALID def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP def_bool y depends on ARM_LPAE diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index e22089fb44dc..a3f0b3d50089 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -209,6 +209,7 @@ #define HSR_EC_IABT_HYP (0x21) #define HSR_EC_DABT (0x24) #define HSR_EC_DABT_HYP (0x25) +#define HSR_EC_MAX (0x3f) #define HSR_WFI_IS_WFE (_AC(1, UL) << 0) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index cc495d799c67..31ee468ce667 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -30,7 +30,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_USER_MEM_SLOTS 32 -#define KVM_PRIVATE_MEM_SLOTS 4 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HAVE_ONE_REG #define KVM_HALT_POLL_NS_DEFAULT 500000 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index c9a2103faeb9..96dba7cd8be7 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -221,6 +221,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_USER_MEM_SLOTS; + break; case KVM_CAP_MSI_DEVID: if (!kvm) r = -EINVAL; diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 4e40d1955e35..96af65a30d78 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -79,7 +79,19 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } +static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + u32 hsr = kvm_vcpu_get_hsr(vcpu); + + kvm_pr_unimpl("Unknown exception class: hsr: %#08x\n", + hsr); + + kvm_inject_undefined(vcpu); + return 1; +} + static exit_handle_fn arm_exit_handlers[] = { + [0 ... HSR_EC_MAX] = kvm_handle_unknown_ec, [HSR_EC_WFI] = kvm_handle_wfx, [HSR_EC_CP15_32] = kvm_handle_cp15_32, [HSR_EC_CP15_64] = kvm_handle_cp15_64, @@ -98,13 +110,6 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) { u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu); - if (hsr_ec >= ARRAY_SIZE(arm_exit_handlers) || - !arm_exit_handlers[hsr_ec]) { - kvm_err("Unknown exception class: hsr: %#08x\n", - (unsigned int)kvm_vcpu_get_hsr(vcpu)); - BUG(); - } - return arm_exit_handlers[hsr_ec]; } diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a39029b5414e..af62bf79721a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY config ZONE_DMA def_bool y -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP def_bool y config ARCH_DMA_ADDR_T_64BIT @@ -508,6 +508,16 @@ config QCOM_FALKOR_ERRATUM_1009 If unsure, say Y. +config QCOM_QDF2400_ERRATUM_0065 + bool "QDF2400 E0065: Incorrect GITS_TYPER.ITT_Entry_size" + default y + help + On Qualcomm Datacenter Technologies QDF2400 SoC, ITS hardware reports + ITE size incorrectly. The GITS_TYPER.ITT_Entry_size field should have + been indicated as 16Bytes (0xf), not 8Bytes (0x7). + + If unsure, say Y. + endmenu @@ -1063,6 +1073,10 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC +config KEYS_COMPAT + def_bool y + depends on COMPAT && KEYS + endmenu menu "Power management options" diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 05310ad8c5ab..f31c48d0cd68 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -251,7 +251,7 @@ static inline bool system_supports_fpsimd(void) static inline bool system_uses_ttbr0_pan(void) { return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && - !cpus_have_cap(ARM64_HAS_PAN); + !cpus_have_const_cap(ARM64_HAS_PAN); } #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f21fd3894370..e7705e7bb07b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -30,8 +30,7 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED -#define KVM_USER_MEM_SLOTS 32 -#define KVM_PRIVATE_MEM_SLOTS 4 +#define KVM_USER_MEM_SLOTS 512 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HALT_POLL_NS_DEFAULT 500000 diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index 75a0f8acef66..fd691087dc9a 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -30,7 +30,7 @@ int arm_cpuidle_init(unsigned int cpu) } /** - * cpu_suspend() - function to enter a low-power idle state + * arm_cpuidle_suspend() - function to enter a low-power idle state * @arg: argument to pass to CPU suspend operations * * Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 2a07aae5b8a2..c5c45942fb6e 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -372,12 +372,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) return 0; } -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return NOTIFY_DONE; -} - static void __kprobes kprobe_handler(struct pt_regs *regs) { struct kprobe *p, *cur_kprobe; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 1bfe30dfbfe7..fa1b18e364fc 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -135,7 +135,19 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) return ret; } +static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + u32 hsr = kvm_vcpu_get_hsr(vcpu); + + kvm_pr_unimpl("Unknown exception class: hsr: %#08x -- %s\n", + hsr, esr_get_class_string(hsr)); + + kvm_inject_undefined(vcpu); + return 1; +} + static exit_handle_fn arm_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = kvm_handle_unknown_ec, [ESR_ELx_EC_WFx] = kvm_handle_wfx, [ESR_ELx_EC_CP15_32] = kvm_handle_cp15_32, [ESR_ELx_EC_CP15_64] = kvm_handle_cp15_64, @@ -162,13 +174,6 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) u32 hsr = kvm_vcpu_get_hsr(vcpu); u8 hsr_ec = ESR_ELx_EC(hsr); - if (hsr_ec >= ARRAY_SIZE(arm_exit_handlers) || - !arm_exit_handlers[hsr_ec]) { - kvm_err("Unknown exception class: hsr: %#08x -- %s\n", - hsr, esr_get_class_string(hsr)); - BUG(); - } - return arm_exit_handlers[hsr_ec]; } diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index e8e7ba2bc11f..9e1d2b75eecd 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -18,14 +18,62 @@ #include <asm/kvm_hyp.h> #include <asm/tlbflush.h> +static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm) +{ + u64 val; + + /* + * With VHE enabled, we have HCR_EL2.{E2H,TGE} = {1,1}, and + * most TLB operations target EL2/EL0. In order to affect the + * guest TLBs (EL1/EL0), we need to change one of these two + * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so + * let's flip TGE before executing the TLB operation. + */ + write_sysreg(kvm->arch.vttbr, vttbr_el2); + val = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + write_sysreg(val, hcr_el2); + isb(); +} + +static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm) +{ + write_sysreg(kvm->arch.vttbr, vttbr_el2); + isb(); +} + +static hyp_alternate_select(__tlb_switch_to_guest, + __tlb_switch_to_guest_nvhe, + __tlb_switch_to_guest_vhe, + ARM64_HAS_VIRT_HOST_EXTN); + +static void __hyp_text __tlb_switch_to_host_vhe(struct kvm *kvm) +{ + /* + * We're done with the TLB operation, let's restore the host's + * view of HCR_EL2. + */ + write_sysreg(0, vttbr_el2); + write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); +} + +static void __hyp_text __tlb_switch_to_host_nvhe(struct kvm *kvm) +{ + write_sysreg(0, vttbr_el2); +} + +static hyp_alternate_select(__tlb_switch_to_host, + __tlb_switch_to_host_nvhe, + __tlb_switch_to_host_vhe, + ARM64_HAS_VIRT_HOST_EXTN); + void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { dsb(ishst); /* Switch to requested VMID */ kvm = kern_hyp_va(kvm); - write_sysreg(kvm->arch.vttbr, vttbr_el2); - isb(); + __tlb_switch_to_guest()(kvm); /* * We could do so much better if we had the VA as well. @@ -46,7 +94,7 @@ void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) dsb(ish); isb(); - write_sysreg(0, vttbr_el2); + __tlb_switch_to_host()(kvm); } void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) @@ -55,14 +103,13 @@ void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) /* Switch to requested VMID */ kvm = kern_hyp_va(kvm); - write_sysreg(kvm->arch.vttbr, vttbr_el2); - isb(); + __tlb_switch_to_guest()(kvm); __tlbi(vmalls12e1is); dsb(ish); isb(); - write_sysreg(0, vttbr_el2); + __tlb_switch_to_host()(kvm); } void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) @@ -70,14 +117,13 @@ void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm); /* Switch to requested VMID */ - write_sysreg(kvm->arch.vttbr, vttbr_el2); - isb(); + __tlb_switch_to_guest()(kvm); __tlbi(vmalle1); dsb(nsh); isb(); - write_sysreg(0, vttbr_el2); + __tlb_switch_to_host()(kvm); } void __hyp_text __kvm_flush_vm_context(void) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 55d1e9205543..687a358a3733 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -162,7 +162,7 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); vmemmap_populate(kimg_shadow_start, kimg_shadow_end, - pfn_to_nid(virt_to_pfn(_text))); + pfn_to_nid(virt_to_pfn(lm_alias(_text)))); /* * vmemmap_populate() has populated the shadow region that covers the diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 97a8bc8a095c..3a716b2dcde9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,7 +135,7 @@ config PPC select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS - select HAVE_GENERIC_RCU_GUP + select HAVE_GENERIC_GUP select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/powerpc/boot/zImage.lds.S b/arch/powerpc/boot/zImage.lds.S index 861e72109df2..f080abfc2f83 100644 --- a/arch/powerpc/boot/zImage.lds.S +++ b/arch/powerpc/boot/zImage.lds.S @@ -68,6 +68,7 @@ SECTIONS } #ifdef CONFIG_PPC64_BOOT_WRAPPER + . = ALIGN(256); .got : { __toc_start = .; diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c index 9fa046d56eba..411994551afc 100644 --- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c +++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c @@ -52,7 +52,7 @@ static int crc32c_vpmsum_cra_init(struct crypto_tfm *tfm) { u32 *key = crypto_tfm_ctx(tfm); - *key = 0; + *key = ~0; return 0; } diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 73eb794d6163..bc5fdfd22788 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -51,6 +51,10 @@ #define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) #define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs)) +/* Put a PPC bit into a "normal" bit position */ +#define PPC_BITEXTRACT(bits, ppc_bit, dst_bit) \ + ((((bits) >> PPC_BITLSHIFT(ppc_bit)) & 1) << (dst_bit)) + #include <asm/barrier.h> /* Macro for generating the ***_bits() functions */ diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index f97d8cb6bdf6..ed62efe01e49 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -66,6 +66,55 @@ #define P8_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_SLB_ERRORS | \ P8_DSISR_MC_ERAT_MULTIHIT_SEC) + +/* + * Machine Check bits on power9 + */ +#define P9_SRR1_MC_LOADSTORE(srr1) (((srr1) >> PPC_BITLSHIFT(42)) & 1) + +#define P9_SRR1_MC_IFETCH(srr1) ( \ + PPC_BITEXTRACT(srr1, 45, 0) | \ + PPC_BITEXTRACT(srr1, 44, 1) | \ + PPC_BITEXTRACT(srr1, 43, 2) | \ + PPC_BITEXTRACT(srr1, 36, 3) ) + +/* 0 is reserved */ +#define P9_SRR1_MC_IFETCH_UE 1 +#define P9_SRR1_MC_IFETCH_SLB_PARITY 2 +#define P9_SRR1_MC_IFETCH_SLB_MULTIHIT 3 +#define P9_SRR1_MC_IFETCH_ERAT_MULTIHIT 4 +#define P9_SRR1_MC_IFETCH_TLB_MULTIHIT 5 +#define P9_SRR1_MC_IFETCH_UE_TLB_RELOAD 6 +/* 7 is reserved */ +#define P9_SRR1_MC_IFETCH_LINK_TIMEOUT 8 +#define P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT 9 +/* 10 ? */ +#define P9_SRR1_MC_IFETCH_RA 11 +#define P9_SRR1_MC_IFETCH_RA_TABLEWALK 12 +#define P9_SRR1_MC_IFETCH_RA_ASYNC_STORE 13 +#define P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT 14 +#define P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN 15 + +/* DSISR bits for machine check (On Power9) */ +#define P9_DSISR_MC_UE (PPC_BIT(48)) +#define P9_DSISR_MC_UE_TABLEWALK (PPC_BIT(49)) +#define P9_DSISR_MC_LINK_LOAD_TIMEOUT (PPC_BIT(50)) +#define P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT (PPC_BIT(51)) +#define P9_DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52)) +#define P9_DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53)) +#define P9_DSISR_MC_USER_TLBIE (PPC_BIT(54)) +#define P9_DSISR_MC_SLB_PARITY_MFSLB (PPC_BIT(55)) +#define P9_DSISR_MC_SLB_MULTIHIT_MFSLB (PPC_BIT(56)) +#define P9_DSISR_MC_RA_LOAD (PPC_BIT(57)) +#define P9_DSISR_MC_RA_TABLEWALK (PPC_BIT(58)) +#define P9_DSISR_MC_RA_TABLEWALK_FOREIGN (PPC_BIT(59)) +#define P9_DSISR_MC_RA_FOREIGN (PPC_BIT(60)) + +/* SLB error bits */ +#define P9_DSISR_MC_SLB_ERRORS (P9_DSISR_MC_ERAT_MULTIHIT | \ + P9_DSISR_MC_SLB_PARITY_MFSLB | \ + P9_DSISR_MC_SLB_MULTIHIT_MFSLB) + enum MCE_Version { MCE_V1 = 1, }; @@ -93,6 +142,9 @@ enum MCE_ErrorType { MCE_ERROR_TYPE_SLB = 2, MCE_ERROR_TYPE_ERAT = 3, MCE_ERROR_TYPE_TLB = 4, + MCE_ERROR_TYPE_USER = 5, + MCE_ERROR_TYPE_RA = 6, + MCE_ERROR_TYPE_LINK = 7, }; enum MCE_UeErrorType { @@ -121,6 +173,32 @@ enum MCE_TlbErrorType { MCE_TLB_ERROR_MULTIHIT = 2, }; +enum MCE_UserErrorType { + MCE_USER_ERROR_INDETERMINATE = 0, + MCE_USER_ERROR_TLBIE = 1, +}; + +enum MCE_RaErrorType { + MCE_RA_ERROR_INDETERMINATE = 0, + MCE_RA_ERROR_IFETCH = 1, + MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH = 2, + MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN = 3, + MCE_RA_ERROR_LOAD = 4, + MCE_RA_ERROR_STORE = 5, + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 6, + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN = 7, + MCE_RA_ERROR_LOAD_STORE_FOREIGN = 8, +}; + +enum MCE_LinkErrorType { + MCE_LINK_ERROR_INDETERMINATE = 0, + MCE_LINK_ERROR_IFETCH_TIMEOUT = 1, + MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT = 2, + MCE_LINK_ERROR_LOAD_TIMEOUT = 3, + MCE_LINK_ERROR_STORE_TIMEOUT = 4, + MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT = 5, +}; + struct machine_check_event { enum MCE_Version version:8; /* 0x00 */ uint8_t in_use; /* 0x01 */ @@ -166,6 +244,30 @@ struct machine_check_event { uint64_t effective_address; uint8_t reserved_2[16]; } tlb_error; + + struct { + enum MCE_UserErrorType user_error_type:8; + uint8_t effective_address_provided; + uint8_t reserved_1[6]; + uint64_t effective_address; + uint8_t reserved_2[16]; + } user_error; + + struct { + enum MCE_RaErrorType ra_error_type:8; + uint8_t effective_address_provided; + uint8_t reserved_1[6]; + uint64_t effective_address; + uint8_t reserved_2[16]; + } ra_error; + + struct { + enum MCE_LinkErrorType link_error_type:8; + uint8_t effective_address_provided; + uint8_t reserved_1[6]; + uint64_t effective_address; + uint8_t reserved_2[16]; + } link_error; } u; }; @@ -176,8 +278,12 @@ struct mce_error_info { enum MCE_SlbErrorType slb_error_type:8; enum MCE_EratErrorType erat_error_type:8; enum MCE_TlbErrorType tlb_error_type:8; + enum MCE_UserErrorType user_error_type:8; + enum MCE_RaErrorType ra_error_type:8; + enum MCE_LinkErrorType link_error_type:8; } u; - uint8_t reserved[2]; + enum MCE_Severity severity:8; + enum MCE_Initiator initiator:8; }; #define MAX_MC_EVT 100 diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b9e3f0aca261..ecf9885ab660 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -163,11 +163,5 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* by default, allow everything */ return true; } - -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_MMU_CONTEXT_H */ diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index bb7a1890aeb7..e79b9daa873c 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -77,6 +77,7 @@ extern void __flush_tlb_power8(unsigned int action); extern void __flush_tlb_power9(unsigned int action); extern long __machine_check_early_realmode_p7(struct pt_regs *regs); extern long __machine_check_early_realmode_p8(struct pt_regs *regs); +extern long __machine_check_early_realmode_p9(struct pt_regs *regs); #endif /* CONFIG_PPC64 */ #if defined(CONFIG_E500) extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec); @@ -540,6 +541,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_power9, .cpu_restore = __restore_cpu_power9, .flush_tlb = __flush_tlb_power9, + .machine_check_early = __machine_check_early_realmode_p9, .platform = "power9", }, { /* Power9 */ @@ -559,6 +561,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_power9, .cpu_restore = __restore_cpu_power9, .flush_tlb = __flush_tlb_power9, + .machine_check_early = __machine_check_early_realmode_p9, .platform = "power9", }, { /* Cell Broadband Engine */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index c6923ff45131..a1475e6aef3a 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -58,6 +58,15 @@ static void mce_set_error_info(struct machine_check_event *mce, case MCE_ERROR_TYPE_TLB: mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type; break; + case MCE_ERROR_TYPE_USER: + mce->u.user_error.user_error_type = mce_err->u.user_error_type; + break; + case MCE_ERROR_TYPE_RA: + mce->u.ra_error.ra_error_type = mce_err->u.ra_error_type; + break; + case MCE_ERROR_TYPE_LINK: + mce->u.link_error.link_error_type = mce_err->u.link_error_type; + break; case MCE_ERROR_TYPE_UNKNOWN: default: break; @@ -90,13 +99,14 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->gpr3 = regs->gpr[3]; mce->in_use = 1; - mce->initiator = MCE_INITIATOR_CPU; /* Mark it recovered if we have handled it and MSR(RI=1). */ if (handled && (regs->msr & MSR_RI)) mce->disposition = MCE_DISPOSITION_RECOVERED; else mce->disposition = MCE_DISPOSITION_NOT_RECOVERED; - mce->severity = MCE_SEV_ERROR_SYNC; + + mce->initiator = mce_err->initiator; + mce->severity = mce_err->severity; /* * Populate the mce error_type and type-specific error_type. @@ -115,6 +125,15 @@ void save_mce_event(struct pt_regs *regs, long handled, } else if (mce->error_type == MCE_ERROR_TYPE_ERAT) { mce->u.erat_error.effective_address_provided = true; mce->u.erat_error.effective_address = addr; + } else if (mce->error_type == MCE_ERROR_TYPE_USER) { + mce->u.user_error.effective_address_provided = true; + mce->u.user_error.effective_address = addr; + } else if (mce->error_type == MCE_ERROR_TYPE_RA) { + mce->u.ra_error.effective_address_provided = true; + mce->u.ra_error.effective_address = addr; + } else if (mce->error_type == MCE_ERROR_TYPE_LINK) { + mce->u.link_error.effective_address_provided = true; + mce->u.link_error.effective_address = addr; } else if (mce->error_type == MCE_ERROR_TYPE_UE) { mce->u.ue_error.effective_address_provided = true; mce->u.ue_error.effective_address = addr; @@ -239,6 +258,29 @@ void machine_check_print_event_info(struct machine_check_event *evt) "Parity", "Multihit", }; + static const char *mc_user_types[] = { + "Indeterminate", + "tlbie(l) invalid", + }; + static const char *mc_ra_types[] = { + "Indeterminate", + "Instruction fetch (bad)", + "Page table walk ifetch (bad)", + "Page table walk ifetch (foreign)", + "Load (bad)", + "Store (bad)", + "Page table walk Load/Store (bad)", + "Page table walk Load/Store (foreign)", + "Load/Store (foreign)", + }; + static const char *mc_link_types[] = { + "Indeterminate", + "Instruction fetch (timeout)", + "Page table walk ifetch (timeout)", + "Load (timeout)", + "Store (timeout)", + "Page table walk Load/Store (timeout)", + }; /* Print things out */ if (evt->version != MCE_V1) { @@ -315,6 +357,36 @@ void machine_check_print_event_info(struct machine_check_event *evt) printk("%s Effective address: %016llx\n", level, evt->u.tlb_error.effective_address); break; + case MCE_ERROR_TYPE_USER: + subtype = evt->u.user_error.user_error_type < + ARRAY_SIZE(mc_user_types) ? + mc_user_types[evt->u.user_error.user_error_type] + : "Unknown"; + printk("%s Error type: User [%s]\n", level, subtype); + if (evt->u.user_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.user_error.effective_address); + break; + case MCE_ERROR_TYPE_RA: + subtype = evt->u.ra_error.ra_error_type < + ARRAY_SIZE(mc_ra_types) ? + mc_ra_types[evt->u.ra_error.ra_error_type] + : "Unknown"; + printk("%s Error type: Real address [%s]\n", level, subtype); + if (evt->u.ra_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.ra_error.effective_address); + break; + case MCE_ERROR_TYPE_LINK: + subtype = evt->u.link_error.link_error_type < + ARRAY_SIZE(mc_link_types) ? + mc_link_types[evt->u.link_error.link_error_type] + : "Unknown"; + printk("%s Error type: Link [%s]\n", level, subtype); + if (evt->u.link_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.link_error.effective_address); + break; default: case MCE_ERROR_TYPE_UNKNOWN: printk("%s Error type: Unknown\n", level); @@ -341,6 +413,18 @@ uint64_t get_mce_fault_addr(struct machine_check_event *evt) if (evt->u.tlb_error.effective_address_provided) return evt->u.tlb_error.effective_address; break; + case MCE_ERROR_TYPE_USER: + if (evt->u.user_error.effective_address_provided) + return evt->u.user_error.effective_address; + break; + case MCE_ERROR_TYPE_RA: + if (evt->u.ra_error.effective_address_provided) + return evt->u.ra_error.effective_address; + break; + case MCE_ERROR_TYPE_LINK: + if (evt->u.link_error.effective_address_provided) + return evt->u.link_error.effective_address; + break; default: case MCE_ERROR_TYPE_UNKNOWN: break; diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 7353991c4ece..763d6f58caa8 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -116,6 +116,51 @@ static void flush_and_reload_slb(void) } #endif +static void flush_erat(void) +{ + asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); +} + +#define MCE_FLUSH_SLB 1 +#define MCE_FLUSH_TLB 2 +#define MCE_FLUSH_ERAT 3 + +static int mce_flush(int what) +{ +#ifdef CONFIG_PPC_STD_MMU_64 + if (what == MCE_FLUSH_SLB) { + flush_and_reload_slb(); + return 1; + } +#endif + if (what == MCE_FLUSH_ERAT) { + flush_erat(); + return 1; + } + if (what == MCE_FLUSH_TLB) { + if (cur_cpu_spec && cur_cpu_spec->flush_tlb) { + cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL); + return 1; + } + } + + return 0; +} + +static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb, uint64_t erat) +{ + if ((dsisr & slb) && mce_flush(MCE_FLUSH_SLB)) + dsisr &= ~slb; + if ((dsisr & erat) && mce_flush(MCE_FLUSH_ERAT)) + dsisr &= ~erat; + if ((dsisr & tlb) && mce_flush(MCE_FLUSH_TLB)) + dsisr &= ~tlb; + /* Any other errors we don't understand? */ + if (dsisr) + return 0; + return 1; +} + static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) { long handled = 1; @@ -281,6 +326,9 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs) long handled = 1; struct mce_error_info mce_error_info = { 0 }; + mce_error_info.severity = MCE_SEV_ERROR_SYNC; + mce_error_info.initiator = MCE_INITIATOR_CPU; + srr1 = regs->msr; nip = regs->nip; @@ -352,6 +400,9 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs) long handled = 1; struct mce_error_info mce_error_info = { 0 }; + mce_error_info.severity = MCE_SEV_ERROR_SYNC; + mce_error_info.initiator = MCE_INITIATOR_CPU; + srr1 = regs->msr; nip = regs->nip; @@ -372,3 +423,189 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs) save_mce_event(regs, handled, &mce_error_info, nip, addr); return handled; } + +static int mce_handle_derror_p9(struct pt_regs *regs) +{ + uint64_t dsisr = regs->dsisr; + + return mce_handle_flush_derrors(dsisr, + P9_DSISR_MC_SLB_PARITY_MFSLB | + P9_DSISR_MC_SLB_MULTIHIT_MFSLB, + + P9_DSISR_MC_TLB_MULTIHIT_MFTLB, + + P9_DSISR_MC_ERAT_MULTIHIT); +} + +static int mce_handle_ierror_p9(struct pt_regs *regs) +{ + uint64_t srr1 = regs->msr; + + switch (P9_SRR1_MC_IFETCH(srr1)) { + case P9_SRR1_MC_IFETCH_SLB_PARITY: + case P9_SRR1_MC_IFETCH_SLB_MULTIHIT: + return mce_flush(MCE_FLUSH_SLB); + case P9_SRR1_MC_IFETCH_TLB_MULTIHIT: + return mce_flush(MCE_FLUSH_TLB); + case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT: + return mce_flush(MCE_FLUSH_ERAT); + default: + return 0; + } +} + +static void mce_get_derror_p9(struct pt_regs *regs, + struct mce_error_info *mce_err, uint64_t *addr) +{ + uint64_t dsisr = regs->dsisr; + + mce_err->severity = MCE_SEV_ERROR_SYNC; + mce_err->initiator = MCE_INITIATOR_CPU; + + if (dsisr & P9_DSISR_MC_USER_TLBIE) + *addr = regs->nip; + else + *addr = regs->dar; + + if (dsisr & P9_DSISR_MC_UE) { + mce_err->error_type = MCE_ERROR_TYPE_UE; + mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; + } else if (dsisr & P9_DSISR_MC_UE_TABLEWALK) { + mce_err->error_type = MCE_ERROR_TYPE_UE; + mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; + } else if (dsisr & P9_DSISR_MC_LINK_LOAD_TIMEOUT) { + mce_err->error_type = MCE_ERROR_TYPE_LINK; + mce_err->u.link_error_type = MCE_LINK_ERROR_LOAD_TIMEOUT; + } else if (dsisr & P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT) { + mce_err->error_type = MCE_ERROR_TYPE_LINK; + mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT; + } else if (dsisr & P9_DSISR_MC_ERAT_MULTIHIT) { + mce_err->error_type = MCE_ERROR_TYPE_ERAT; + mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; + } else if (dsisr & P9_DSISR_MC_TLB_MULTIHIT_MFTLB) { + mce_err->error_type = MCE_ERROR_TYPE_TLB; + mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; + } else if (dsisr & P9_DSISR_MC_USER_TLBIE) { + mce_err->error_type = MCE_ERROR_TYPE_USER; + mce_err->u.user_error_type = MCE_USER_ERROR_TLBIE; + } else if (dsisr & P9_DSISR_MC_SLB_PARITY_MFSLB) { + mce_err->error_type = MCE_ERROR_TYPE_SLB; + mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; + } else if (dsisr & P9_DSISR_MC_SLB_MULTIHIT_MFSLB) { + mce_err->error_type = MCE_ERROR_TYPE_SLB; + mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; + } else if (dsisr & P9_DSISR_MC_RA_LOAD) { + mce_err->error_type = MCE_ERROR_TYPE_RA; + mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD; + } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK) { + mce_err->error_type = MCE_ERROR_TYPE_RA; + mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE; + } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK_FOREIGN) { + mce_err->error_type = MCE_ERROR_TYPE_RA; + mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; + } else if (dsisr & P9_DSISR_MC_RA_FOREIGN) { + mce_err->error_type = MCE_ERROR_TYPE_RA; + mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD_STORE_FOREIGN; + } +} + +static void mce_get_ierror_p9(struct pt_regs *regs, + struct mce_error_info *mce_err, uint64_t *addr) +{ + uint64_t srr1 = regs->msr; + + switch (P9_SRR1_MC_IFETCH(srr1)) { + case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE: + case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT: + mce_err->severity = MCE_SEV_FATAL; + break; + default: + mce_err->severity = MCE_SEV_ERROR_SYNC; + break; + } + + mce_err->initiator = MCE_INITIATOR_CPU; + + *addr = regs->nip; + + switch (P9_SRR1_MC_IFETCH(srr1)) { + case P9_SRR1_MC_IFETCH_UE: + mce_err->error_type = MCE_ERROR_TYPE_UE; + mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH; + break; + case P9_SRR1_MC_IFETCH_SLB_PARITY: + mce_err->error_type = MCE_ERROR_TYPE_SLB; + mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; + break; + case P9_SRR1_MC_IFETCH_SLB_MULTIHIT: + mce_err->error_type = MCE_ERROR_TYPE_SLB; + mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; + break; + case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT: + mce_err->error_type = MCE_ERROR_TYPE_ERAT; + mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; + break; + case P9_SRR1_MC_IFETCH_TLB_MULTIHIT: + mce_err->error_type = MCE_ERROR_TYPE_TLB; + mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; + break; + case P9_SRR1_MC_IFETCH_UE_TLB_RELOAD: + mce_err->error_type = MCE_ERROR_TYPE_UE; + mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; + break; + case P9_SRR1_MC_IFETCH_LINK_TIMEOUT: + mce_err->error_type = MCE_ERROR_TYPE_LINK; + mce_err->u.link_error_type = MCE_LINK_ERROR_IFETCH_TIMEOUT; + break; + case P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT: + mce_err->error_type = MCE_ERROR_TYPE_LINK; + mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT; + break; + case P9_SRR1_MC_IFETCH_RA: + mce_err->error_type = MCE_ERROR_TYPE_RA; + mce_err->u.ra_error_type = MCE_RA_ERROR_IFETCH; + break; + case P9_SRR1_MC_IFETCH_RA_TABLEWALK: + mce_err->error_type = MCE_ERROR_TYPE_RA; + mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH; + break; + case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE: + mce_err->error_type = MCE_ERROR_TYPE_RA; + mce_err->u.ra_error_type = MCE_RA_ERROR_STORE; + break; + case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT: + mce_err->error_type = MCE_ERROR_TYPE_LINK; + mce_err->u.link_error_type = MCE_LINK_ERROR_STORE_TIMEOUT; + break; + case P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN: + mce_err->error_type = MCE_ERROR_TYPE_RA; + mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN; + break; + default: + break; + } +} + +long __machine_check_early_realmode_p9(struct pt_regs *regs) +{ + uint64_t nip, addr; + long handled; + struct mce_error_info mce_error_info = { 0 }; + + nip = regs->nip; + + if (P9_SRR1_MC_LOADSTORE(regs->msr)) { + handled = mce_handle_derror_p9(regs); + mce_get_derror_p9(regs, &mce_error_info, &addr); + } else { + handled = mce_handle_ierror_p9(regs); + mce_get_ierror_p9(regs, &mce_error_info, &addr); + } + + /* Handle UE error. */ + if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) + handled = mce_handle_ue_error(regs); + + save_mce_event(regs, handled, &mce_error_info, nip, addr); + return handled; +} diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 595dd718ea87..2ff13249f87a 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -188,6 +188,8 @@ static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) sdsync = POWER7P_MMCRA_SDAR_VALID; else if (ppmu->flags & PPMU_ALT_SIPR) sdsync = POWER6_MMCRA_SDSYNC; + else if (ppmu->flags & PPMU_NO_SIAR) + sdsync = MMCRA_SAMPLE_ENABLE; else sdsync = MMCRA_SDSYNC; diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index e79fb5fb817d..cd951fd231c4 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -65,12 +65,41 @@ static bool is_event_valid(u64 event) return !(event & ~valid_mask); } -static u64 mmcra_sdar_mode(u64 event) +static inline bool is_event_marked(u64 event) { - if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1)) - return p9_SDAR_MODE(event) << MMCRA_SDAR_MODE_SHIFT; + if (event & EVENT_IS_MARKED) + return true; + + return false; +} - return MMCRA_SDAR_MODE_TLB; +static void mmcra_sdar_mode(u64 event, unsigned long *mmcra) +{ + /* + * MMCRA[SDAR_MODE] specifices how the SDAR should be updated in + * continous sampling mode. + * + * Incase of Power8: + * MMCRA[SDAR_MODE] will be programmed as "0b01" for continous sampling + * mode and will be un-changed when setting MMCRA[63] (Marked events). + * + * Incase of Power9: + * Marked event: MMCRA[SDAR_MODE] will be set to 0b00 ('No Updates'), + * or if group already have any marked events. + * Non-Marked events (for DD1): + * MMCRA[SDAR_MODE] will be set to 0b01 + * For rest + * MMCRA[SDAR_MODE] will be set from event code. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (is_event_marked(event) || (*mmcra & MMCRA_SAMPLE_ENABLE)) + *mmcra &= MMCRA_SDAR_MODE_NO_UPDATES; + else if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) + *mmcra |= p9_SDAR_MODE(event) << MMCRA_SDAR_MODE_SHIFT; + else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + *mmcra |= MMCRA_SDAR_MODE_TLB; + } else + *mmcra |= MMCRA_SDAR_MODE_TLB; } static u64 thresh_cmp_val(u64 value) @@ -180,7 +209,7 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp) value |= CNST_L1_QUAL_VAL(cache); } - if (event & EVENT_IS_MARKED) { + if (is_event_marked(event)) { mask |= CNST_SAMPLE_MASK; value |= CNST_SAMPLE_VAL(event >> EVENT_SAMPLE_SHIFT); } @@ -276,7 +305,7 @@ int isa207_compute_mmcr(u64 event[], int n_ev, } /* In continuous sampling mode, update SDAR on TLB miss */ - mmcra |= mmcra_sdar_mode(event[i]); + mmcra_sdar_mode(event[i], &mmcra); if (event[i] & EVENT_IS_L1) { cache = event[i] >> EVENT_CACHE_SEL_SHIFT; @@ -285,7 +314,7 @@ int isa207_compute_mmcr(u64 event[], int n_ev, mmcr1 |= (cache & 1) << MMCR1_DC_QUAL_SHIFT; } - if (event[i] & EVENT_IS_MARKED) { + if (is_event_marked(event[i])) { mmcra |= MMCRA_SAMPLE_ENABLE; val = (event[i] >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK; diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h index cf9bd8990159..899210f14ee4 100644 --- a/arch/powerpc/perf/isa207-common.h +++ b/arch/powerpc/perf/isa207-common.h @@ -246,6 +246,7 @@ #define MMCRA_THR_CMP_SHIFT 32 #define MMCRA_SDAR_MODE_SHIFT 42 #define MMCRA_SDAR_MODE_TLB (1ull << MMCRA_SDAR_MODE_SHIFT) +#define MMCRA_SDAR_MODE_NO_UPDATES ~(0x3ull << MMCRA_SDAR_MODE_SHIFT) #define MMCRA_IFM_SHIFT 30 /* MMCR1 Threshold Compare bit constant for power9 */ diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 86d9fde93c17..e0f856bfbfe8 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -395,7 +395,6 @@ static int opal_recover_mce(struct pt_regs *regs, struct machine_check_event *evt) { int recovered = 0; - uint64_t ea = get_mce_fault_addr(evt); if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ @@ -404,26 +403,18 @@ static int opal_recover_mce(struct pt_regs *regs, } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { /* Platform corrected itself */ recovered = 1; - } else if (ea && !is_kernel_addr(ea)) { + } else if (evt->severity == MCE_SEV_FATAL) { + /* Fatal machine check */ + pr_err("Machine check interrupt is fatal\n"); + recovered = 0; + } else if ((evt->severity == MCE_SEV_ERROR_SYNC) && + (user_mode(regs) && !is_global_init(current))) { /* - * Faulting address is not in kernel text. We should be fine. - * We need to find which process uses this address. * For now, kill the task if we have received exception when * in userspace. * * TODO: Queue up this address for hwpoisioning later. */ - if (user_mode(regs) && !is_global_init(current)) { - _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); - recovered = 1; - } else - recovered = 0; - } else if (user_mode(regs) && !is_global_init(current) && - evt->severity == MCE_SEV_ERROR_SYNC) { - /* - * If we have received a synchronous error when in userspace - * kill the task. - */ _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); recovered = 1; } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 6901a06da2f9..e36738291c32 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1775,17 +1775,20 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev) } static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, - struct pci_bus *bus) + struct pci_bus *bus, + bool add_to_group) { struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); set_dma_offset(&dev->dev, pe->tce_bypass_base); - iommu_add_device(&dev->dev); + if (add_to_group) + iommu_add_device(&dev->dev); if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_bus_dma(pe, dev->subordinate); + pnv_ioda_setup_bus_dma(pe, dev->subordinate, + add_to_group); } } @@ -2191,7 +2194,7 @@ found: set_iommu_table_base(&pe->pdev->dev, tbl); iommu_add_device(&pe->pdev->dev); } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus); + pnv_ioda_setup_bus_dma(pe, pe->pbus, true); return; fail: @@ -2426,6 +2429,8 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) pnv_pci_ioda2_set_bypass(pe, false); pnv_pci_ioda2_unset_window(&pe->table_group, 0); + if (pe->pbus) + pnv_ioda_setup_bus_dma(pe, pe->pbus, false); pnv_ioda2_table_free(tbl); } @@ -2435,6 +2440,8 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) table_group); pnv_pci_ioda2_setup_default_config(pe); + if (pe->pbus) + pnv_ioda_setup_bus_dma(pe, pe->pbus, false); } static struct iommu_table_group_ops pnv_pci_ioda2_ops = { @@ -2624,6 +2631,9 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, level_shift = entries_shift + 3; level_shift = max_t(unsigned, level_shift, PAGE_SHIFT); + if ((level_shift - 3) * levels + page_shift >= 60) + return -EINVAL; + /* Allocate TCE table */ addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, levels, tce_table_size, &offset, &total_allocated); @@ -2728,7 +2738,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (pe->flags & PNV_IODA_PE_DEV) iommu_add_device(&pe->pdev->dev); else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus); + pnv_ioda_setup_bus_dma(pe, pe->pbus, true); } #ifdef CONFIG_PCI_MSI diff --git a/arch/powerpc/purgatory/trampoline.S b/arch/powerpc/purgatory/trampoline.S index f9760ccf4032..3696ea6c4826 100644 --- a/arch/powerpc/purgatory/trampoline.S +++ b/arch/powerpc/purgatory/trampoline.S @@ -116,13 +116,13 @@ dt_offset: .data .balign 8 -.globl sha256_digest -sha256_digest: +.globl purgatory_sha256_digest +purgatory_sha256_digest: .skip 32 - .size sha256_digest, . - sha256_digest + .size purgatory_sha256_digest, . - purgatory_sha256_digest .balign 8 -.globl sha_regions -sha_regions: +.globl purgatory_sha_regions +purgatory_sha_regions: .skip 8 * 2 * 16 - .size sha_regions, . - sha_regions + .size purgatory_sha_regions, . - purgatory_sha_regions diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index d69ea495c4d7..716b17238599 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -474,8 +474,11 @@ static int ctr_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier, ret = blkcipher_walk_done(desc, walk, nbytes - n); } if (k < n) { - if (__ctr_paes_set_key(ctx) != 0) + if (__ctr_paes_set_key(ctx) != 0) { + if (locked) + spin_unlock(&ctrblk_lock); return blkcipher_walk_done(desc, walk, -EIO); + } } } if (locked) diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index d1c407ddf703..9072bf63a846 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -8,31 +8,27 @@ #define _S390_CPUTIME_H #include <linux/types.h> -#include <asm/div64.h> +#include <asm/timex.h> #define CPUTIME_PER_USEC 4096ULL #define CPUTIME_PER_SEC (CPUTIME_PER_USEC * USEC_PER_SEC) /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ -typedef unsigned long long __nocast cputime_t; -typedef unsigned long long __nocast cputime64_t; - #define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) -static inline unsigned long __div(unsigned long long n, unsigned long base) -{ - return n / base; -} - /* - * Convert cputime to microseconds and back. + * Convert cputime to microseconds. */ -static inline unsigned int cputime_to_usecs(const cputime_t cputime) +static inline u64 cputime_to_usecs(const u64 cputime) { - return (__force unsigned long long) cputime >> 12; + return cputime >> 12; } +/* + * Convert cputime to nanoseconds. + */ +#define cputime_to_nsecs(cputime) tod_to_ns(cputime) u64 arch_cpu_idle_time(int cpu); diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 6e31d87fb669..fa2bf69be182 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -156,10 +156,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* by default, allow everything */ return true; } - -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} #endif /* __S390_MMU_CONTEXT_H */ diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 354344dcc198..118535123f34 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -206,20 +206,16 @@ static inline unsigned long long get_tod_clock_monotonic(void) * ns = (todval * 125) >> 9; * * In order to avoid an overflow with the multiplication we can rewrite this. - * With a split todval == 2^32 * th + tl (th upper 32 bits, tl lower 32 bits) + * With a split todval == 2^9 * th + tl (th upper 55 bits, tl lower 9 bits) * we end up with * - * ns = ((2^32 * th + tl) * 125 ) >> 9; - * -> ns = (2^23 * th * 125) + ((tl * 125) >> 9); + * ns = ((2^9 * th + tl) * 125 ) >> 9; + * -> ns = (th * 125) + ((tl * 125) >> 9); * */ static inline unsigned long long tod_to_ns(unsigned long long todval) { - unsigned long long ns; - - ns = ((todval >> 32) << 23) * 125; - ns += ((todval & 0xffffffff) * 125) >> 9; - return ns; + return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } #endif diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h index 4384bc797a54..152de9b796e1 100644 --- a/arch/s390/include/uapi/asm/unistd.h +++ b/arch/s390/include/uapi/asm/unistd.h @@ -313,7 +313,9 @@ #define __NR_copy_file_range 375 #define __NR_preadv2 376 #define __NR_pwritev2 377 -#define NR_syscalls 378 +/* Number 378 is reserved for guarded storage */ +#define __NR_statx 379 +#define NR_syscalls 380 /* * There are some system calls that are not present on 64 bit, some diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c index ae2cda5eee5a..e89cc2e71db1 100644 --- a/arch/s390/kernel/compat_wrapper.c +++ b/arch/s390/kernel/compat_wrapper.c @@ -178,3 +178,4 @@ COMPAT_SYSCALL_WRAP3(getpeername, int, fd, struct sockaddr __user *, usockaddr, COMPAT_SYSCALL_WRAP6(sendto, int, fd, void __user *, buff, size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int, addr_len); COMPAT_SYSCALL_WRAP3(mlock2, unsigned long, start, size_t, len, int, flags); COMPAT_SYSCALL_WRAP6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags); +COMPAT_SYSCALL_WRAP5(statx, int, dfd, const char __user *, path, unsigned, flags, unsigned, mask, struct statx __user *, buffer); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index dff2152350a7..6a7d737d514c 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -490,7 +490,7 @@ ENTRY(pgm_check_handler) jnz .Lpgm_svcper # -> single stepped svc 1: CHECK_STACK STACK_SIZE,__LC_SAVE_AREA_SYNC aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - j 3f + j 4f 2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER lg %r15,__LC_KERNEL_STACK lgr %r14,%r12 @@ -499,8 +499,8 @@ ENTRY(pgm_check_handler) tm __LC_PGM_ILC+2,0x02 # check for transaction abort jz 3f mvc __THREAD_trap_tdb(256,%r14),0(%r13) -3: la %r11,STACK_FRAME_OVERHEAD(%r15) - stg %r10,__THREAD_last_break(%r14) +3: stg %r10,__THREAD_last_break(%r14) +4: la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC stmg %r8,%r9,__PT_PSW(%r11) @@ -509,14 +509,14 @@ ENTRY(pgm_check_handler) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) stg %r10,__PT_ARGS(%r11) tm __LC_PGM_ILC+3,0x80 # check for per exception - jz 4f + jz 5f tmhh %r8,0x0001 # kernel per event ? jz .Lpgm_kprobe oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID -4: REENABLE_IRQS +5: REENABLE_IRQS xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) larl %r1,pgm_check_table llgh %r10,__PT_INT_CODE+2(%r11) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index b67dafb7b7cf..e545ffe5155a 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -564,6 +564,8 @@ static struct kset *ipl_kset; static void __ipl_run(void *unused) { + if (MACHINE_IS_LPAR && ipl_info.type == IPL_TYPE_CCW) + diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); diag308(DIAG308_LOAD_CLEAR, NULL); if (MACHINE_IS_VM) __cpcmd("IPL", NULL, 0, NULL); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 20cd339e11ae..f29e41c5e2ec 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -124,7 +124,10 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp, clear_tsk_thread_flag(p, TIF_SINGLE_STEP); /* Initialize per thread user and system timer values */ p->thread.user_timer = 0; + p->thread.guest_timer = 0; p->thread.system_timer = 0; + p->thread.hardirq_timer = 0; + p->thread.softirq_timer = 0; frame->sf.back_chain = 0; /* new return point is ret_from_fork */ diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 9b59e6212d8f..2659b5cfeddb 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -386,3 +386,5 @@ SYSCALL(sys_mlock2,compat_sys_mlock2) SYSCALL(sys_copy_file_range,compat_sys_copy_file_range) /* 375 */ SYSCALL(sys_preadv2,compat_sys_preadv2) SYSCALL(sys_pwritev2,compat_sys_pwritev2) +NI_SYSCALL +SYSCALL(sys_statx,compat_sys_statx) diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index c14fc9029912..072d84ba42a3 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -111,7 +111,7 @@ static inline u64 scale_vtime(u64 vtime) } static void account_system_index_scaled(struct task_struct *p, - cputime_t cputime, cputime_t scaled, + u64 cputime, u64 scaled, enum cpu_usage_stat index) { p->stimescaled += cputime_to_nsecs(scaled); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index b48dc5f1900b..463e5ef02304 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -608,12 +608,29 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) { spinlock_t *ptl; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; pgste_t pgste; pte_t *ptep; pte_t pte; bool dirty; - ptep = get_locked_pte(mm, addr, &ptl); + pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return false; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return false; + /* We can't run guests backed by huge pages, but userspace can + * still set them up and then try to migrate them without any + * migration support. + */ + if (pmd_large(*pmd)) + return true; + + ptep = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (unlikely(!ptep)) return false; diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c index e359ec675869..12daf45369b4 100644 --- a/arch/score/kernel/traps.c +++ b/arch/score/kernel/traps.c @@ -24,6 +24,7 @@ */ #include <linux/extable.h> +#include <linux/ptrace.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> diff --git a/arch/score/mm/extable.c b/arch/score/mm/extable.c index ec871355fc2d..6736a3ad6286 100644 --- a/arch/score/mm/extable.c +++ b/arch/score/mm/extable.c @@ -24,6 +24,8 @@ */ #include <linux/extable.h> +#include <linux/ptrace.h> +#include <asm/extable.h> int fixup_exception(struct pt_regs *regs) { diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index 94ac2739918c..b668e351fd6c 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -37,12 +37,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return true; } -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} - /* * end asm-generic/mm_hooks.h functions */ diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index de5d572225f3..cd1fa97776c3 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -302,8 +302,8 @@ extern int ignore_sigio_fd(int fd); extern void maybe_sigio_broken(int fd, int read); extern void sigio_broken(int fd, int read); -/* sys-x86_64/prctl.c */ -extern int os_arch_prctl(int pid, int code, unsigned long *addr); +/* prctl.c */ +extern int os_arch_prctl(int pid, int option, unsigned long *arg2); /* tty.c */ extern int get_pty(void); diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index 62dfc644c908..59b06b48f27d 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h @@ -103,10 +103,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* by default, allow everything */ return true; } - -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} #endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc98d5a294ee..8977d9c77373 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -106,6 +106,7 @@ config X86 select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT + select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE @@ -2787,6 +2788,9 @@ config X86_DMA_REMAP bool depends on STA2X11 +config HAVE_GENERIC_GUP + def_bool y + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 9ba050fe47f3..0af59fa789ea 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -390,3 +390,4 @@ 381 i386 pkey_alloc sys_pkey_alloc 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx +384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 226ca70dc6bd..5c5d4d7618e6 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -354,7 +354,7 @@ static void vgetcpu_cpu_init(void *arg) d.p = 1; /* Present */ d.d = 1; /* 32-bit */ - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } static int vgetcpu_online(unsigned int cpu) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index b04bb6dfed7f..0fe00446f9ca 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -187,6 +187,7 @@ * Reuse free bits when adding new feature flags! */ #define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 1548ca92ad3f..d0a21b12dd58 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -4,6 +4,7 @@ #include <asm/desc_defs.h> #include <asm/ldt.h> #include <asm/mmu.h> +#include <asm/fixmap.h> #include <linux/smp.h> #include <linux/percpu.h> @@ -45,11 +46,43 @@ struct gdt_page { DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +/* Provide the original GDT */ +static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu) { return per_cpu(gdt_page, cpu).gdt; } +/* Provide the current original GDT */ +static inline struct desc_struct *get_current_gdt_rw(void) +{ + return this_cpu_ptr(&gdt_page)->gdt; +} + +/* Get the fixmap index for a specific processor */ +static inline unsigned int get_cpu_gdt_ro_index(int cpu) +{ + return FIX_GDT_REMAP_BEGIN + cpu; +} + +/* Provide the fixmap address of the remapped GDT */ +static inline struct desc_struct *get_cpu_gdt_ro(int cpu) +{ + unsigned int idx = get_cpu_gdt_ro_index(cpu); + return (struct desc_struct *)__fix_to_virt(idx); +} + +/* Provide the current read-only GDT */ +static inline struct desc_struct *get_current_gdt_ro(void) +{ + return get_cpu_gdt_ro(smp_processor_id()); +} + +/* Provide the physical address of the GDT page. */ +static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) +{ + return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); +} + #ifdef CONFIG_X86_64 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, @@ -174,7 +207,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) { - struct desc_struct *d = get_cpu_gdt_table(cpu); + struct desc_struct *d = get_cpu_gdt_rw(cpu); tss_desc tss; set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, @@ -194,22 +227,90 @@ static inline void native_set_ldt(const void *addr, unsigned int entries) set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, entries * LDT_ENTRY_SIZE - 1); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT, &ldt, DESC_LDT); asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); } } +static inline void native_load_gdt(const struct desc_ptr *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct desc_ptr *dtr) +{ + asm volatile("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct desc_ptr *dtr) +{ + asm volatile("sidt %0":"=m" (*dtr)); +} + +/* + * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is + * a read-only remapping. To prevent a page fault, the GDT is switched to the + * original writeable version when needed. + */ +#ifdef CONFIG_X86_64 +static inline void native_load_tr_desc(void) +{ + struct desc_ptr gdt; + int cpu = raw_smp_processor_id(); + bool restore = 0; + struct desc_struct *fixmap_gdt; + + native_store_gdt(&gdt); + fixmap_gdt = get_cpu_gdt_ro(cpu); + + /* + * If the current GDT is the read-only fixmap, swap to the original + * writeable version. Swap back at the end. + */ + if (gdt.address == (unsigned long)fixmap_gdt) { + load_direct_gdt(cpu); + restore = 1; + } + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); + if (restore) + load_fixmap_gdt(cpu); +} +#else static inline void native_load_tr_desc(void) { asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } +#endif + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + + asm volatile("str %0":"=r" (tr)); + + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_rw(cpu); + unsigned int i; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; +} DECLARE_PER_CPU(bool, __tss_limit_invalid); static inline void force_reload_TR(void) { - struct desc_struct *d = get_cpu_gdt_table(smp_processor_id()); + struct desc_struct *d = get_current_gdt_rw(); tss_desc tss; memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc)); @@ -257,44 +358,6 @@ static inline void invalidate_tss_limit(void) this_cpu_write(__tss_limit_invalid, true); } -static inline void native_load_gdt(const struct desc_ptr *dtr) -{ - asm volatile("lgdt %0"::"m" (*dtr)); -} - -static inline void native_load_idt(const struct desc_ptr *dtr) -{ - asm volatile("lidt %0"::"m" (*dtr)); -} - -static inline void native_store_gdt(struct desc_ptr *dtr) -{ - asm volatile("sgdt %0":"=m" (*dtr)); -} - -static inline void native_store_idt(struct desc_ptr *dtr) -{ - asm volatile("sidt %0":"=m" (*dtr)); -} - -static inline unsigned long native_store_tr(void) -{ - unsigned long tr; - - asm volatile("str %0":"=r" (tr)); - - return tr; -} - -static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) -{ - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - unsigned int i; - - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) - gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; -} - /* This intentionally ignores lm, since 32-bit apps don't have that field. */ #define LDT_empty(info) \ ((info)->base_addr == 0 && \ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9d49c18b5ea9..d4d3ed456cb7 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -293,8 +293,23 @@ do { \ } \ } while (0) +/* + * True on X86_32 or when emulating IA32 on X86_64 + */ +static inline int mmap_is_ia32(void) +{ + return IS_ENABLED(CONFIG_X86_32) || + (IS_ENABLED(CONFIG_COMPAT) && + test_thread_flag(TIF_ADDR32)); +} + +extern unsigned long tasksize_32bit(void); +extern unsigned long tasksize_64bit(void); +extern unsigned long get_mmap_base(int is_legacy); + #ifdef CONFIG_X86_32 +#define __STACK_RND_MASK(is32bit) (0x7ff) #define STACK_RND_MASK (0x7ff) #define ARCH_DLINFO ARCH_DLINFO_IA32 @@ -304,7 +319,8 @@ do { \ #else /* CONFIG_X86_32 */ /* 1GB for 64bit, 8MB for 32bit */ -#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff) +#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) +#define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) #define ARCH_DLINFO \ do { \ @@ -348,16 +364,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages -/* - * True on X86_32 or when emulating IA32 on X86_64 - */ -static inline int mmap_is_ia32(void) -{ - return IS_ENABLED(CONFIG_X86_32) || - (IS_ENABLED(CONFIG_COMPAT) && - test_thread_flag(TIF_ADDR32)); -} - /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 8554f960e21b..b65155cc3760 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -100,6 +100,10 @@ enum fixed_addresses { #ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif + /* Fixmap entries to remap the GDTs, one per processor. */ + FIX_GDT_REMAP_BEGIN, + FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, + __end_of_permanent_fixed_addresses, /* diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 282630e4c6ea..70ef205489f0 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -164,6 +164,7 @@ struct kimage_arch { }; #else struct kimage_arch { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 306c7e12af55..6e933d2d88d9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -220,18 +220,6 @@ static inline int vma_pkey(struct vm_area_struct *vma) } #endif -static inline bool __pkru_allows_pkey(u16 pkey, bool write) -{ - u32 pkru = read_pkru(); - - if (!__pkru_allows_read(pkru, pkey)) - return false; - if (write && !__pkru_allows_write(pkru, pkey)) - return false; - - return true; -} - /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other @@ -268,8 +256,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write); -} #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d8b5f8ab8ef9..673f9ac50f6d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -45,6 +45,8 @@ #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd #define MSR_PLATFORM_INFO 0x000000ce +#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31 +#define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT) #define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) @@ -127,6 +129,7 @@ /* DEBUGCTLMSR bits (others vary by model): */ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) @@ -552,10 +555,12 @@ #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39 #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT) -/* MISC_FEATURE_ENABLES non-architectural features */ -#define MSR_MISC_FEATURE_ENABLES 0x00000140 +/* MISC_FEATURES_ENABLES non-architectural features */ +#define MSR_MISC_FEATURES_ENABLES 0x00000140 -#define MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT 1 +#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT 0 +#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT) +#define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT 1 #define MSR_IA32_TSC_DEADLINE 0x000006E0 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 0489884fdc44..158d877ce9e9 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -536,7 +536,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud) PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, val); } -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 static inline pud_t __pud(pudval_t val) { pudval_t ret; @@ -565,6 +565,32 @@ static inline pudval_t pud_val(pud_t pud) return ret; } +static inline void pud_clear(pud_t *pudp) +{ + set_pud(pudp, __pud(0)); +} + +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + p4dval_t val = native_p4d_val(p4d); + + if (sizeof(p4dval_t) > sizeof(long)) + PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp, + val, (u64)val >> 32); + else + PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp, + val); +} + +static inline void p4d_clear(p4d_t *p4dp) +{ + set_p4d(p4dp, __p4d(0)); +} + +#if CONFIG_PGTABLE_LEVELS >= 5 + +#error FIXME + static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) { pgdval_t val = native_pgd_val(pgd); @@ -582,10 +608,7 @@ static inline void pgd_clear(pgd_t *pgdp) set_pgd(pgdp, __pgd(0)); } -static inline void pud_clear(pud_t *pudp) -{ - set_pud(pudp, __pud(0)); -} +#endif /* CONFIG_PGTABLE_LEVELS == 5 */ #endif /* CONFIG_PGTABLE_LEVELS == 4 */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b060f962d581..93c49cf09b63 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -279,12 +279,18 @@ struct pv_mmu_ops { struct paravirt_callee_save pmd_val; struct paravirt_callee_save make_pmd; -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 struct paravirt_callee_save pud_val; struct paravirt_callee_save make_pud; - void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ + void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval); + +#if CONFIG_PGTABLE_LEVELS >= 5 +#error FIXME +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ + +#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ + #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ struct pv_lazy_ops lazy_mode; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index b6d425999f99..2f585054c63c 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -121,10 +121,10 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ #if CONFIG_PGTABLE_LEVELS > 3 -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); + set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -150,6 +150,37 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, ___pud_free_tlb(tlb, pud); } +#if CONFIG_PGTABLE_LEVELS > 4 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) +{ + paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); +} + +static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + return (p4d_t *)get_zeroed_page(gfp); +} + +static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + free_page((unsigned long)p4d); +} + +extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); + +static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long address) +{ + ___p4d_free_tlb(tlb, p4d); +} + +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 392576433e77..373ab1de909f 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -7,6 +7,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 72277b1028a5..29eb5778019c 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -215,4 +215,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) +#define gup_get_pte gup_get_pte +/* + * WARNING: only to be used in the get_user_pages_fast() implementation. + * + * With get_user_pages_fast(), we walk down the pagetables without taking + * any locks. For this we would like to load the pointers atomically, + * but that is not possible (without expensive cmpxchg8b) on PAE. What + * we do have is the guarantee that a PTE will only either go from not + * present to present, or present to not present or both -- it will not + * switch to a completely different present page without a TLB flush in + * between; something that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees 'l' iff pte_high + * sees 'h'. We load pte_high *after* loading pte_low, which ensures we + * don't see an older value of pte_high. *Then* we recheck pte_low, + * which ensures that we haven't picked up a changed pte high. We might + * have gotten rubbish values from pte_low and pte_high, but we are + * guaranteed that pte_low will not have the present bit set *unless* + * it is 'l'. Because get_user_pages_fast() only operates on present ptes + * we're safe. + */ +static inline pte_t gup_get_pte(pte_t *ptep) +{ + pte_t pte; + + do { + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + } while (unlikely(pte.pte_low != ptep->pte_low)); + + return pte; +} + #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index bcc89625ebe5..b8a4341faafa 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -7,6 +7,7 @@ typedef u64 pteval_t; typedef u64 pmdval_t; typedef u64 pudval_t; +typedef u64 p4dval_t; typedef u64 pgdval_t; typedef u64 pgprotval_t; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1cfb36b8c024..bf51e6054577 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -53,11 +53,19 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) -#ifndef __PAGETABLE_PUD_FOLDED +#ifndef __PAGETABLE_P4D_FOLDED #define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) #define pgd_clear(pgd) native_pgd_clear(pgd) #endif +#ifndef set_p4d +# define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d) +#endif + +#ifndef __PAGETABLE_PUD_FOLDED +#define p4d_clear(p4d) native_p4d_clear(p4d) +#endif + #ifndef set_pud # define set_pud(pudp, pud) native_set_pud(pudp, pud) #endif @@ -74,6 +82,11 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) +#ifndef __PAGETABLE_P4D_FOLDED +#define p4d_val(x) native_p4d_val(x) +#define __p4d(x) native_make_p4d(x) +#endif + #ifndef __PAGETABLE_PUD_FOLDED #define pud_val(x) native_pud_val(x) #define __pud(x) native_make_pud(x) @@ -179,6 +192,17 @@ static inline unsigned long pud_pfn(pud_t pud) return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } +static inline unsigned long p4d_pfn(p4d_t p4d) +{ + return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; +} + +static inline int p4d_large(p4d_t p4d) +{ + /* No 512 GiB pages yet */ + return 0; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -222,6 +246,11 @@ static inline int pud_devmap(pud_t pud) return 0; } #endif + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -538,6 +567,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define pte_pgprot(x) __pgprot(pte_flags(x)) #define pmd_pgprot(x) __pgprot(pmd_flags(x)) #define pud_pgprot(x) __pgprot(pud_flags(x)) +#define p4d_pgprot(x) __pgprot(p4d_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) @@ -587,6 +617,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/log2.h> +#include <asm/fixmap.h> static inline int pte_none(pte_t pte) { @@ -770,7 +801,52 @@ static inline int pud_large(pud_t pud) } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ +static inline unsigned long pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + #if CONFIG_PGTABLE_LEVELS > 3 +static inline int p4d_none(p4d_t p4d) +{ + return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; +} + +static inline int p4d_present(p4d_t p4d) +{ + return p4d_flags(p4d) & _PAGE_PRESENT; +} + +static inline unsigned long p4d_page_vaddr(p4d_t p4d) +{ + return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define p4d_page(p4d) \ + pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) + +/* Find an entry in the third-level page table.. */ +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address); +} + +static inline int p4d_bad(p4d_t p4d) +{ + return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; +} +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ + +static inline unsigned long p4d_index(unsigned long address) +{ + return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1); +} + +#if CONFIG_PGTABLE_LEVELS > 4 static inline int pgd_present(pgd_t pgd) { return pgd_flags(pgd) & _PAGE_PRESENT; @@ -788,14 +864,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. */ -static inline unsigned long pud_index(unsigned long address) +static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { - return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); -} - -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) -{ - return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); + return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address); } static inline int pgd_bad(pgd_t pgd) @@ -813,7 +884,7 @@ static inline int pgd_none(pgd_t pgd) */ return !native_pgd_val(pgd); } -#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* __ASSEMBLY__ */ @@ -1120,6 +1191,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags) #endif } +static inline bool __pkru_allows_pkey(u16 pkey, bool write) +{ + u32 pkru = read_pkru(); + + if (!__pkru_allows_read(pkru, pkey)) + return false; + if (write && !__pkru_allows_write(pkru, pkey)) + return false; + + return true; +} + +/* + * 'pteval' can come from a PTE, PMD or PUD. We only check + * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the + * same value on all 3 types. + */ +static inline bool __pte_access_permitted(unsigned long pteval, bool write) +{ + unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + + if (write) + need_pte_bits |= _PAGE_RW; + + if ((pteval & need_pte_bits) != need_pte_bits) + return 0; + + return __pkru_allows_pkey(pte_flags_pkey(pteval), write); +} + +#define pte_access_permitted pte_access_permitted +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + return __pte_access_permitted(pte_val(pte), write); +} + +#define pmd_access_permitted pmd_access_permitted +static inline bool pmd_access_permitted(pmd_t pmd, bool write) +{ + return __pte_access_permitted(pmd_val(pmd), write); +} + +#define pud_access_permitted pud_access_permitted +static inline bool pud_access_permitted(pud_t pud, bool write) +{ + return __pte_access_permitted(pud_val(pud), write); +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index fbc73360aea0..bfab55675c16 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -14,7 +14,6 @@ */ #ifndef __ASSEMBLY__ #include <asm/processor.h> -#include <asm/fixmap.h> #include <linux/threads.h> #include <asm/paravirt.h> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 73c7ccc38912..0593a1ae7573 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -41,9 +41,9 @@ extern void paging_init(void); struct mm_struct; +void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); - static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -121,6 +121,16 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) #endif } +static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + *p4dp = p4d; +} + +static inline void native_p4d_clear(p4d_t *p4d) +{ + native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)}); +} + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { *pgdp = pgd; @@ -206,6 +216,20 @@ extern void cleanup_highmap(void); extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); -#endif /* !__ASSEMBLY__ */ +#define gup_fast_permitted gup_fast_permitted +static inline bool gup_fast_permitted(unsigned long start, int nr_pages, + int write) +{ + unsigned long len, end; + + len = (unsigned long)nr_pages << PAGE_SHIFT; + end = start + len; + if (end < start) + return false; + if (end >> __VIRTUAL_MASK_SHIFT) + return false; + return true; +} +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 3a264200c62f..516593e66bd6 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -13,6 +13,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; @@ -67,7 +68,8 @@ typedef struct { pteval_t pte; } pte_t; #endif /* CONFIG_RANDOMIZE_MEMORY */ #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) -#define MODULES_END _AC(0xffffffffff000000, UL) +/* The module sections ends with the start of the fixmap */ +#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 62484333673d..4930afe9df0a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -272,9 +272,20 @@ static inline pgdval_t pgd_flags(pgd_t pgd) return native_pgd_val(pgd) & PTE_FLAGS_MASK; } -#if CONFIG_PGTABLE_LEVELS > 3 -#include <asm-generic/5level-fixup.h> +#if CONFIG_PGTABLE_LEVELS > 4 + +#error FIXME +#else +#include <asm-generic/pgtable-nop4d.h> + +static inline p4dval_t native_p4d_val(p4d_t p4d) +{ + return native_pgd_val(p4d.pgd); +} +#endif + +#if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) @@ -287,12 +298,11 @@ static inline pudval_t native_pud_val(pud_t pud) return pud.pud; } #else -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopud.h> static inline pudval_t native_pud_val(pud_t pud) { - return native_pgd_val(pud.pgd); + return native_pgd_val(pud.p4d.pgd); } #endif @@ -309,15 +319,30 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) return pmd.pmd; } #else -#define __ARCH_USE_5LEVEL_HACK #include <asm-generic/pgtable-nopmd.h> static inline pmdval_t native_pmd_val(pmd_t pmd) { - return native_pgd_val(pmd.pud.pgd); + return native_pgd_val(pmd.pud.p4d.pgd); } #endif +static inline p4dval_t p4d_pfn_mask(p4d_t p4d) +{ + /* No 512 GiB huge pages yet */ + return PTE_PFN_MASK; +} + +static inline p4dval_t p4d_flags_mask(p4d_t p4d) +{ + return ~p4d_pfn_mask(p4d); +} + +static inline p4dval_t p4d_flags(p4d_t p4d) +{ + return native_p4d_val(p4d) & p4d_flags_mask(p4d); +} + static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) @@ -461,6 +486,7 @@ enum pg_level { PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, + PG_LEVEL_512G, PG_LEVEL_NUM }; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4aa93b560a2b..3cada998a402 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -709,6 +709,8 @@ extern struct desc_ptr early_gdt_descr; extern void cpu_set_gdt(int); extern void switch_to_new_gdt(int); +extern void load_direct_gdt(int); +extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); @@ -790,6 +792,7 @@ static inline void spin_lock_prefetch(const void *x) /* * User space process size: 3GB (default). */ +#define IA32_PAGE_OFFSET PAGE_OFFSET #define TASK_SIZE PAGE_OFFSET #define TASK_SIZE_MAX TASK_SIZE #define STACK_TOP TASK_SIZE @@ -866,7 +869,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, * This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) +#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) #define KSTK_EIP(task) (task_pt_regs(task)->ip) @@ -877,6 +881,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +DECLARE_PER_CPU(u64, msr_misc_features_shadow); + /* Register/unregister a process' MPX related resource */ #define MPX_ENABLE_MANAGEMENT() mpx_enable_management() #define MPX_DISABLE_MANAGEMENT() mpx_disable_management() diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 9b9b30b19441..8d3964fc5f91 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -9,6 +9,7 @@ void syscall_init(void); #ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif #ifdef CONFIG_X86_32 @@ -30,6 +31,7 @@ void x86_report_nx(void); extern int reboot_force; -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); +long do_arch_prctl_common(struct task_struct *task, int option, + unsigned long cpuid_enabled); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h new file mode 100644 index 000000000000..d7da2729903d --- /dev/null +++ b/arch/x86/include/asm/purgatory.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_PURGATORY_H +#define _ASM_X86_PURGATORY_H + +#ifndef __ASSEMBLY__ +#include <linux/purgatory.h> + +extern void purgatory(void); +/* + * These forward declarations serve two purposes: + * + * 1) Make sparse happy when checking arch/purgatory + * 2) Document that these are required to be global so the symbol + * lookup in kexec works + */ +extern unsigned long purgatory_backup_dest; +extern unsigned long purgatory_backup_src; +extern unsigned long purgatory_backup_sz; +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_PURGATORY_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 58505f01962f..dcbd9bcce714 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -87,7 +87,7 @@ static inline void setup_stack_canary_segment(int cpu) { #ifdef CONFIG_X86_32 unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); - struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); + struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu); struct desc_struct desc; desc = gdt_table[GDT_ENTRY_STACK_CANARY]; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad6f5eb07a95..9fc44b95f7cb 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -87,6 +87,7 @@ struct thread_info { #define TIF_SECCOMP 8 /* secure computing */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ +#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ @@ -110,6 +111,7 @@ struct thread_info { #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_NOHZ (1 << TIF_NOHZ) @@ -138,7 +140,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) + (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) @@ -239,6 +241,8 @@ static inline int arch_within_stack_frames(const void * const stack, extern void arch_task_cache_init(void); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); extern void arch_release_task_struct(struct task_struct *tsk); +extern void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6fa85944af83..75d002bdb3f3 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -110,6 +110,16 @@ static inline void cr4_clear_bits(unsigned long mask) } } +static inline void cr4_toggle_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + cr4 ^= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); +} + /* Read the CR4 shadow. */ static inline unsigned long cr4_read_shadow(void) { @@ -188,7 +198,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (static_cpu_has(X86_FEATURE_PGE)) + if (boot_cpu_has(X86_FEATURE_PGE)) __flush_tlb_global(); else __flush_tlb(); diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 33cbd3db97b9..bf2ca56fba11 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -279,13 +279,17 @@ static inline pte_t __pte_ma(pteval_t x) #define pmd_val_ma(v) ((v).pmd) #ifdef __PAGETABLE_PUD_FOLDED -#define pud_val_ma(v) ((v).pgd.pgd) +#define pud_val_ma(v) ((v).p4d.pgd.pgd) #else #define pud_val_ma(v) ((v).pud) #endif #define __pmd_ma(x) ((pmd_t) { (x) } ) -#define pgd_val_ma(x) ((x).pgd) +#ifdef __PAGETABLE_P4D_FOLDED +#define p4d_val_ma(x) ((x).pgd.pgd) +#else +#define p4d_val_ma(x) ((x).p4d) +#endif void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 835aa51c7f6e..c45765517092 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -1,10 +1,13 @@ #ifndef _ASM_X86_PRCTL_H #define _ASM_X86_PRCTL_H -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 + +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 #define ARCH_MAP_VDSO_X32 0x2001 #define ARCH_MAP_VDSO_32 0x2002 diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 48587335ede8..ed014814ea35 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void) #ifdef CONFIG_SMP initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = - (unsigned long)get_cpu_gdt_table(smp_processor_id()); + (unsigned long)get_cpu_gdt_rw(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5a414545e8a3..446b0d3d4932 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call) cpu = get_cpu(); BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; @@ -685,7 +685,7 @@ static long __apm_bios_call_simple(void *_call) cpu = get_cpu(); BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; @@ -2352,7 +2352,7 @@ static int __init apm_init(void) * Note we only set APM segments on CPU zero, since we pin the APM * code to that CPU. */ - gdt = get_cpu_gdt_table(0); + gdt = get_cpu_gdt_rw(0); set_desc_base(&gdt[APM_CS >> 3], (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); set_desc_base(&gdt[APM_CS_16 >> 3], diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 58094a1f9e9d..8ee32119144d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -448,19 +448,60 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } +/* Setup the fixmap mapping only once per-processor */ +static inline void setup_fixmap_gdt(int cpu) +{ +#ifdef CONFIG_X86_64 + /* On 64-bit systems, we use a read-only fixmap GDT. */ + pgprot_t prot = PAGE_KERNEL_RO; +#else + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through + * a task gate needs to change an available TSS to busy. If the GDT + * is read-only, that will triple fault. + * + * On Xen PV, the GDT must be read-only because the hypervisor requires + * it. + */ + pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; +#endif + + __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); +} + +/* Load the original GDT from the per-cpu structure */ +void load_direct_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_rw(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_direct_gdt); + +/* Load a fixmap remapping of the per-cpu GDT */ +void load_fixmap_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_ro(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_fixmap_gdt); + /* * Current gdt points %fs at the "master" per-cpu area: after this, * it's on the real one. */ void switch_to_new_gdt(int cpu) { - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(cpu); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); + /* Load the original GDT */ + load_direct_gdt(cpu); /* Reload the per-cpu base */ - load_percpu_segment(cpu); } @@ -1526,6 +1567,9 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); + + setup_fixmap_gdt(cpu); + load_fixmap_gdt(cpu); } #else @@ -1581,6 +1625,9 @@ void cpu_init(void) dbg_restore_debug_regs(); fpu__init_cpu(); + + setup_fixmap_gdt(cpu); + load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 063197771b8d..dfa90a3a5145 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) return; } - if (ring3mwait_disabled) { - msr_clear_bit(MSR_MISC_FEATURE_ENABLES, - MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); + if (ring3mwait_disabled) return; - } - - msr_set_bit(MSR_MISC_FEATURE_ENABLES, - MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT); set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + this_cpu_or(msr_misc_features_shadow, + 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); if (c == &boot_cpu_data) ELF_HWCAP2 |= HWCAP2_RING3MWAIT; @@ -488,6 +484,34 @@ static void intel_bsp_resume(struct cpuinfo_x86 *c) init_intel_energy_perf(c); } +static void init_cpuid_fault(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) + set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); + } +} + +static void init_intel_misc_features(struct cpuinfo_x86 *c) +{ + u64 msr; + + if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + return; + + /* Clear all MISC features */ + this_cpu_write(msr_misc_features_shadow, 0); + + /* Check features and update capabilities and shadow control bits */ + init_cpuid_fault(c); + probe_xeon_phi_r3mwait(c); + + msr = this_cpu_read(msr_misc_features_shadow); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); +} + static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -602,7 +626,7 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_energy_perf(c); - probe_xeon_phi_r3mwait(c); + init_intel_misc_features(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 469b23d6acc2..5f43cec296c5 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -103,6 +103,7 @@ static void machine_kexec_page_table_set_one( pgd_t *pgd, pmd_t *pmd, pte_t *pte, unsigned long vaddr, unsigned long paddr) { + p4d_t *p4d; pud_t *pud; pgd += pgd_index(vaddr); @@ -110,7 +111,8 @@ static void machine_kexec_page_table_set_one( if (!(pgd_val(*pgd) & _PAGE_PRESENT)) set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); #endif - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); if (!(pmd_val(*pmd) & _PAGE_PRESENT)) set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 307b1f4543de..085c3b300d32 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -36,6 +36,7 @@ static struct kexec_file_ops *kexec_file_loaders[] = { static void free_transition_pgtable(struct kimage *image) { + free_page((unsigned long)image->arch.p4d); free_page((unsigned long)image->arch.pud); free_page((unsigned long)image->arch.pmd); free_page((unsigned long)image->arch.pte); @@ -43,6 +44,7 @@ static void free_transition_pgtable(struct kimage *image) static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -53,13 +55,21 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); + if (!p4d) + goto err; + image->arch.p4d = p4d; + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + p4d = p4d_offset(pgd, vaddr); + if (!p4d_present(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) goto err; image->arch.pud = pud; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } - pud = pud_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); if (!pud_present(*pud)) { pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) @@ -194,19 +204,22 @@ static int arch_update_purgatory(struct kimage *image) /* Setup copying of backup region */ if (image->type == KEXEC_TYPE_CRASH) { - ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_dest", &image->arch.backup_load_addr, sizeof(image->arch.backup_load_addr), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_src", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_src", &image->arch.backup_src_start, sizeof(image->arch.backup_src_start), 0); if (ret) return ret; - ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + ret = kexec_purgatory_get_set_symbol(image, + "purgatory_backup_sz", &image->arch.backup_src_sz, sizeof(image->arch.backup_src_sz), 0); if (ret) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 4797e87b0fb6..110daf22f5c7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -430,12 +430,16 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 .pud_val = PTE_IDENT, .make_pud = PTE_IDENT, - .set_pgd = native_set_pgd, -#endif + .set_p4d = native_set_p4d, + +#if CONFIG_PGTABLE_LEVELS >= 5 +#error FIXME +#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .pte_val = PTE_IDENT, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f67591561711..0bb88428cbf2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,6 +37,7 @@ #include <asm/vm86.h> #include <asm/switch_to.h> #include <asm/desc.h> +#include <asm/prctl.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -124,11 +125,6 @@ void flush_thread(void) fpu__clear(&tsk->thread.fpu); } -static void hard_disable_TSC(void) -{ - cr4_set_bits(X86_CR4_TSD); -} - void disable_TSC(void) { preempt_disable(); @@ -137,15 +133,10 @@ void disable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_disable_TSC(); + cr4_set_bits(X86_CR4_TSD); preempt_enable(); } -static void hard_enable_TSC(void) -{ - cr4_clear_bits(X86_CR4_TSD); -} - static void enable_TSC(void) { preempt_disable(); @@ -154,7 +145,7 @@ static void enable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_enable_TSC(); + cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } @@ -182,54 +173,129 @@ int set_tsc_mode(unsigned int val) return 0; } -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread; - next = &next_p->thread; +DEFINE_PER_CPU(u64, msr_misc_features_shadow); - if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ - test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); +static void set_cpuid_faulting(bool on) +{ + u64 msrval; - debugctl &= ~DEBUGCTLMSR_BTF; - if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) - debugctl |= DEBUGCTLMSR_BTF; + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); +} - update_debugctlmsr(debugctl); +static void disable_cpuid(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(true); } + preempt_enable(); +} - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); +static void enable_cpuid(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(false); } + preempt_enable(); +} + +static int get_cpuid_mode(void) +{ + return !test_thread_flag(TIF_NOCPUID); +} + +static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +{ + if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + return -ENODEV; + + if (cpuid_enabled) + enable_cpuid(); + else + disable_cpuid(); + + return 0; +} + +/* + * Called immediately after a successful exec. + */ +void arch_setup_new_exec(void) +{ + /* If cpuid was previously disabled for this task, re-enable it. */ + if (test_thread_flag(TIF_NOCPUID)) + enable_cpuid(); +} - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { +static inline void switch_to_bitmap(struct tss_struct *tss, + struct thread_struct *prev, + struct thread_struct *next, + unsigned long tifp, unsigned long tifn) +{ + if (tifn & _TIF_IO_BITMAP) { /* * Copy the relevant range of the IO bitmap. * Normally this is 128 bytes or less: */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); - /* * Make sure that the TSS limit is correct for the CPU * to notice the IO bitmap. */ refresh_tss_limit(); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + } else if (tifp & _TIF_IO_BITMAP) { /* * Clear any possible leftover bits: */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + unsigned long tifp, tifn; + + prev = &prev_p->thread; + next = &next_p->thread; + + tifn = READ_ONCE(task_thread_info(next_p)->flags); + tifp = READ_ONCE(task_thread_info(prev_p)->flags); + switch_to_bitmap(tss, prev, next, tifp, tifn); + propagate_user_return_notify(prev_p, next_p); + + if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && + arch_has_block_step()) { + unsigned long debugctl, msk; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~DEBUGCTLMSR_BTF; + msk = tifn & _TIF_BLOCKSTEP; + debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + if ((tifp ^ tifn) & _TIF_NOTSC) + cr4_toggle_bits(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_NOCPUID) + set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); } /* @@ -550,3 +616,16 @@ out: put_task_stack(p); return ret; } + +long do_arch_prctl_common(struct task_struct *task, int option, + unsigned long cpuid_enabled) +{ + switch (option) { + case ARCH_GET_CPUID: + return get_cpuid_mode(); + case ARCH_SET_CPUID: + return set_cpuid_mode(task, cpuid_enabled); + } + + return -EINVAL; +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4c818f8bc135..ff40e74c9181 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/kdebug.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/ldt.h> @@ -56,6 +57,7 @@ #include <asm/switch_to.h> #include <asm/vm86.h> #include <asm/intel_rdt.h> +#include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) { @@ -304,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d6b784a5520d..ea1a6180bf39 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -204,7 +205,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, tls); + err = do_arch_prctl_64(p, ARCH_SET_FS, tls); if (err) goto out; } @@ -547,70 +548,72 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) } #endif -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; int doit = task == current; int cpu; - switch (code) { + switch (option) { case ARCH_SET_GS: - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.gsindex = 0; - task->thread.gsbase = addr; + task->thread.gsbase = arg2; if (doit) { load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2); } put_cpu(); break; case ARCH_SET_FS: /* Not strictly needed for fs, but do it for symmetry with gs */ - if (addr >= TASK_SIZE_MAX) + if (arg2 >= TASK_SIZE_MAX) return -EPERM; cpu = get_cpu(); task->thread.fsindex = 0; - task->thread.fsbase = addr; + task->thread.fsbase = arg2; if (doit) { /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, arg2); } put_cpu(); break; case ARCH_GET_FS: { unsigned long base; + if (doit) rdmsrl(MSR_FS_BASE, base); else base = task->thread.fsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { unsigned long base; + if (doit) rdmsrl(MSR_KERNEL_GS_BASE, base); else base = task->thread.gsbase; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)arg2); break; } #ifdef CONFIG_CHECKPOINT_RESTORE # ifdef CONFIG_X86_X32_ABI case ARCH_MAP_VDSO_X32: - return prctl_map_vdso(&vdso_image_x32, addr); + return prctl_map_vdso(&vdso_image_x32, arg2); # endif # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: - return prctl_map_vdso(&vdso_image_32, addr); + return prctl_map_vdso(&vdso_image_32, arg2); # endif case ARCH_MAP_VDSO_64: - return prctl_map_vdso(&vdso_image_64, addr); + return prctl_map_vdso(&vdso_image_64, arg2); #endif default: @@ -621,10 +624,23 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) return ret; } -long sys_arch_prctl(int code, unsigned long addr) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + long ret; + + ret = do_arch_prctl_64(current, option, arg2); + if (ret == -EINVAL) + ret = do_arch_prctl_common(current, option, arg2); + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION +COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl(current, code, addr); + return do_arch_prctl_common(current, option, arg2); } +#endif unsigned long KSTK_ESP(struct task_struct *task) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2364b23ea3e5..f37d18124648 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -396,12 +396,12 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; /* - * When changing the segment base, use do_arch_prctl + * When changing the segment base, use do_arch_prctl_64 * to set either thread.fs or thread.fsindex and the * corresponding GDT slot. */ if (child->thread.fsbase != value) - return do_arch_prctl(child, ARCH_SET_FS, value); + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): /* @@ -410,7 +410,7 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; if (child->thread.gsbase != value) - return do_arch_prctl(child, ARCH_SET_GS, value); + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } @@ -869,7 +869,7 @@ long arch_ptrace(struct task_struct *child, long request, Works just like arch_prctl, except that the arguments are reversed. */ case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); + ret = do_arch_prctl_64(child, data, addr); break; #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 4194d6f9bb29..067f9813fd2c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -228,7 +228,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { .ident = "ASUS EeeBook X205TA", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"), + DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"), }, }, { /* Handle problems with rebooting on ASUS EeeBook X205TAW */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 7cd7bbefd418..1302c47c5ce1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1223,21 +1223,6 @@ void __init setup_arch(char **cmdline_p) kasan_init(); -#ifdef CONFIG_X86_32 - /* sync back kernel address range */ - clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - - /* - * sync back low identity map too. It is used for example - * in the 32-bit EFI stub. - */ - clone_pgd_range(initial_page_table, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); -#endif - tboot_probe(); map_vsyscall(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9820d6d977c6..bb1e8cc0bc84 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu) pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, 0x2 | DESCTYPE_S, 0x8); gdt.s = 1; - write_gdt_entry(get_cpu_gdt_table(cpu), + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); #endif } @@ -288,4 +288,25 @@ void __init setup_per_cpu_areas(void) /* Setup cpu initialized, callin, callout masks */ setup_cpu_local_masks(); + +#ifdef CONFIG_X86_32 + /* + * Sync back kernel address range. We want to make sure that + * all kernel mappings, including percpu mappings, are available + * in the smpboot asm. We can't reliably pick up percpu + * mappings using vmalloc_fault(), because exception dispatch + * needs percpu data. + */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. + */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +#endif } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bd1f1ad35284..f04479a8f74f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -983,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long timeout; idle->thread.sp = (unsigned long)task_pt_regs(idle); - early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 50215a4b9347..207b8f2582c7 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -17,6 +17,8 @@ #include <linux/uaccess.h> #include <linux/elf.h> +#include <asm/elf.h> +#include <asm/compat.h> #include <asm/ia32.h> #include <asm/syscalls.h> @@ -101,7 +103,7 @@ out: static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { - if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { + if (!in_compat_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -114,10 +116,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin, if (current->flags & PF_RANDOMIZE) { *begin = randomize_page(*begin, 0x02000000); } - } else { - *begin = current->mm->mmap_legacy_base; - *end = TASK_SIZE; + return; } + + *begin = get_mmap_base(1); + *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); } unsigned long @@ -176,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; /* for MAP_32BIT mappings we force the legacy mmap base */ - if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) + if (!in_compat_syscall() && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ @@ -191,7 +194,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = mm->mmap_base; + info.high_limit = get_mmap_base(0); info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; if (filp) { diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b868fa1b812b..5db0f33cbf2c 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pgd = pgd_offset(&tboot_mm, vaddr); - pud = pud_alloc(&tboot_mm, pgd, vaddr); + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; + pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; pmd = pmd_alloc(&tboot_mm, pud, vaddr); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6c8934406dc9..dcd699baea1b 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -92,10 +92,17 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) + if (LDT_empty(info) || LDT_zero(info)) { desc->a = desc->b = 0; - else + } else { fill_ldt(desc, info); + + /* + * Always set the accessed bit so that the CPU + * doesn't try to write to the (read-only) GDT. + */ + desc->type |= 1; + } ++info; ++desc; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 23ee89ce59a9..62597c300d94 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) struct vm_area_struct *vma; spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm) pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; - pud = pud_offset(pgd, 0xA0000); + p4d = p4d_offset(pgd, 0xA0000); + if (p4d_none_or_clear_bad(p4d)) + goto out; + pud = pud_offset(p4d, 0xA0000); if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d1efe2c62b3f..c02b9af2056a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -741,7 +741,6 @@ static int svm_hardware_enable(void) struct svm_cpu_data *sd; uint64_t efer; - struct desc_ptr gdt_descr; struct desc_struct *gdt; int me = raw_smp_processor_id(); @@ -763,8 +762,7 @@ static int svm_hardware_enable(void) sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; - native_store_gdt(&gdt_descr); - gdt = (struct desc_struct *)gdt_descr.address; + gdt = get_current_gdt_rw(); sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); wrmsrl(MSR_EFER, efer | EFER_SVME); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 283aa8601833..3acde663dc58 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -935,7 +935,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. */ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); -static DEFINE_PER_CPU(struct desc_ptr, host_gdt); /* * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we @@ -2052,14 +2051,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) */ static unsigned long segment_base(u16 selector) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *table; unsigned long v; if (!(selector & ~SEGMENT_RPL_MASK)) return 0; - table = (struct desc_struct *)gdt->address; + table = get_current_gdt_ro(); if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { u16 ldt_selector = kvm_read_ldt(); @@ -2164,7 +2162,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (vmx->host_state.msr_host_bndcfgs) wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - load_gdt(this_cpu_ptr(&host_gdt)); + load_fixmap_gdt(raw_smp_processor_id()); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -2266,7 +2264,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (!already_loaded) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); + void *gdt = get_current_gdt_ro(); unsigned long sysenter_esp; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -2277,7 +2275,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ vmcs_writel(HOST_TR_BASE, (unsigned long)this_cpu_ptr(&cpu_tss)); - vmcs_writel(HOST_GDTR_BASE, gdt->address); + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ /* * VM exits change the host TR limit to 0x67 after a VM @@ -3465,8 +3463,6 @@ static int hardware_enable(void) ept_sync_global(); } - native_store_gdt(this_cpu_ptr(&host_gdt)); - return 0; } @@ -7258,9 +7254,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) static int handle_vmclear(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 zero = 0; gpa_t vmptr; - struct vmcs12 *vmcs12; - struct page *page; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7271,22 +7266,9 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { - /* - * For accurate processor emulation, VMCLEAR beyond available - * physical memory should do nothing at all. However, it is - * possible that a nested vmx bug, not a guest hypervisor bug, - * resulted in this case, so let's shut down before doing any - * more damage: - */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return 1; - } - vmcs12 = kmap(page); - vmcs12->launch_state = 0; - kunmap(page); - nested_release_page(page); + kvm_vcpu_write_guest(vcpu, + vmptr + offsetof(struct vmcs12, launch_state), + &zero, sizeof(zero)); nested_free_vmcs02(vmx, vmptr); @@ -9694,10 +9676,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; page = nested_get_page(vcpu, vmcs12->msr_bitmap); - if (!page) { - WARN_ON(1); + if (!page) return false; - } msr_bitmap_l1 = (unsigned long *)kmap(page); memset(msr_bitmap_l0, 0xff, PAGE_SIZE); @@ -11121,8 +11101,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ static void vmx_leave_nested(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.nested_run_pending = 0; nested_vmx_vmexit(vcpu, -1, 0, 0); + } free_nested(to_vmx(vcpu)); } diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 96d2b847e09e..0fbdcb64f9f8 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,7 +2,7 @@ KCOV_INSTRUMENT_tlb.o := n obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 58b5bee7ea27..9f305be71a72 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = { #define PTE_LEVEL_MULT (PAGE_SIZE) #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) -#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_PUD * P4D_LEVEL_MULT) #define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ ({ \ @@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st, } } -static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, - unsigned long P) +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P) { int i; pte_t *start; pgprotval_t prot; - start = (pte_t *) pmd_page_vaddr(addr); + start = (pte_t *)pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); @@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, #if PTRS_PER_PMD > 1 -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, - unsigned long P) +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) { int i; pmd_t *start; pgprotval_t prot; - start = (pmd_t *) pud_page_vaddr(addr); + start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { @@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); } -static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, - unsigned long P) +static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) { int i; pud_t *start; pgprotval_t prot; pud_t *prev_pud = NULL; - start = (pud_t *) pgd_page_vaddr(addr); + start = (pud_t *)p4d_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); @@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, } #else -#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) -#define pgd_large(a) pud_large(__pud(pgd_val(a))) -#define pgd_none(a) pud_none(__pud(pgd_val(a))) +#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p) +#define p4d_large(a) pud_large(__pud(p4d_val(a))) +#define p4d_none(a) pud_none(__pud(p4d_val(a))) +#endif + +#if PTRS_PER_P4D > 1 + +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) +{ + int i; + p4d_t *start; + pgprotval_t prot; + + start = (p4d_t *)pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_P4D; i++) { + st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); + if (!p4d_none(*start)) { + if (p4d_large(*start) || !p4d_present(*start)) { + prot = p4d_flags(*start); + note_page(m, st, __pgprot(prot), 2); + } else { + walk_pud_level(m, st, *start, + P + i * P4D_LEVEL_MULT); + } + } else + note_page(m, st, __pgprot(0), 2); + + start++; + } +} + +#else +#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) +#define pgd_large(a) p4d_large(__p4d(pgd_val(a))) +#define pgd_none(a) p4d_none(__p4d(pgd_val(a))) #endif static inline bool is_hypervisor_range(int idx) @@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); } else { - walk_pud_level(m, &st, *start, + walk_p4d_level(m, &st, *start, i * PGD_LEVEL_MULT); } } else diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 428e31763cb9..8ad91a01cbc8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would - * set_pud. + * set_p4d/set_pud. */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; @@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(address)]; + p4d_t *p4d; + pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address) if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #endif - pmd = pmd_offset(pud_offset(pgd, address), address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* @@ -425,6 +435,7 @@ void vmalloc_sync_all(void) static noinline int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; + p4d_t *p4d, *p4d_ref; pud_t *pud, *pud_ref; pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; @@ -448,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); arch_flush_lazy_mmu_mode(); - } else { + } else if (CONFIG_PGTABLE_LEVELS > 4) { + /* + * With folded p4d, pgd_none() is always false, so the pgd may + * point to an empty page table entry and pgd_page_vaddr() + * will return garbage. + * + * We will do the correct sanity check on the p4d level. + */ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } + /* With 4-level paging, copying happens on the p4d level. */ + p4d = p4d_offset(pgd, address); + p4d_ref = p4d_offset(pgd_ref, address); + if (p4d_none(*p4d_ref)) + return -1; + + if (p4d_none(*p4d)) { + set_p4d(p4d, *p4d_ref); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); + } + /* * Below here mismatches are bugs because these lower tables * are shared: */ - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); + pud = pud_offset(p4d, address); + pud_ref = pud_offset(p4d_ref, address); if (pud_none(*pud_ref)) return -1; @@ -526,6 +557,7 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd_t *pgd = base + pgd_index(address); + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -538,7 +570,15 @@ static void dump_pagetable(unsigned long address) if (!pgd_present(*pgd)) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (bad_address(p4d)) + goto bad; + + printk("P4D %lx ", p4d_val(*p4d)); + if (!p4d_present(*p4d) || p4d_large(*p4d)) + goto out; + + pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; @@ -1082,6 +1122,7 @@ static noinline int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1104,7 +1145,14 @@ spurious_fault(unsigned long error_code, unsigned long address) if (!pgd_present(*pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_large(*p4d)) + return spurious_fault_check(error_code, (pte_t *) p4d); + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c deleted file mode 100644 index 1f3b6ef105cd..000000000000 --- a/arch/x86/mm/gup.c +++ /dev/null @@ -1,475 +0,0 @@ -/* - * Lockless get_user_pages_fast for x86 - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - */ -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmstat.h> -#include <linux/highmem.h> -#include <linux/swap.h> -#include <linux/memremap.h> - -#include <asm/mmu_context.h> -#include <asm/pgtable.h> - -static inline pte_t gup_get_pte(pte_t *ptep) -{ -#ifndef CONFIG_X86_PAE - return READ_ONCE(*ptep); -#else - /* - * With get_user_pages_fast, we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atomically, - * but that is not possible (without expensive cmpxchg8b) on PAE. What - * we do have is the guarantee that a pte will only either go from not - * present to present, or present to not present or both -- it will not - * switch to a completely different present page without a TLB flush in - * between; something that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees l iff pte_high - * sees h. We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. We might - * have got rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. And get_user_pages_fast only operates on present ptes, so - * we're safe. - * - * gup_get_pte should not be used or copied outside gup.c without being - * very careful -- it does not atomically load the pte or anything that - * is likely to be useful for you. - */ - pte_t pte; - -retry: - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - if (unlikely(pte.pte_low != ptep->pte_low)) - goto retry; - - return pte; -#endif -} - -static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) -{ - while ((*nr) - nr_start) { - struct page *page = pages[--(*nr)]; - - ClearPageReferenced(page); - put_page(page); - } -} - -/* - * 'pteval' can come from a pte, pmd or pud. We only check - * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 3 types. - */ -static inline int pte_allows_gup(unsigned long pteval, int write) -{ - unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; - - if (write) - need_pte_bits |= _PAGE_RW; - - if ((pteval & need_pte_bits) != need_pte_bits) - return 0; - - /* Check memory protection keys permissions. */ - if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) - return 0; - - return 1; -} - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. - */ -static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct dev_pagemap *pgmap = NULL; - int nr_start = *nr, ret = 0; - pte_t *ptep, *ptem; - - /* - * Keep the original mapped PTE value (ptem) around since we - * might increment ptep off the end of the page when finishing - * our loop iteration. - */ - ptem = ptep = pte_offset_map(&pmd, addr); - do { - pte_t pte = gup_get_pte(ptep); - struct page *page; - - /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_protnone(pte)) - break; - - if (!pte_allows_gup(pte_val(pte), write)) - break; - - if (pte_devmap(pte)) { - pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - break; - } - } else if (pte_special(pte)) - break; - - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - get_page(page); - put_dev_pagemap(pgmap); - SetPageReferenced(page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - if (addr == end) - ret = 1; - pte_unmap(ptem); - - return ret; -} - -static inline void get_head_page_multiple(struct page *page, int nr) -{ - VM_BUG_ON_PAGE(page != compound_head(page), page); - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, nr); - SetPageReferenced(page); -} - -static int __gup_device_huge(unsigned long pfn, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - int nr_start = *nr; - struct dev_pagemap *pgmap = NULL; - - do { - struct page *page = pfn_to_page(pfn); - - pgmap = get_dev_pagemap(pfn, pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - return 0; - } - SetPageReferenced(page); - pages[*nr] = page; - get_page(page); - put_dev_pagemap(pgmap); - (*nr)++; - pfn++; - } while (addr += PAGE_SIZE, addr != end); - return 1; -} - -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - -static int __gup_device_huge_pud(pud_t pud, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - -static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - int refs; - - if (!pte_allows_gup(pmd_val(pmd), write)) - return 0; - - VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); - if (pmd_devmap(pmd)) - return __gup_device_huge_pmd(pmd, addr, end, pages, nr); - - /* hugepages are never "special" */ - VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); - - refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - get_head_page_multiple(head, refs); - - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. - */ - if (pmd_protnone(pmd)) - return 0; - if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) - return 0; - } else { - if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; - } - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static noinline int gup_huge_pud(pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - int refs; - - if (!pte_allows_gup(pud_val(pud), write)) - return 0; - - VM_BUG_ON(!pfn_valid(pud_pfn(pud))); - if (pud_devmap(pud)) - return __gup_device_huge_pud(pud, addr, end, pages, nr); - - /* hugepages are never "special" */ - VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); - - refs = 0; - head = pud_page(pud); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - get_head_page_multiple(head, refs); - - return 1; -} - -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&pgd, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_large(pud))) { - if (!gup_huge_pud(pud, addr, next, write, pages, nr)) - return 0; - } else { - if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; - } - } while (pudp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - unsigned long flags; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) - return 0; - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. - */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - - end = start + len; - if (end < start) - goto slow_irqon; - -#ifdef CONFIG_X86_64 - if (end >> __VIRTUAL_MASK_SHIFT) - goto slow_irqon; -#endif - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. - */ - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; - -slow: - local_irq_enable(); -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, - pages, write ? FOLL_WRITE : 0); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index c5066a260803..302f43fd9c28 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -12,10 +12,12 @@ #include <linux/pagemap.h> #include <linux/err.h> #include <linux/sysctl.h> +#include <linux/compat.h> #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/pgalloc.h> +#include <asm/elf.h> #if 0 /* This is just for testing */ struct page * @@ -82,8 +84,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; - info.low_limit = current->mm->mmap_legacy_base; - info.high_limit = TASK_SIZE; + info.low_limit = get_mmap_base(1); + info.high_limit = in_compat_syscall() ? + tasksize_32bit() : tasksize_64bit(); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -100,7 +103,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = current->mm->mmap_base; + info.high_limit = get_mmap_base(0); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 4473cb4f8b90..04210a29dd60 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, return 0; } +static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + p4d_t *p4d = p4d_page + p4d_index(addr); + pud_t *pud; + + next = (addr & P4D_MASK) + P4D_SIZE; + if (next > end) + next = end; + + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, 0); + ident_pud_init(info, pud, addr, next); + continue; + } + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + ident_pud_init(info, pud, addr, next); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long pstart, unsigned long pend) { @@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); - pud_t *pud; + p4d_t *p4d; next = (addr & PGDIR_MASK) + PGDIR_SIZE; if (next > end) next = end; if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); + p4d = p4d_offset(pgd, 0); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; continue; } - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) + p4d = (p4d_t *)info->alloc_pgt_page(info->context); + if (!p4d) return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* + * With p4d folded, pgd is equal to p4d. + * The pgd entry has to point to the pud page table in this case. + */ + pud_t *pud = pud_offset(p4d, 0); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } } return 0; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 4dddfaf6569a..7116a727fd5a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,6 +67,7 @@ bool __read_mostly __vmalloc_start_set = false; */ static pmd_t * __init one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -75,13 +76,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); return pmd_table; } #endif - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; @@ -390,8 +393,11 @@ pte_t *kmap_pte; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { - return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), - vaddr), vaddr), vaddr); + pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d = p4d_offset(pgd, vaddr); + pud_t *pud = pud_offset(p4d, vaddr); + pmd_t *pmd = pmd_offset(pud, vaddr); + return pte_offset_kernel(pmd, vaddr); } static void __init kmap_init(void) @@ -410,6 +416,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) { unsigned long vaddr; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -418,7 +425,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; @@ -450,6 +458,7 @@ void __init native_pagetable_init(void) { unsigned long pfn, va; pgd_t *pgd, *base = swapper_pg_dir; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -469,7 +478,8 @@ void __init native_pagetable_init(void) if (!pgd_present(*pgd)) break; - pud = pud_offset(pgd, va); + p4d = p4d_offset(pgd, va); + pud = pud_offset(p4d, va); pmd = pmd_offset(pud, va); if (!pmd_present(*pmd)) break; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 15173d37f399..7bdda6f1d135 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -97,28 +97,38 @@ void sync_global_pgds(unsigned long start, unsigned long end) unsigned long address; for (address = start; address <= end; address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); + pgd_t *pgd_ref = pgd_offset_k(address); + const p4d_t *p4d_ref; struct page *page; - if (pgd_none(*pgd_ref)) + /* + * With folded p4d, pgd_none() is always false, we need to + * handle synchonization on p4d level. + */ + BUILD_BUG_ON(pgd_none(*pgd_ref)); + p4d_ref = p4d_offset(pgd_ref, address); + + if (p4d_none(*p4d_ref)) continue; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + p4d_t *p4d; spinlock_t *pgt_lock; pgd = (pgd_t *)page_address(page) + pgd_index(address); + p4d = p4d_offset(pgd, address); /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) - BUG_ON(pgd_page_vaddr(*pgd) - != pgd_page_vaddr(*pgd_ref)); + if (!p4d_none(*p4d_ref) && !p4d_none(*p4d)) + BUG_ON(p4d_page_vaddr(*p4d) + != p4d_page_vaddr(*p4d_ref)); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); + if (p4d_none(*p4d)) + set_p4d(p4d, *p4d_ref); spin_unlock(pgt_lock); } @@ -149,16 +159,28 @@ static __ref void *spp_getpage(void) return ptr; } -static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) +static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr) { if (pgd_none(*pgd)) { - pud_t *pud = (pud_t *)spp_getpage(); - pgd_populate(&init_mm, pgd, pud); - if (pud != pud_offset(pgd, 0)) + p4d_t *p4d = (p4d_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, p4d); + if (p4d != p4d_offset(pgd, 0)) printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", - pud, pud_offset(pgd, 0)); + p4d, p4d_offset(pgd, 0)); + } + return p4d_offset(pgd, vaddr); +} + +static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr) +{ + if (p4d_none(*p4d)) { + pud_t *pud = (pud_t *)spp_getpage(); + p4d_populate(&init_mm, p4d, pud); + if (pud != pud_offset(p4d, 0)) + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pud, pud_offset(p4d, 0)); } - return pud_offset(pgd, vaddr); + return pud_offset(p4d, vaddr); } static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) @@ -167,7 +189,7 @@ static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); if (pmd != pmd_offset(pud, 0)) - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n", pmd, pmd_offset(pud, 0)); } return pmd_offset(pud, vaddr); @@ -179,20 +201,15 @@ static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); if (pte != pte_offset_kernel(pmd, 0)) - printk(KERN_ERR "PAGETABLE BUG #02!\n"); + printk(KERN_ERR "PAGETABLE BUG #03!\n"); } return pte_offset_kernel(pmd, vaddr); } -void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pud = pud_page + pud_index(vaddr); - pmd = fill_pmd(pud, vaddr); - pte = fill_pte(pmd, vaddr); + pmd_t *pmd = fill_pmd(pud, vaddr); + pte_t *pte = fill_pte(pmd, vaddr); set_pte(pte, new_pte); @@ -203,10 +220,25 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } +void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) +{ + p4d_t *p4d = p4d_page + p4d_index(vaddr); + pud_t *pud = fill_pud(p4d, vaddr); + + __set_pte_vaddr(pud, vaddr, new_pte); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud = pud_page + pud_index(vaddr); + + __set_pte_vaddr(pud, vaddr, new_pte); +} + void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; - pud_t *pud_page; + p4d_t *p4d_page; pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); @@ -216,17 +248,20 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } - pud_page = (pud_t*)pgd_page_vaddr(*pgd); - set_pte_vaddr_pud(pud_page, vaddr, pteval); + + p4d_page = p4d_offset(pgd, 0); + set_pte_vaddr_p4d(p4d_page, vaddr, pteval); } pmd_t * __init populate_extra_pmd(unsigned long vaddr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(vaddr); - pud = fill_pud(pgd, vaddr); + p4d = fill_p4d(pgd, vaddr); + pud = fill_pud(p4d, vaddr); return fill_pmd(pud, vaddr); } @@ -245,6 +280,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, enum page_cache_mode cache) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgprot_t prot; @@ -255,11 +291,17 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); if (pgd_none(*pgd)) { + p4d = (p4d_t *) spp_getpage(); + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE | + _PAGE_USER)); + } + p4d = p4d_offset(pgd, (unsigned long)__va(phys)); + if (p4d_none(*p4d)) { pud = (pud_t *) spp_getpage(); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE | _PAGE_USER)); } - pud = pud_offset(pgd, (unsigned long)__va(phys)); + pud = pud_offset(p4d, (unsigned long)__va(phys)); if (pud_none(*pud)) { pmd = (pmd_t *) spp_getpage(); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | @@ -563,12 +605,15 @@ kernel_physical_mapping_init(unsigned long paddr_start, for (; vaddr < vaddr_end; vaddr = vaddr_next) { pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d; pud_t *pud; vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; - if (pgd_val(*pgd)) { - pud = (pud_t *)pgd_page_vaddr(*pgd); + BUILD_BUG_ON(pgd_none(*pgd)); + p4d = p4d_offset(pgd, vaddr); + if (p4d_val(*p4d)) { + pud = (pud_t *)p4d_page_vaddr(*p4d); paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), page_size_mask); @@ -580,7 +625,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, page_size_mask); spin_lock(&init_mm.page_table_lock); - pgd_populate(&init_mm, pgd, pud); + p4d_populate(&init_mm, p4d, pud); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -726,6 +771,24 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) spin_unlock(&init_mm.page_table_lock); } +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + /* free a pud talbe */ + free_pagetable(p4d_page(*p4d), 0); + spin_lock(&init_mm.page_table_lock); + p4d_clear(p4d); + spin_unlock(&init_mm.page_table_lock); +} + static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, bool direct) @@ -908,6 +971,32 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, update_page_count(PG_LEVEL_1G, -pages); } +static void __meminit +remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pud_t *pud_base; + p4d_t *p4d; + + p4d = p4d_start + p4d_index(addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + BUILD_BUG_ON(p4d_large(*p4d)); + + pud_base = (pud_t *)p4d_page_vaddr(*p4d); + remove_pud_table(pud_base, addr, next, direct); + free_pud_table(pud_base, p4d); + } + + if (direct) + update_page_count(PG_LEVEL_512G, -pages); +} + /* start and end are both virtual address. */ static void __meminit remove_pagetable(unsigned long start, unsigned long end, bool direct) @@ -915,7 +1004,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) unsigned long next; unsigned long addr; pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); @@ -924,8 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) if (!pgd_present(*pgd)) continue; - pud = (pud_t *)pgd_page_vaddr(*pgd); - remove_pud_table(pud, addr, next, direct); + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + remove_p4d_table(p4d, addr, next, direct); } flush_tlb_all(); @@ -1090,6 +1179,7 @@ int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1101,7 +1191,11 @@ int kern_addr_valid(unsigned long addr) if (pgd_none(*pgd)) return 0; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return 0; + + pud = pud_offset(p4d, addr); if (pud_none(*pud)) return 0; @@ -1158,6 +1252,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long addr; unsigned long next; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -1168,7 +1263,11 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, if (!pgd) return -ENOMEM; - pud = vmemmap_pud_populate(pgd, addr, node); + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + + pud = vmemmap_pud_populate(p4d, addr, node); if (!pud) return -ENOMEM; @@ -1236,6 +1335,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, unsigned long end = (unsigned long)(start_page + size); unsigned long next; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; unsigned int nr_pages; @@ -1251,7 +1351,14 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO); + + pud = pud_offset(p4d, addr); if (pud_none(*pud)) { next = (addr + PAGE_SIZE) & PAGE_MASK; continue; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7aaa2635862d..a5e1cda85974 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -425,7 +425,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) /* Don't assume we're using swapper_pg_dir at this point */ pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(addr)]; - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); return pmd; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 8d63d7a104c3..7b81f01067f2 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -32,8 +32,19 @@ static int __init map_range(struct range *range) static void __init clear_pgds(unsigned long start, unsigned long end) { - for (; start < end; start += PGDIR_SIZE) - pgd_clear(pgd_offset_k(start)); + pgd_t *pgd; + + for (; start < end; start += PGDIR_SIZE) { + pgd = pgd_offset_k(start); + /* + * With folded p4d, pgd_clear() is nop, use p4d_clear() + * instead. + */ + if (CONFIG_PGTABLE_LEVELS < 5) + p4d_clear(p4d_offset(pgd, start)); + else + pgd_clear(pgd); + } } static void __init kasan_map_early_shadow(pgd_t *pgd) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 7940166c799b..19ad095b41df 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -30,30 +30,44 @@ #include <linux/limits.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> +#include <linux/compat.h> #include <asm/elf.h> struct va_alignment __read_mostly va_align = { .flags = -1, }; -static unsigned long stack_maxrandom_size(void) +unsigned long tasksize_32bit(void) +{ + return IA32_PAGE_OFFSET; +} + +unsigned long tasksize_64bit(void) +{ + return TASK_SIZE_MAX; +} + +static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT; + max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); + max <<= PAGE_SHIFT; } return max; } -/* - * Top of mmap area (just below the process stack). - * - * Leave an at least ~128 MB hole with possible stack randomization. - */ -#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) -#define MAX_GAP (TASK_SIZE/6*5) +#ifdef CONFIG_COMPAT +# define mmap32_rnd_bits mmap_rnd_compat_bits +# define mmap64_rnd_bits mmap_rnd_bits +#else +# define mmap32_rnd_bits mmap_rnd_bits +# define mmap64_rnd_bits mmap_rnd_bits +#endif + +#define SIZE_128M (128 * 1024 * 1024UL) static int mmap_is_legacy(void) { @@ -66,54 +80,91 @@ static int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -unsigned long arch_mmap_rnd(void) +static unsigned long arch_rnd(unsigned int rndbits) { - unsigned long rnd; - - if (mmap_is_ia32()) -#ifdef CONFIG_COMPAT - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); -#else - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); -#endif - else - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); + return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; +} - return rnd << PAGE_SHIFT; +unsigned long arch_mmap_rnd(void) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) { unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap_min, gap_max; + + /* + * Top of mmap area (just below the process stack). + * Leave an at least ~128 MB hole with possible stack randomization. + */ + gap_min = SIZE_128M + stack_maxrandom_size(task_size); + gap_max = (task_size / 6) * 5; - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; + if (gap < gap_min) + gap = gap_min; + else if (gap > gap_max) + gap = gap_max; + + return PAGE_ALIGN(task_size - gap - rnd); +} - return PAGE_ALIGN(TASK_SIZE - gap - rnd); +static unsigned long mmap_legacy_base(unsigned long rnd, + unsigned long task_size) +{ + return __TASK_UNMAPPED_BASE(task_size) + rnd; } /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ +static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, + unsigned long random_factor, unsigned long task_size) +{ + *legacy_base = mmap_legacy_base(random_factor, task_size); + if (mmap_is_legacy()) + *base = *legacy_base; + else + *base = mmap_base(random_factor, task_size); +} + void arch_pick_mmap_layout(struct mm_struct *mm) { - unsigned long random_factor = 0UL; + if (mmap_is_legacy()) + mm->get_unmapped_area = arch_get_unmapped_area; + else + mm->get_unmapped_area = arch_get_unmapped_area_topdown; - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); + arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, + arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* + * The mmap syscall mapping base decision depends solely on the + * syscall type (64-bit or compat). This applies for 64bit + * applications and 32bit applications. The 64bit syscall uses + * mmap_base, the compat syscall uses mmap_compat_base. + */ + arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, + arch_rnd(mmap32_rnd_bits), tasksize_32bit()); +#endif +} - mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; +unsigned long get_mmap_base(int is_legacy) +{ + struct mm_struct *mm = current->mm; - if (mmap_is_legacy()) { - mm->mmap_base = mm->mmap_legacy_base; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + if (in_compat_syscall()) { + return is_legacy ? mm->mmap_compat_legacy_base + : mm->mmap_compat_base; } +#endif + return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; } const char *arch_vma_name(struct vm_area_struct *vma) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 28d42130243c..b5949017dead 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -346,6 +346,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -354,7 +355,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, if (pgd_none(*pgd)) return NULL; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d)) + return NULL; + + *level = PG_LEVEL_512G; + if (p4d_large(*p4d) || !p4d_present(*p4d)) + return (pte_t *)p4d; + + pud = pud_offset(p4d, address); if (pud_none(*pud)) return NULL; @@ -406,13 +415,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, pmd_t *lookup_pmd_address(unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(address); if (pgd_none(*pgd)) return NULL; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) return NULL; @@ -477,11 +491,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); set_pte_atomic((pte_t *)pmd, pte); } @@ -836,9 +852,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) pud_clear(pud); } -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) { - pud_t *pud = pud_offset(pgd, start); + pud_t *pud = pud_offset(p4d, start); /* * Not on a GB page boundary? @@ -1004,8 +1020,8 @@ static long populate_pmd(struct cpa_data *cpa, return num_pages; } -static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, - pgprot_t pgprot) +static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, + pgprot_t pgprot) { pud_t *pud; unsigned long end; @@ -1026,7 +1042,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(int, (int)cpa->numpages, cur_pages); - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); /* * Need a PMD page? @@ -1047,7 +1063,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, if (cpa->numpages == cur_pages) return cur_pages; - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); pud_pgprot = pgprot_4k_2_large(pgprot); /* @@ -1067,7 +1083,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, if (start < end) { long tmp; - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; @@ -1090,33 +1106,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) { pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ + p4d_t *p4d; pgd_t *pgd_entry; long ret; pgd_entry = cpa->pgd + pgd_index(addr); + if (pgd_none(*pgd_entry)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!p4d) + return -1; + + set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + /* * Allocate a PUD page and hand it down for mapping. */ - if (pgd_none(*pgd_entry)) { + p4d = p4d_offset(pgd_entry, addr); + if (p4d_none(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); if (!pud) return -1; - set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); - ret = populate_pud(cpa, addr, pgd_entry, pgprot); + ret = populate_pud(cpa, addr, p4d, pgprot); if (ret < 0) { /* * Leave the PUD page in place in case some other CPU or thread * already found it, but remove any useless entries we just * added to it. */ - unmap_pud_range(pgd_entry, addr, + unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6cbdff26bb96..38b6daf72deb 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -261,13 +261,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { + p4d_t *p4d; pud_t *pud; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ return; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 9adce776852b..3d275a791c76 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) BUG(); return; } - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) { + BUG(); + return; + } + pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) { BUG(); return; diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index cef39b097649..3481268da3d0 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -68,7 +68,7 @@ pgd_t * __init efi_call_phys_prolog(void) load_cr3(initial_page_table); __flush_tlb_all(); - gdt_descr.address = __pa(get_cpu_gdt_table(0)); + gdt_descr.address = get_cpu_gdt_paddr(0); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); @@ -79,7 +79,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) { struct desc_ptr gdt_descr; - gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); + gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a4695da42d77..eb64e5b33e37 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -135,6 +135,7 @@ static pgd_t *efi_pgd; int __init efi_alloc_page_tables(void) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; gfp_t gfp_mask; @@ -147,15 +148,20 @@ int __init efi_alloc_page_tables(void) return -ENOMEM; pgd = efi_pgd + pgd_index(EFI_VA_END); + p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END); + if (!p4d) { + free_page((unsigned long)efi_pgd); + return -ENOMEM; + } - pud = pud_alloc_one(NULL, 0); + pud = pud_alloc(&init_mm, p4d, EFI_VA_END); if (!pud) { + if (CONFIG_PGTABLE_LEVELS > 4) + free_page((unsigned long) pgd_page_vaddr(*pgd)); free_page((unsigned long)efi_pgd); return -ENOMEM; } - pgd_populate(NULL, pgd, pud); - return 0; } @@ -166,6 +172,7 @@ void efi_sync_low_kernel_mappings(void) { unsigned num_entries; pgd_t *pgd_k, *pgd_efi; + p4d_t *p4d_k, *p4d_efi; pud_t *pud_k, *pud_efi; if (efi_enabled(EFI_OLD_MEMMAP)) @@ -190,23 +197,37 @@ void efi_sync_low_kernel_mappings(void) memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); /* + * As with PGDs, we share all P4D entries apart from the one entry + * that covers the EFI runtime mapping space. + */ + BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END)); + BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK)); + + pgd_efi = efi_pgd + pgd_index(EFI_VA_END); + pgd_k = pgd_offset_k(EFI_VA_END); + p4d_efi = p4d_offset(pgd_efi, 0); + p4d_k = p4d_offset(pgd_k, 0); + + num_entries = p4d_index(EFI_VA_END); + memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries); + + /* * We share all the PUD entries apart from those that map the * EFI regions. Copy around them. */ BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0); BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); - pgd_efi = efi_pgd + pgd_index(EFI_VA_END); - pud_efi = pud_offset(pgd_efi, 0); - - pgd_k = pgd_offset_k(EFI_VA_END); - pud_k = pud_offset(pgd_k, 0); + p4d_efi = p4d_offset(pgd_efi, EFI_VA_END); + p4d_k = p4d_offset(pgd_k, EFI_VA_END); + pud_efi = pud_offset(p4d_efi, 0); + pud_k = pud_offset(p4d_k, 0); num_entries = pud_index(EFI_VA_END); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); - pud_efi = pud_offset(pgd_efi, EFI_VA_START); - pud_k = pud_offset(pgd_k, EFI_VA_START); + pud_efi = pud_offset(p4d_efi, EFI_VA_START); + pud_k = pud_offset(p4d_k, EFI_VA_START); num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 66ade16c7693..6b05a9219ea2 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -95,7 +95,7 @@ static void __save_processor_state(struct saved_context *ctxt) * 'pmode_gdt' in wakeup_start. */ ctxt->gdt_desc.size = GDT_SIZE - 1; - ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); + ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id()); store_tr(ctxt->tr); @@ -162,7 +162,7 @@ static void fix_processor_context(void) int cpu = smp_processor_id(); struct tss_struct *t = &per_cpu(cpu_tss, cpu); #ifdef CONFIG_X86_64 - struct desc_struct *desc = get_cpu_gdt_table(cpu); + struct desc_struct *desc = get_cpu_gdt_rw(cpu); tss_desc tss; #endif set_tss_desc(cpu, t); /* @@ -183,6 +183,9 @@ static void fix_processor_context(void) load_mm_ldt(current->active_mm); /* This does lldt */ fpu__resume_cpu(); + + /* The processor is back on the direct GDT, load back the fixmap */ + load_fixmap_gdt(cpu); } /** diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 9f14bd34581d..c35fdb585c68 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -32,6 +32,7 @@ pgd_t *resume_pg_dir; */ static pmd_t *resume_one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -41,11 +42,13 @@ static pmd_t *resume_one_md_table_init(pgd_t *pgd) return NULL; set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); #else - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); #endif diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index ded2e8272382..2a9f993bbbf0 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -49,6 +49,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; + p4d_t *p4d; /* * The new mapping only has to cover the page containing the image @@ -63,6 +64,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) * the virtual address space after switching over to the original page * tables used by the image kernel. */ + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); + if (!p4d) + return -ENOMEM; + } + pud = (pud_t *)get_safe_page(GFP_ATOMIC); if (!pud) return -ENOMEM; @@ -75,8 +83,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); set_pud(pud + pud_index(restore_jump_address), __pud(__pa(pmd) | _KERNPG_TABLE)); - set_pgd(pgd + pgd_index(restore_jump_address), - __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* No p4d for 4-level paging: point the pgd to the pud page table */ + set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE)); + } return 0; } @@ -124,7 +137,10 @@ static int set_up_temporary_mappings(void) static int relocate_restore_code(void) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; + pmd_t *pmd; + pte_t *pte; relocated_restore_code = get_safe_page(GFP_ATOMIC); if (!relocated_restore_code) @@ -134,22 +150,25 @@ static int relocate_restore_code(void) /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); - pud = pud_offset(pgd, relocated_restore_code); + p4d = p4d_offset(pgd, relocated_restore_code); + if (p4d_large(*p4d)) { + set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); + goto out; + } + pud = pud_offset(p4d, relocated_restore_code); if (pud_large(*pud)) { set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); - } else { - pmd_t *pmd = pmd_offset(pud, relocated_restore_code); - - if (pmd_large(*pmd)) { - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); - } else { - pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); - } + goto out; + } + pmd = pmd_offset(pud, relocated_restore_code); + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + goto out; } + pte = pte_offset_kernel(pmd, relocated_restore_code); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); +out: __flush_tlb_all(); - return 0; } diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index b6d5c8946e66..470edad96bb9 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -10,22 +10,19 @@ * Version 2. See the file COPYING for more details. */ +#include <linux/bug.h> +#include <asm/purgatory.h> + #include "sha256.h" -#include "purgatory.h" #include "../boot/string.h" -struct sha_region { - unsigned long start; - unsigned long len; -}; - -static unsigned long backup_dest; -static unsigned long backup_src; -static unsigned long backup_sz; +unsigned long purgatory_backup_dest __section(.kexec-purgatory); +unsigned long purgatory_backup_src __section(.kexec-purgatory); +unsigned long purgatory_backup_sz __section(.kexec-purgatory); -static u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 }; +u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(.kexec-purgatory); -struct sha_region sha_regions[16] = {}; +struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(.kexec-purgatory); /* * On x86, second kernel requries first 640K of memory to boot. Copy @@ -34,26 +31,28 @@ struct sha_region sha_regions[16] = {}; */ static int copy_backup_region(void) { - if (backup_dest) - memcpy((void *)backup_dest, (void *)backup_src, backup_sz); - + if (purgatory_backup_dest) { + memcpy((void *)purgatory_backup_dest, + (void *)purgatory_backup_src, purgatory_backup_sz); + } return 0; } static int verify_sha256_digest(void) { - struct sha_region *ptr, *end; + struct kexec_sha_region *ptr, *end; u8 digest[SHA256_DIGEST_SIZE]; struct sha256_state sctx; sha256_init(&sctx); - end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])]; - for (ptr = sha_regions; ptr < end; ptr++) + end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions); + + for (ptr = purgatory_sha_regions; ptr < end; ptr++) sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len); sha256_final(&sctx, digest); - if (memcmp(digest, sha256_digest, sizeof(digest))) + if (memcmp(digest, purgatory_sha256_digest, sizeof(digest))) return 1; return 0; diff --git a/arch/x86/purgatory/purgatory.h b/arch/x86/purgatory/purgatory.h deleted file mode 100644 index e2e365a6c192..000000000000 --- a/arch/x86/purgatory/purgatory.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef PURGATORY_H -#define PURGATORY_H - -#ifndef __ASSEMBLY__ -extern void purgatory(void); -#endif /* __ASSEMBLY__ */ - -#endif /* PURGATORY_H */ diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S index f90e9dfa90bb..dfae9b9e60b5 100644 --- a/arch/x86/purgatory/setup-x86_64.S +++ b/arch/x86/purgatory/setup-x86_64.S @@ -9,7 +9,7 @@ * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ -#include "purgatory.h" +#include <asm/purgatory.h> .text .globl purgatory_start diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h index bd15a4127735..2867d9825a57 100644 --- a/arch/x86/purgatory/sha256.h +++ b/arch/x86/purgatory/sha256.h @@ -10,7 +10,6 @@ #ifndef SHA256_H #define SHA256_H - #include <linux/types.h> #include <crypto/sha.h> diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index e7e7055a8658..69f0827d5f53 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -16,7 +16,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \ ifeq ($(CONFIG_X86_32),y) -obj-y += checksum_32.o +obj-y += checksum_32.o syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h index e59eef20647b..b291ca5cf66b 100644 --- a/arch/x86/um/asm/ptrace.h +++ b/arch/x86/um/asm/ptrace.h @@ -78,7 +78,7 @@ static inline int ptrace_set_thread_area(struct task_struct *child, int idx, return -ENOSYS; } -extern long arch_prctl(struct task_struct *task, int code, +extern long arch_prctl(struct task_struct *task, int option, unsigned long __user *addr); #endif diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c index 96eb2bd28832..8431e87ac333 100644 --- a/arch/x86/um/os-Linux/prctl.c +++ b/arch/x86/um/os-Linux/prctl.c @@ -6,7 +6,7 @@ #include <sys/ptrace.h> #include <asm/ptrace.h> -int os_arch_prctl(int pid, int code, unsigned long *addr) +int os_arch_prctl(int pid, int option, unsigned long *arg2) { - return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) addr, code); + return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) arg2, option); } diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c new file mode 100644 index 000000000000..627d68836b16 --- /dev/null +++ b/arch/x86/um/syscalls_32.c @@ -0,0 +1,7 @@ +#include <linux/syscalls.h> +#include <os.h> + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return -EINVAL; +} diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index 10d907098c26..58f51667e2e4 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -7,13 +7,15 @@ #include <linux/sched.h> #include <linux/sched/mm.h> +#include <linux/syscalls.h> #include <linux/uaccess.h> #include <asm/prctl.h> /* XXX This should get the constants from libc */ #include <os.h> -long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) +long arch_prctl(struct task_struct *task, int option, + unsigned long __user *arg2) { - unsigned long *ptr = addr, tmp; + unsigned long *ptr = arg2, tmp; long ret; int pid = task->mm->context.id.u.pid; @@ -30,7 +32,7 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) * arch_prctl is run on the host, then the registers are read * back. */ - switch (code) { + switch (option) { case ARCH_SET_FS: case ARCH_SET_GS: ret = restore_registers(pid, ¤t->thread.regs.regs); @@ -50,11 +52,11 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) ptr = &tmp; } - ret = os_arch_prctl(pid, code, ptr); + ret = os_arch_prctl(pid, option, ptr); if (ret) return ret; - switch (code) { + switch (option) { case ARCH_SET_FS: current->thread.arch.fs = (unsigned long) ptr; ret = save_registers(pid, ¤t->thread.regs.regs); @@ -63,19 +65,19 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) ret = save_registers(pid, ¤t->thread.regs.regs); break; case ARCH_GET_FS: - ret = put_user(tmp, addr); + ret = put_user(tmp, arg2); break; case ARCH_GET_GS: - ret = put_user(tmp, addr); + ret = put_user(tmp, arg2); break; } return ret; } -long sys_arch_prctl(int code, unsigned long addr) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return arch_prctl(current, code, (unsigned long __user *) addr); + return arch_prctl(current, option, (unsigned long __user *) arg2); } void arch_switch_to(struct task_struct *to) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bc3dab5d47ca..6efa0cc425a2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -710,7 +710,7 @@ static void load_TLS_descriptor(struct thread_struct *t, *shadow = t->tls_array[i]; - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); mc = __xen_mc_entry(0); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 37cb5aad71de..4d4b7bc48f5d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -535,40 +535,41 @@ static pgd_t *xen_get_user_pgd(pgd_t *pgd) return user_ptr; } -static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) { struct mmu_update u; u.ptr = virt_to_machine(ptr).maddr; - u.val = pgd_val_ma(val); + u.val = p4d_val_ma(val); xen_extend_mmu_update(&u); } /* - * Raw hypercall-based set_pgd, intended for in early boot before + * Raw hypercall-based set_p4d, intended for in early boot before * there's a page structure. This implies: * 1. The only existing pagetable is the kernel's * 2. It is always pinned * 3. It has no user pagetable attached to it */ -static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) { preempt_disable(); xen_mc_batch(); - __xen_set_pgd_hyper(ptr, val); + __xen_set_p4d_hyper(ptr, val); xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); } -static void xen_set_pgd(pgd_t *ptr, pgd_t val) +static void xen_set_p4d(p4d_t *ptr, p4d_t val) { - pgd_t *user_ptr = xen_get_user_pgd(ptr); + pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr); + pgd_t pgd_val; - trace_xen_mmu_set_pgd(ptr, user_ptr, val); + trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val); /* If page is not pinned, we can just update the entry directly */ @@ -576,7 +577,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) *ptr = val; if (user_ptr) { WARN_ON(xen_page_pinned(user_ptr)); - *user_ptr = val; + pgd_val.pgd = p4d_val_ma(val); + *user_ptr = pgd_val; } return; } @@ -585,14 +587,72 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) user updates together. */ xen_mc_batch(); - __xen_set_pgd_hyper(ptr, val); + __xen_set_p4d_hyper(ptr, val); if (user_ptr) - __xen_set_pgd_hyper(user_ptr, val); + __xen_set_p4d_hyper((p4d_t *)user_ptr, val); xen_mc_issue(PARAVIRT_LAZY_MMU); } #endif /* CONFIG_PGTABLE_LEVELS == 4 */ +static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD; + for (i = 0; i < nr; i++) { + if (!pmd_none(pmd[i])) + flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE); + } + return flush; +} + +static int xen_pud_walk(struct mm_struct *mm, pud_t *pud, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD; + for (i = 0; i < nr; i++) { + pmd_t *pmd; + + if (pud_none(pud[i])) + continue; + + pmd = pmd_offset(&pud[i], 0); + if (PTRS_PER_PMD > 1) + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); + flush |= xen_pmd_walk(mm, pmd, func, + last && i == nr - 1, limit); + } + return flush; +} + +static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; + for (i = 0; i < nr; i++) { + pud_t *pud; + + if (p4d_none(p4d[i])) + continue; + + pud = pud_offset(&p4d[i], 0); + if (PTRS_PER_PUD > 1) + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); + flush |= xen_pud_walk(mm, pud, func, + last && i == nr - 1, limit); + } + return flush; +} + /* * (Yet another) pagetable walker. This one is intended for pinning a * pagetable. This means that it walks a pagetable and calls the @@ -613,10 +673,8 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, enum pt_level), unsigned long limit) { - int flush = 0; + int i, nr, flush = 0; unsigned hole_low, hole_high; - unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; - unsigned pgdidx, pudidx, pmdidx; /* The limit is the last byte to be touched */ limit--; @@ -633,65 +691,22 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, hole_low = pgd_index(USER_LIMIT); hole_high = pgd_index(PAGE_OFFSET); - pgdidx_limit = pgd_index(limit); -#if PTRS_PER_PUD > 1 - pudidx_limit = pud_index(limit); -#else - pudidx_limit = 0; -#endif -#if PTRS_PER_PMD > 1 - pmdidx_limit = pmd_index(limit); -#else - pmdidx_limit = 0; -#endif - - for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { - pud_t *pud; + nr = pgd_index(limit) + 1; + for (i = 0; i < nr; i++) { + p4d_t *p4d; - if (pgdidx >= hole_low && pgdidx < hole_high) + if (i >= hole_low && i < hole_high) continue; - if (!pgd_val(pgd[pgdidx])) + if (pgd_none(pgd[i])) continue; - pud = pud_offset(&pgd[pgdidx], 0); - - if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(mm, virt_to_page(pud), PT_PUD); - - for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { - pmd_t *pmd; - - if (pgdidx == pgdidx_limit && - pudidx > pudidx_limit) - goto out; - - if (pud_none(pud[pudidx])) - continue; - - pmd = pmd_offset(&pud[pudidx], 0); - - if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); - - for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { - struct page *pte; - - if (pgdidx == pgdidx_limit && - pudidx == pudidx_limit && - pmdidx > pmdidx_limit) - goto out; - - if (pmd_none(pmd[pmdidx])) - continue; - - pte = pmd_page(pmd[pmdidx]); - flush |= (*func)(mm, pte, PT_PTE); - } - } + p4d = p4d_offset(&pgd[i], 0); + if (PTRS_PER_P4D > 1) + flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); + flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); } -out: /* Do the top level last, so that the callbacks can use it as a cue to do final things like tlb flushes. */ flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); @@ -1150,57 +1165,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) xen_free_ro_pages(pa, PAGE_SIZE); } +static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) +{ + unsigned long pa; + pte_t *pte_tbl; + int i; + + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + return; + } + + pte_tbl = pte_offset_kernel(pmd, 0); + for (i = 0; i < PTRS_PER_PTE; i++) { + if (pte_none(pte_tbl[i])) + continue; + pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + set_pmd(pmd, __pmd(0)); + xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin); +} + +static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) +{ + unsigned long pa; + pmd_t *pmd_tbl; + int i; + + if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + return; + } + + pmd_tbl = pmd_offset(pud, 0); + for (i = 0; i < PTRS_PER_PMD; i++) { + if (pmd_none(pmd_tbl[i])) + continue; + xen_cleanmfnmap_pmd(pmd_tbl + i, unpin); + } + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin); +} + +static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) +{ + unsigned long pa; + pud_t *pud_tbl; + int i; + + if (p4d_large(*p4d)) { + pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, P4D_SIZE); + return; + } + + pud_tbl = pud_offset(p4d, 0); + for (i = 0; i < PTRS_PER_PUD; i++) { + if (pud_none(pud_tbl[i])) + continue; + xen_cleanmfnmap_pud(pud_tbl + i, unpin); + } + set_p4d(p4d, __p4d(0)); + xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin); +} + /* * Since it is well isolated we can (and since it is perhaps large we should) * also free the page tables mapping the initial P->M table. */ static void __init xen_cleanmfnmap(unsigned long vaddr) { - unsigned long va = vaddr & PMD_MASK; - unsigned long pa; - pgd_t *pgd = pgd_offset_k(va); - pud_t *pud_page = pud_offset(pgd, 0); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pgd_t *pgd; + p4d_t *p4d; unsigned int i; bool unpin; unpin = (vaddr == 2 * PGDIR_SIZE); - set_pgd(pgd, __pgd(0)); - do { - pud = pud_page + pud_index(va); - if (pud_none(*pud)) { - va += PUD_SIZE; - } else if (pud_large(*pud)) { - pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, PUD_SIZE); - va += PUD_SIZE; - } else { - pmd = pmd_offset(pud, va); - if (pmd_large(*pmd)) { - pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, PMD_SIZE); - } else if (!pmd_none(*pmd)) { - pte = pte_offset_kernel(pmd, va); - set_pmd(pmd, __pmd(0)); - for (i = 0; i < PTRS_PER_PTE; ++i) { - if (pte_none(pte[i])) - break; - pa = pte_pfn(pte[i]) << PAGE_SHIFT; - xen_free_ro_pages(pa, PAGE_SIZE); - } - xen_cleanmfnmap_free_pgtbl(pte, unpin); - } - va += PMD_SIZE; - if (pmd_index(va)) - continue; - set_pud(pud, __pud(0)); - xen_cleanmfnmap_free_pgtbl(pmd, unpin); - } - - } while (pud_index(va) || pmd_index(va)); - xen_cleanmfnmap_free_pgtbl(pud_page, unpin); + vaddr &= PMD_MASK; + pgd = pgd_offset_k(vaddr); + p4d = p4d_offset(pgd, 0); + for (i = 0; i < PTRS_PER_P4D; i++) { + if (p4d_none(p4d[i])) + continue; + xen_cleanmfnmap_p4d(p4d + i, unpin); + } + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(0)); + xen_cleanmfnmap_free_pgtbl(p4d, unpin); + } } static void __init xen_pagetable_p2m_free(void) @@ -1538,7 +1593,6 @@ static int xen_pgd_alloc(struct mm_struct *mm) BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); } #endif - return ret; } @@ -1730,7 +1784,7 @@ static void xen_release_pmd(unsigned long pfn) xen_release_ptpage(pfn, PT_PMD); } -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); @@ -2071,21 +2125,27 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) */ void __init xen_relocate_p2m(void) { - phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; - int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; pte_t *pt; pmd_t *pmd; pud_t *pud; + p4d_t *p4d = NULL; pgd_t *pgd; unsigned long *new_p2m; + int save_pud; size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; - n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; - n_frames = n_pte + n_pt + n_pmd + n_pud; + n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; + if (PTRS_PER_P4D > 1) + n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + else + n_p4d = 0; + n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; new_area = xen_find_free_area(PFN_PHYS(n_frames)); if (!new_area) { @@ -2101,55 +2161,76 @@ void __init xen_relocate_p2m(void) * To avoid any possible virtual address collision, just use * 2 * PUD_SIZE for the new area. */ - pud_phys = new_area; + p4d_phys = new_area; + pud_phys = p4d_phys + PFN_PHYS(n_p4d); pmd_phys = pud_phys + PFN_PHYS(n_pud); pt_phys = pmd_phys + PFN_PHYS(n_pmd); p2m_pfn = PFN_DOWN(pt_phys) + n_pt; pgd = __va(read_cr3()); new_p2m = (unsigned long *)(2 * PGDIR_SIZE); - for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { - pud = early_memremap(pud_phys, PAGE_SIZE); - clear_page(pud); - for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); - idx_pmd++) { - pmd = early_memremap(pmd_phys, PAGE_SIZE); - clear_page(pmd); - for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); - idx_pt++) { - pt = early_memremap(pt_phys, PAGE_SIZE); - clear_page(pt); - for (idx_pte = 0; - idx_pte < min(n_pte, PTRS_PER_PTE); - idx_pte++) { - set_pte(pt + idx_pte, - pfn_pte(p2m_pfn, PAGE_KERNEL)); - p2m_pfn++; + idx_p4d = 0; + save_pud = n_pud; + do { + if (n_p4d > 0) { + p4d = early_memremap(p4d_phys, PAGE_SIZE); + clear_page(p4d); + n_pud = min(save_pud, PTRS_PER_P4D); + } + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; } - n_pte -= PTRS_PER_PTE; - early_memunmap(pt, PAGE_SIZE); - make_lowmem_page_readonly(__va(pt_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, - PFN_DOWN(pt_phys)); - set_pmd(pmd + idx_pt, - __pmd(_PAGE_TABLE | pt_phys)); - pt_phys += PAGE_SIZE; + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; } - n_pt -= PTRS_PER_PMD; - early_memunmap(pmd, PAGE_SIZE); - make_lowmem_page_readonly(__va(pmd_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, - PFN_DOWN(pmd_phys)); - set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); - pmd_phys += PAGE_SIZE; + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + if (n_p4d > 0) + set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); + else + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; } - n_pmd -= PTRS_PER_PUD; - early_memunmap(pud, PAGE_SIZE); - make_lowmem_page_readonly(__va(pud_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); - set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); - pud_phys += PAGE_SIZE; - } + if (n_p4d > 0) { + save_pud -= PTRS_PER_P4D; + early_memunmap(p4d, PAGE_SIZE); + make_lowmem_page_readonly(__va(p4d_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); + set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); + p4d_phys += PAGE_SIZE; + } + } while (++idx_p4d < n_p4d); /* Now copy the old p2m info to the new area. */ memcpy(new_p2m, xen_p2m_addr, size); @@ -2326,6 +2407,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: + case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: /* All local page mappings */ pte = pfn_pte(phys, prot); break; @@ -2378,8 +2460,8 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; -#if CONFIG_PGTABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = xen_set_pgd; +#if CONFIG_PGTABLE_LEVELS >= 4 + pv_mmu_ops.set_p4d = xen_set_p4d; #endif /* This will work as long as patching hasn't happened yet @@ -2388,7 +2470,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.alloc_pmd = xen_alloc_pmd; pv_mmu_ops.release_pte = xen_release_pte; pv_mmu_ops.release_pmd = xen_release_pmd; -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 pv_mmu_ops.alloc_pud = xen_alloc_pud; pv_mmu_ops.release_pud = xen_release_pud; #endif @@ -2454,10 +2536,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 .pud_val = PV_CALLEE_SAVE(xen_pud_val), .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_pgd = xen_set_pgd_hyper, + .set_p4d = xen_set_p4d_hyper, .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 73809bb951b4..3fe2b3292915 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -5,6 +5,7 @@ enum pt_level { PT_PGD, + PT_P4D, PT_PUD, PT_PMD, PT_PTE diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7ff2f1bfb7ec..eaa36162ed4a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -392,7 +392,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) if (ctxt == NULL) return -ENOMEM; - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); #ifdef CONFIG_X86_32 ctxt->user_regs.fs = __KERNEL_PERCPU; diff --git a/block/bio.c b/block/bio.c index 5eec5e08417f..e75878f8b14a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -376,10 +376,14 @@ static void punt_bios_to_rescuer(struct bio_set *bs) bio_list_init(&punt); bio_list_init(&nopunt); - while ((bio = bio_list_pop(current->bio_list))) + while ((bio = bio_list_pop(¤t->bio_list[0]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + current->bio_list[0] = nopunt; - *current->bio_list = nopunt; + bio_list_init(&nopunt); + while ((bio = bio_list_pop(¤t->bio_list[1]))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + current->bio_list[1] = nopunt; spin_lock(&bs->rescue_lock); bio_list_merge(&bs->rescue_list, &punt); @@ -466,7 +470,9 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) * we retry with the original gfp_flags. */ - if (current->bio_list && !bio_list_empty(current->bio_list)) + if (current->bio_list && + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1]))) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); diff --git a/block/blk-core.c b/block/blk-core.c index 0eeb99ef654f..d772c221cc17 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1973,7 +1973,14 @@ end_io: */ blk_qc_t generic_make_request(struct bio *bio) { - struct bio_list bio_list_on_stack; + /* + * bio_list_on_stack[0] contains bios submitted by the current + * make_request_fn. + * bio_list_on_stack[1] contains bios that were submitted before + * the current make_request_fn, but that haven't been processed + * yet. + */ + struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; if (!generic_make_request_checks(bio)) @@ -1990,7 +1997,7 @@ blk_qc_t generic_make_request(struct bio *bio) * should be added at the tail */ if (current->bio_list) { - bio_list_add(current->bio_list, bio); + bio_list_add(¤t->bio_list[0], bio); goto out; } @@ -2009,18 +2016,17 @@ blk_qc_t generic_make_request(struct bio *bio) * bio_list, and call into ->make_request() again. */ BUG_ON(bio->bi_next); - bio_list_init(&bio_list_on_stack); - current->bio_list = &bio_list_on_stack; + bio_list_init(&bio_list_on_stack[0]); + current->bio_list = bio_list_on_stack; do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (likely(blk_queue_enter(q, false) == 0)) { - struct bio_list hold; struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ - hold = bio_list_on_stack; - bio_list_init(&bio_list_on_stack); + bio_list_on_stack[1] = bio_list_on_stack[0]; + bio_list_init(&bio_list_on_stack[0]); ret = q->make_request_fn(q, bio); blk_queue_exit(q); @@ -2030,19 +2036,19 @@ blk_qc_t generic_make_request(struct bio *bio) */ bio_list_init(&lower); bio_list_init(&same); - while ((bio = bio_list_pop(&bio_list_on_stack)) != NULL) + while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) if (q == bdev_get_queue(bio->bi_bdev)) bio_list_add(&same, bio); else bio_list_add(&lower, bio); /* now assemble so we handle the lowest level first */ - bio_list_merge(&bio_list_on_stack, &lower); - bio_list_merge(&bio_list_on_stack, &same); - bio_list_merge(&bio_list_on_stack, &hold); + bio_list_merge(&bio_list_on_stack[0], &lower); + bio_list_merge(&bio_list_on_stack[0], &same); + bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } else { bio_io_error(bio); } - bio = bio_list_pop(current->bio_list); + bio = bio_list_pop(&bio_list_on_stack[0]); } while (bio); current->bio_list = NULL; /* deactivate */ diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e48bc2c72615..9d97bfc4d465 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -295,6 +295,9 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) for (i = 0; i < set->nr_hw_queues; i++) { struct blk_mq_tags *tags = set->tags[i]; + if (!tags) + continue; + for (j = 0; j < tags->nr_tags; j++) { if (!tags->static_rqs[j]) continue; diff --git a/block/blk-mq.c b/block/blk-mq.c index 159187a28d66..a4546f060e80 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1434,7 +1434,8 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) +static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, + bool may_sleep) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { @@ -1475,7 +1476,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) } insert: - blk_mq_sched_insert_request(rq, false, true, true, false); + blk_mq_sched_insert_request(rq, false, true, false, may_sleep); } /* @@ -1569,11 +1570,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_try_issue_directly(old_rq, &cookie); + blk_mq_try_issue_directly(old_rq, &cookie, false); rcu_read_unlock(); } else { srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); - blk_mq_try_issue_directly(old_rq, &cookie); + blk_mq_try_issue_directly(old_rq, &cookie, true); srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx); } goto done; diff --git a/crypto/af_alg.c b/crypto/af_alg.c index f5e18c2a4852..690deca17c35 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -266,7 +266,7 @@ unlock: return err; } -int af_alg_accept(struct sock *sk, struct socket *newsock) +int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern) { struct alg_sock *ask = alg_sk(sk); const struct af_alg_type *type; @@ -281,7 +281,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock) if (!type) goto unlock; - sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, 0); + sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, kern); err = -ENOMEM; if (!sk2) goto unlock; @@ -323,9 +323,10 @@ unlock: } EXPORT_SYMBOL_GPL(af_alg_accept); -static int alg_accept(struct socket *sock, struct socket *newsock, int flags) +static int alg_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { - return af_alg_accept(sock->sk, newsock); + return af_alg_accept(sock->sk, newsock, kern); } static const struct proto_ops alg_proto_ops = { diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 54fc90e8339c..5e92bd275ef3 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -239,7 +239,8 @@ unlock: return err ?: len; } -static int hash_accept(struct socket *sock, struct socket *newsock, int flags) +static int hash_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); @@ -260,7 +261,7 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags) if (err) return err; - err = af_alg_accept(ask->parent, newsock); + err = af_alg_accept(ask->parent, newsock, kern); if (err) return err; @@ -378,7 +379,7 @@ static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg, } static int hash_accept_nokey(struct socket *sock, struct socket *newsock, - int flags) + int flags, bool kern) { int err; @@ -386,7 +387,7 @@ static int hash_accept_nokey(struct socket *sock, struct socket *newsock, if (err) return err; - return hash_accept(sock, newsock, flags); + return hash_accept(sock, newsock, flags, kern); } static struct proto_ops algif_hash_ops_nokey = { diff --git a/drivers/ata/ahci_qoriq.c b/drivers/ata/ahci_qoriq.c index 85d833289f28..4c96f3ac4976 100644 --- a/drivers/ata/ahci_qoriq.c +++ b/drivers/ata/ahci_qoriq.c @@ -177,7 +177,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv) case AHCI_LS1043A: if (!qpriv->ecc_addr) return -EINVAL; - writel(ECC_DIS_ARMV8_CH2, qpriv->ecc_addr); + writel(readl(qpriv->ecc_addr) | ECC_DIS_ARMV8_CH2, + qpriv->ecc_addr); writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1); writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS); if (qpriv->is_dmacoherent) @@ -194,7 +195,8 @@ static int ahci_qoriq_phy_init(struct ahci_host_priv *hpriv) case AHCI_LS1046A: if (!qpriv->ecc_addr) return -EINVAL; - writel(ECC_DIS_ARMV8_CH2, qpriv->ecc_addr); + writel(readl(qpriv->ecc_addr) | ECC_DIS_ARMV8_CH2, + qpriv->ecc_addr); writel(AHCI_PORT_PHY_1_CFG, reg_base + PORT_PHY1); writel(AHCI_PORT_TRANS_CFG, reg_base + PORT_TRANS); if (qpriv->is_dmacoherent) diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 2bd92dca3e62..274d6d7193d7 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -1482,7 +1482,6 @@ unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc) break; default: - WARN_ON_ONCE(1); return AC_ERR_SYSTEM; } diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c index 46698232e6bf..19e6e539a061 100644 --- a/drivers/ata/libata-transport.c +++ b/drivers/ata/libata-transport.c @@ -224,7 +224,6 @@ static DECLARE_TRANSPORT_CLASS(ata_port_class, static void ata_tport_release(struct device *dev) { - put_device(dev->parent); } /** @@ -284,7 +283,7 @@ int ata_tport_add(struct device *parent, device_initialize(dev); dev->type = &ata_port_type; - dev->parent = get_device(parent); + dev->parent = parent; dev->release = ata_tport_release; dev_set_name(dev, "ata%d", ap->print_id); transport_setup_device(dev); @@ -348,7 +347,6 @@ static DECLARE_TRANSPORT_CLASS(ata_link_class, static void ata_tlink_release(struct device *dev) { - put_device(dev->parent); } /** @@ -410,7 +408,7 @@ int ata_tlink_add(struct ata_link *link) int error; device_initialize(dev); - dev->parent = get_device(&ap->tdev); + dev->parent = &ap->tdev; dev->release = ata_tlink_release; if (ata_is_host_link(link)) dev_set_name(dev, "link%d", ap->print_id); @@ -589,7 +587,6 @@ static DECLARE_TRANSPORT_CLASS(ata_dev_class, static void ata_tdev_release(struct device *dev) { - put_device(dev->parent); } /** @@ -662,7 +659,7 @@ static int ata_tdev_add(struct ata_device *ata_dev) int error; device_initialize(dev); - dev->parent = get_device(&link->tdev); + dev->parent = &link->tdev; dev->release = ata_tdev_release; if (ata_is_host_link(link)) dev_set_name(dev, "dev%d.%d", ap->print_id,ata_dev->devno); diff --git a/drivers/char/hw_random/omap-rng.c b/drivers/char/hw_random/omap-rng.c index 3ad86fdf954e..b1ad12552b56 100644 --- a/drivers/char/hw_random/omap-rng.c +++ b/drivers/char/hw_random/omap-rng.c @@ -397,9 +397,8 @@ static int of_get_omap_rng_device_details(struct omap_rng_dev *priv, irq, err); return err; } - omap_rng_write(priv, RNG_INTMASK_REG, RNG_SHUTDOWN_OFLO_MASK); - priv->clk = of_clk_get(pdev->dev.of_node, 0); + priv->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(priv->clk) && PTR_ERR(priv->clk) == -EPROBE_DEFER) return -EPROBE_DEFER; if (!IS_ERR(priv->clk)) { @@ -408,6 +407,19 @@ static int of_get_omap_rng_device_details(struct omap_rng_dev *priv, dev_err(&pdev->dev, "unable to enable the clk, " "err = %d\n", err); } + + /* + * On OMAP4, enabling the shutdown_oflo interrupt is + * done in the interrupt mask register. There is no + * such register on EIP76, and it's enabled by the + * same bit in the control register + */ + if (priv->pdata->regs[RNG_INTMASK_REG]) + omap_rng_write(priv, RNG_INTMASK_REG, + RNG_SHUTDOWN_OFLO_MASK); + else + omap_rng_write(priv, RNG_CONTROL_REG, + RNG_SHUTDOWN_OFLO_MASK); } return 0; } diff --git a/drivers/char/random.c b/drivers/char/random.c index 1ef26403bcc8..0ab024918907 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -313,13 +313,6 @@ static int random_read_wakeup_bits = 64; static int random_write_wakeup_bits = 28 * OUTPUT_POOL_WORDS; /* - * The minimum number of seconds between urandom pool reseeding. We - * do this to limit the amount of entropy that can be drained from the - * input pool even if there are heavy demands on /dev/urandom. - */ -static int random_min_urandom_seed = 60; - -/* * Originally, we used a primitive polynomial of degree .poolwords * over GF(2). The taps for various sizes are defined below. They * were chosen to be evenly spaced except for the last tap, which is 1 @@ -409,7 +402,6 @@ static struct poolinfo { */ static DECLARE_WAIT_QUEUE_HEAD(random_read_wait); static DECLARE_WAIT_QUEUE_HEAD(random_write_wait); -static DECLARE_WAIT_QUEUE_HEAD(urandom_init_wait); static struct fasync_struct *fasync; static DEFINE_SPINLOCK(random_ready_list_lock); @@ -467,7 +459,6 @@ struct entropy_store { int entropy_count; int entropy_total; unsigned int initialized:1; - unsigned int limit:1; unsigned int last_data_init:1; __u8 last_data[EXTRACT_SIZE]; }; @@ -485,7 +476,6 @@ static __u32 blocking_pool_data[OUTPUT_POOL_WORDS] __latent_entropy; static struct entropy_store input_pool = { .poolinfo = &poolinfo_table[0], .name = "input", - .limit = 1, .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock), .pool = input_pool_data }; @@ -493,7 +483,6 @@ static struct entropy_store input_pool = { static struct entropy_store blocking_pool = { .poolinfo = &poolinfo_table[1], .name = "blocking", - .limit = 1, .pull = &input_pool, .lock = __SPIN_LOCK_UNLOCKED(blocking_pool.lock), .pool = blocking_pool_data, @@ -855,13 +844,6 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) spin_unlock_irqrestore(&primary_crng.lock, flags); } -static inline void maybe_reseed_primary_crng(void) -{ - if (crng_init > 2 && - time_after(jiffies, primary_crng.init_time + CRNG_RESEED_INTERVAL)) - crng_reseed(&primary_crng, &input_pool); -} - static inline void crng_wait_ready(void) { wait_event_interruptible(crng_init_wait, crng_ready()); @@ -1220,15 +1202,6 @@ static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes) r->entropy_count > r->poolinfo->poolfracbits) return; - if (r->limit == 0 && random_min_urandom_seed) { - unsigned long now = jiffies; - - if (time_before(now, - r->last_pulled + random_min_urandom_seed * HZ)) - return; - r->last_pulled = now; - } - _xfer_secondary_pool(r, nbytes); } @@ -1236,8 +1209,6 @@ static void _xfer_secondary_pool(struct entropy_store *r, size_t nbytes) { __u32 tmp[OUTPUT_POOL_WORDS]; - /* For /dev/random's pool, always leave two wakeups' worth */ - int rsvd_bytes = r->limit ? 0 : random_read_wakeup_bits / 4; int bytes = nbytes; /* pull at least as much as a wakeup */ @@ -1248,7 +1219,7 @@ static void _xfer_secondary_pool(struct entropy_store *r, size_t nbytes) trace_xfer_secondary_pool(r->name, bytes * 8, nbytes * 8, ENTROPY_BITS(r), ENTROPY_BITS(r->pull)); bytes = extract_entropy(r->pull, tmp, bytes, - random_read_wakeup_bits / 8, rsvd_bytes); + random_read_wakeup_bits / 8, 0); mix_pool_bytes(r, tmp, bytes); credit_entropy_bits(r, bytes*8); } @@ -1276,7 +1247,7 @@ static void push_to_pool(struct work_struct *work) static size_t account(struct entropy_store *r, size_t nbytes, int min, int reserved) { - int entropy_count, orig; + int entropy_count, orig, have_bytes; size_t ibytes, nfrac; BUG_ON(r->entropy_count > r->poolinfo->poolfracbits); @@ -1285,14 +1256,12 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min, retry: entropy_count = orig = ACCESS_ONCE(r->entropy_count); ibytes = nbytes; - /* If limited, never pull more than available */ - if (r->limit) { - int have_bytes = entropy_count >> (ENTROPY_SHIFT + 3); + /* never pull more than available */ + have_bytes = entropy_count >> (ENTROPY_SHIFT + 3); - if ((have_bytes -= reserved) < 0) - have_bytes = 0; - ibytes = min_t(size_t, ibytes, have_bytes); - } + if ((have_bytes -= reserved) < 0) + have_bytes = 0; + ibytes = min_t(size_t, ibytes, have_bytes); if (ibytes < min) ibytes = 0; @@ -1912,6 +1881,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count, static int min_read_thresh = 8, min_write_thresh; static int max_read_thresh = OUTPUT_POOL_WORDS * 32; static int max_write_thresh = INPUT_POOL_WORDS * 32; +static int random_min_urandom_seed = 60; static char sysctl_bootid[16]; /* @@ -2042,63 +2012,64 @@ struct ctl_table random_table[] = { }; #endif /* CONFIG_SYSCTL */ -static u32 random_int_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; - -int random_int_secret_init(void) -{ - get_random_bytes(random_int_secret, sizeof(random_int_secret)); - return 0; -} - -static DEFINE_PER_CPU(__u32 [MD5_DIGEST_WORDS], get_random_int_hash) - __aligned(sizeof(unsigned long)); +struct batched_entropy { + union { + u64 entropy_u64[CHACHA20_BLOCK_SIZE / sizeof(u64)]; + u32 entropy_u32[CHACHA20_BLOCK_SIZE / sizeof(u32)]; + }; + unsigned int position; +}; /* - * Get a random word for internal kernel use only. Similar to urandom but - * with the goal of minimal entropy pool depletion. As a result, the random - * value is not cryptographically secure but for several uses the cost of - * depleting entropy is too high + * Get a random word for internal kernel use only. The quality of the random + * number is either as good as RDRAND or as good as /dev/urandom, with the + * goal of being quite fast and not depleting entropy. */ -unsigned int get_random_int(void) +static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64); +u64 get_random_u64(void) { - __u32 *hash; - unsigned int ret; + u64 ret; + struct batched_entropy *batch; - if (arch_get_random_int(&ret)) +#if BITS_PER_LONG == 64 + if (arch_get_random_long((unsigned long *)&ret)) return ret; +#else + if (arch_get_random_long((unsigned long *)&ret) && + arch_get_random_long((unsigned long *)&ret + 1)) + return ret; +#endif - hash = get_cpu_var(get_random_int_hash); - - hash[0] += current->pid + jiffies + random_get_entropy(); - md5_transform(hash, random_int_secret); - ret = hash[0]; - put_cpu_var(get_random_int_hash); - + batch = &get_cpu_var(batched_entropy_u64); + if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) { + extract_crng((u8 *)batch->entropy_u64); + batch->position = 0; + } + ret = batch->entropy_u64[batch->position++]; + put_cpu_var(batched_entropy_u64); return ret; } -EXPORT_SYMBOL(get_random_int); +EXPORT_SYMBOL(get_random_u64); -/* - * Same as get_random_int(), but returns unsigned long. - */ -unsigned long get_random_long(void) +static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32); +u32 get_random_u32(void) { - __u32 *hash; - unsigned long ret; + u32 ret; + struct batched_entropy *batch; - if (arch_get_random_long(&ret)) + if (arch_get_random_int(&ret)) return ret; - hash = get_cpu_var(get_random_int_hash); - - hash[0] += current->pid + jiffies + random_get_entropy(); - md5_transform(hash, random_int_secret); - ret = *(unsigned long *)hash; - put_cpu_var(get_random_int_hash); - + batch = &get_cpu_var(batched_entropy_u32); + if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) { + extract_crng((u8 *)batch->entropy_u32); + batch->position = 0; + } + ret = batch->entropy_u32[batch->position++]; + put_cpu_var(batched_entropy_u32); return ret; } -EXPORT_SYMBOL(get_random_long); +EXPORT_SYMBOL(get_random_u32); /** * randomize_page - Generate a random, page aligned address diff --git a/drivers/crypto/s5p-sss.c b/drivers/crypto/s5p-sss.c index dce1af0ce85c..1b9da3dc799b 100644 --- a/drivers/crypto/s5p-sss.c +++ b/drivers/crypto/s5p-sss.c @@ -270,7 +270,7 @@ static void s5p_sg_copy_buf(void *buf, struct scatterlist *sg, scatterwalk_done(&walk, out, 0); } -static void s5p_aes_complete(struct s5p_aes_dev *dev, int err) +static void s5p_sg_done(struct s5p_aes_dev *dev) { if (dev->sg_dst_cpy) { dev_dbg(dev->dev, @@ -281,8 +281,11 @@ static void s5p_aes_complete(struct s5p_aes_dev *dev, int err) } s5p_free_sg_cpy(dev, &dev->sg_src_cpy); s5p_free_sg_cpy(dev, &dev->sg_dst_cpy); +} - /* holding a lock outside */ +/* Calls the completion. Cannot be called with dev->lock hold. */ +static void s5p_aes_complete(struct s5p_aes_dev *dev, int err) +{ dev->req->base.complete(&dev->req->base, err); dev->busy = false; } @@ -368,51 +371,44 @@ exit: } /* - * Returns true if new transmitting (output) data is ready and its - * address+length have to be written to device (by calling - * s5p_set_dma_outdata()). False otherwise. + * Returns -ERRNO on error (mapping of new data failed). + * On success returns: + * - 0 if there is no more data, + * - 1 if new transmitting (output) data is ready and its address+length + * have to be written to device (by calling s5p_set_dma_outdata()). */ -static bool s5p_aes_tx(struct s5p_aes_dev *dev) +static int s5p_aes_tx(struct s5p_aes_dev *dev) { - int err = 0; - bool ret = false; + int ret = 0; s5p_unset_outdata(dev); if (!sg_is_last(dev->sg_dst)) { - err = s5p_set_outdata(dev, sg_next(dev->sg_dst)); - if (err) - s5p_aes_complete(dev, err); - else - ret = true; - } else { - s5p_aes_complete(dev, err); - - dev->busy = true; - tasklet_schedule(&dev->tasklet); + ret = s5p_set_outdata(dev, sg_next(dev->sg_dst)); + if (!ret) + ret = 1; } return ret; } /* - * Returns true if new receiving (input) data is ready and its - * address+length have to be written to device (by calling - * s5p_set_dma_indata()). False otherwise. + * Returns -ERRNO on error (mapping of new data failed). + * On success returns: + * - 0 if there is no more data, + * - 1 if new receiving (input) data is ready and its address+length + * have to be written to device (by calling s5p_set_dma_indata()). */ -static bool s5p_aes_rx(struct s5p_aes_dev *dev) +static int s5p_aes_rx(struct s5p_aes_dev *dev/*, bool *set_dma*/) { - int err; - bool ret = false; + int ret = 0; s5p_unset_indata(dev); if (!sg_is_last(dev->sg_src)) { - err = s5p_set_indata(dev, sg_next(dev->sg_src)); - if (err) - s5p_aes_complete(dev, err); - else - ret = true; + ret = s5p_set_indata(dev, sg_next(dev->sg_src)); + if (!ret) + ret = 1; } return ret; @@ -422,33 +418,73 @@ static irqreturn_t s5p_aes_interrupt(int irq, void *dev_id) { struct platform_device *pdev = dev_id; struct s5p_aes_dev *dev = platform_get_drvdata(pdev); - bool set_dma_tx = false; - bool set_dma_rx = false; + int err_dma_tx = 0; + int err_dma_rx = 0; + bool tx_end = false; unsigned long flags; uint32_t status; + int err; spin_lock_irqsave(&dev->lock, flags); + /* + * Handle rx or tx interrupt. If there is still data (scatterlist did not + * reach end), then map next scatterlist entry. + * In case of such mapping error, s5p_aes_complete() should be called. + * + * If there is no more data in tx scatter list, call s5p_aes_complete() + * and schedule new tasklet. + */ status = SSS_READ(dev, FCINTSTAT); if (status & SSS_FCINTSTAT_BRDMAINT) - set_dma_rx = s5p_aes_rx(dev); - if (status & SSS_FCINTSTAT_BTDMAINT) - set_dma_tx = s5p_aes_tx(dev); + err_dma_rx = s5p_aes_rx(dev); + + if (status & SSS_FCINTSTAT_BTDMAINT) { + if (sg_is_last(dev->sg_dst)) + tx_end = true; + err_dma_tx = s5p_aes_tx(dev); + } SSS_WRITE(dev, FCINTPEND, status); - /* - * Writing length of DMA block (either receiving or transmitting) - * will start the operation immediately, so this should be done - * at the end (even after clearing pending interrupts to not miss the - * interrupt). - */ - if (set_dma_tx) - s5p_set_dma_outdata(dev, dev->sg_dst); - if (set_dma_rx) - s5p_set_dma_indata(dev, dev->sg_src); + if (err_dma_rx < 0) { + err = err_dma_rx; + goto error; + } + if (err_dma_tx < 0) { + err = err_dma_tx; + goto error; + } + + if (tx_end) { + s5p_sg_done(dev); + + spin_unlock_irqrestore(&dev->lock, flags); + + s5p_aes_complete(dev, 0); + dev->busy = true; + tasklet_schedule(&dev->tasklet); + } else { + /* + * Writing length of DMA block (either receiving or + * transmitting) will start the operation immediately, so this + * should be done at the end (even after clearing pending + * interrupts to not miss the interrupt). + */ + if (err_dma_tx == 1) + s5p_set_dma_outdata(dev, dev->sg_dst); + if (err_dma_rx == 1) + s5p_set_dma_indata(dev, dev->sg_src); + spin_unlock_irqrestore(&dev->lock, flags); + } + + return IRQ_HANDLED; + +error: + s5p_sg_done(dev); spin_unlock_irqrestore(&dev->lock, flags); + s5p_aes_complete(dev, err); return IRQ_HANDLED; } @@ -597,8 +633,9 @@ outdata_error: s5p_unset_indata(dev); indata_error: - s5p_aes_complete(dev, err); + s5p_sg_done(dev); spin_unlock_irqrestore(&dev->lock, flags); + s5p_aes_complete(dev, err); } static void s5p_tasklet_cb(unsigned long data) @@ -805,8 +842,9 @@ static int s5p_aes_probe(struct platform_device *pdev) dev_warn(dev, "feed control interrupt is not available.\n"); goto err_irq; } - err = devm_request_irq(dev, pdata->irq_fc, s5p_aes_interrupt, - IRQF_SHARED, pdev->name, pdev); + err = devm_request_threaded_irq(dev, pdata->irq_fc, NULL, + s5p_aes_interrupt, IRQF_ONESHOT, + pdev->name, pdev); if (err < 0) { dev_warn(dev, "feed control interrupt is not available.\n"); goto err_irq; diff --git a/drivers/irqchip/irq-crossbar.c b/drivers/irqchip/irq-crossbar.c index 05bbf171df37..f96601268f71 100644 --- a/drivers/irqchip/irq-crossbar.c +++ b/drivers/irqchip/irq-crossbar.c @@ -198,8 +198,8 @@ static const struct irq_domain_ops crossbar_domain_ops = { static int __init crossbar_of_init(struct device_node *node) { + u32 max = 0, entry, reg_size; int i, size, reserved = 0; - u32 max = 0, entry; const __be32 *irqsr; int ret = -ENOMEM; @@ -276,9 +276,9 @@ static int __init crossbar_of_init(struct device_node *node) if (!cb->register_offsets) goto err_irq_map; - of_property_read_u32(node, "ti,reg-size", &size); + of_property_read_u32(node, "ti,reg-size", ®_size); - switch (size) { + switch (reg_size) { case 1: cb->write = crossbar_writeb; break; @@ -304,7 +304,7 @@ static int __init crossbar_of_init(struct device_node *node) continue; cb->register_offsets[i] = reserved; - reserved += size; + reserved += reg_size; } of_property_read_u32(node, "ti,irqs-safe-map", &cb->safe_map); diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 23201004fd7a..f77f840d2b5f 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1601,6 +1601,14 @@ static void __maybe_unused its_enable_quirk_cavium_23144(void *data) its->flags |= ITS_FLAGS_WORKAROUND_CAVIUM_23144; } +static void __maybe_unused its_enable_quirk_qdf2400_e0065(void *data) +{ + struct its_node *its = data; + + /* On QDF2400, the size of the ITE is 16Bytes */ + its->ite_size = 16; +} + static const struct gic_quirk its_quirks[] = { #ifdef CONFIG_CAVIUM_ERRATUM_22375 { @@ -1618,6 +1626,14 @@ static const struct gic_quirk its_quirks[] = { .init = its_enable_quirk_cavium_23144, }, #endif +#ifdef CONFIG_QCOM_QDF2400_ERRATUM_0065 + { + .desc = "ITS: QDF2400 erratum 0065", + .iidr = 0x00001070, /* QDF2400 ITS rev 1.x */ + .mask = 0xffffffff, + .init = its_enable_quirk_qdf2400_e0065, + }, +#endif { } }; diff --git a/drivers/isdn/gigaset/bas-gigaset.c b/drivers/isdn/gigaset/bas-gigaset.c index 11e13c56126f..2da3ff650e1d 100644 --- a/drivers/isdn/gigaset/bas-gigaset.c +++ b/drivers/isdn/gigaset/bas-gigaset.c @@ -2317,6 +2317,9 @@ static int gigaset_probe(struct usb_interface *interface, return -ENODEV; } + if (hostif->desc.bNumEndpoints < 1) + return -ENODEV; + dev_info(&udev->dev, "%s: Device matched (Vendor: 0x%x, Product: 0x%x)\n", __func__, le16_to_cpu(udev->descriptor.idVendor), diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index d71f6323ac00..b4f79b923aea 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -504,7 +504,7 @@ void __init lguest_arch_host_init(void) * byte, not the size, hence the "-1"). */ state->host_gdt_desc.size = GDT_SIZE-1; - state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); + state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i); /* * All CPUs on the Host use the same Interrupt Descriptor @@ -554,8 +554,8 @@ void __init lguest_arch_host_init(void) * The Host needs to be able to use the LGUEST segments on this * CPU, too, so put them in the Host GDT. */ - get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; - get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; + get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; + get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; } /* diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index 3f041b187033..f757cef293f8 100644 --- a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -392,6 +392,7 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, * To get all the fields, copy all archdata */ dev->ofdev.dev.archdata = chip->lbus.pdev->dev.archdata; + dev->ofdev.dev.dma_ops = chip->lbus.pdev->dev.dma_ops; #endif /* CONFIG_PCI */ #ifdef DEBUG diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f4ffd1eb8f44..dfb75979e455 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -989,26 +989,29 @@ static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) struct dm_offload *o = container_of(cb, struct dm_offload, cb); struct bio_list list; struct bio *bio; + int i; INIT_LIST_HEAD(&o->cb.list); if (unlikely(!current->bio_list)) return; - list = *current->bio_list; - bio_list_init(current->bio_list); - - while ((bio = bio_list_pop(&list))) { - struct bio_set *bs = bio->bi_pool; - if (unlikely(!bs) || bs == fs_bio_set) { - bio_list_add(current->bio_list, bio); - continue; + for (i = 0; i < 2; i++) { + list = current->bio_list[i]; + bio_list_init(¤t->bio_list[i]); + + while ((bio = bio_list_pop(&list))) { + struct bio_set *bs = bio->bi_pool; + if (unlikely(!bs) || bs == fs_bio_set) { + bio_list_add(¤t->bio_list[i], bio); + continue; + } + + spin_lock(&bs->rescue_lock); + bio_list_add(&bs->rescue_list, bio); + queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_unlock(&bs->rescue_lock); } - - spin_lock(&bs->rescue_lock); - bio_list_add(&bs->rescue_list, bio); - queue_work(bs->rescue_workqueue, &bs->rescue_work); - spin_unlock(&bs->rescue_lock); } } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 2b13117fb918..321ecac23027 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -777,7 +777,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) bm_lockres->flags |= DLM_LKF_NOQUEUE; ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (ret == -EAGAIN) { - memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE); s = read_resync_info(mddev, bm_lockres); if (s) { pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", @@ -974,6 +973,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); dlm_release_lockspace(cinfo->lockspace, 2); + kfree(cinfo); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 548d1b8014f8..f6ae1d67bcd0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -440,14 +440,6 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -void md_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct mddev *mddev = cb->data; - md_wakeup_thread(mddev->thread); - kfree(cb); -} -EXPORT_SYMBOL(md_unplug); - static inline struct mddev *mddev_get(struct mddev *mddev) { atomic_inc(&mddev->active); @@ -1887,7 +1879,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); - sb->super_offset = rdev->sb_start; + sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -2295,7 +2287,7 @@ static bool does_sb_need_changing(struct mddev *mddev) /* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || - (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->layout != le32_to_cpu(sb->layout)) || (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) return true; @@ -6458,11 +6450,10 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->layout = info->layout; mddev->chunk_sectors = info->chunk_size >> 9; - mddev->max_disks = MD_SB_DISKS; - if (mddev->persistent) { - mddev->flags = 0; - mddev->sb_flags = 0; + mddev->max_disks = MD_SB_DISKS; + mddev->flags = 0; + mddev->sb_flags = 0; } set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -6533,8 +6524,12 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) - revalidate_disk(mddev->gendisk); + if (!rv) { + if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } + } return rv; } diff --git a/drivers/md/md.h b/drivers/md/md.h index b8859cbf84b6..dde8ecb760c8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -676,16 +676,10 @@ extern void mddev_resume(struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); -extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); -static inline int mddev_check_plugged(struct mddev *mddev) -{ - return !!blk_check_plugged(md_unplug, mddev, - sizeof(struct blk_plug_cb)); -} static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fbc2d7851b49..a34f58772022 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1027,7 +1027,7 @@ static int get_unqueued_pending(struct r1conf *conf) static void freeze_array(struct r1conf *conf, int extra) { /* Stop sync I/O and normal I/O and wait for everything to - * go quite. + * go quiet. * This is called in two situations: * 1) management command handlers (reshape, remove disk, quiesce). * 2) one normal I/O request failed. @@ -1587,9 +1587,30 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio) split = bio; } - if (bio_data_dir(split) == READ) + if (bio_data_dir(split) == READ) { raid1_read_request(mddev, split); - else + + /* + * If a bio is splitted, the first part of bio will + * pass barrier but the bio is queued in + * current->bio_list (see generic_make_request). If + * there is a raise_barrier() called here, the second + * part of bio can't pass barrier. But since the first + * part bio isn't dispatched to underlaying disks yet, + * the barrier is never released, hence raise_barrier + * will alays wait. We have a deadlock. + * Note, this only happens in read path. For write + * path, the first part of bio is dispatched in a + * schedule() call (because of blk plug) or offloaded + * to raid10d. + * Quitting from the function immediately can change + * the bio order queued in bio_list and avoid the deadlock. + */ + if (split != bio) { + generic_make_request(bio); + break; + } + } else raid1_write_request(mddev, split); } while (split != bio); } @@ -3246,8 +3267,6 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, newsize); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 063c43d83b72..e89a8d78a9ed 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -974,7 +974,8 @@ static void wait_barrier(struct r10conf *conf) !conf->barrier || (atomic_read(&conf->nr_pending) && current->bio_list && - !bio_list_empty(current->bio_list)), + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1]))), conf->resync_lock); conf->nr_waiting--; if (!conf->nr_waiting) @@ -1477,11 +1478,24 @@ retry_write: mbio->bi_bdev = (void*)rdev; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } } @@ -1571,7 +1585,25 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) split = bio; } + /* + * If a bio is splitted, the first part of bio will pass + * barrier but the bio is queued in current->bio_list (see + * generic_make_request). If there is a raise_barrier() called + * here, the second part of bio can't pass barrier. But since + * the first part bio isn't dispatched to underlaying disks + * yet, the barrier is never released, hence raise_barrier will + * alays wait. We have a deadlock. + * Note, this only happens in read path. For write path, the + * first part of bio is dispatched in a schedule() call + * (because of blk plug) or offloaded to raid10d. + * Quitting from the function immediately can change the bio + * order queued in bio_list and avoid the deadlock. + */ __make_request(mddev, split); + if (split != bio && bio_data_dir(bio) == READ) { + generic_make_request(bio); + break; + } } while (split != bio); /* In case raid10d snuck in to freeze_array */ @@ -3943,10 +3975,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, size); - if (mddev->queue) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - } if (sectors > mddev->dev_sectors && mddev->recovery_cp > oldsize) { mddev->recovery_cp = oldsize; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4fb09b3fcb41..ed5cd705b985 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1401,7 +1401,8 @@ static int set_syndrome_sources(struct page **srcs, (test_bit(R5_Wantdrain, &dev->flags) || test_bit(R5_InJournal, &dev->flags))) || (srctype == SYNDROME_SRC_WRITTEN && - dev->written)) { + (dev->written || + test_bit(R5_InJournal, &dev->flags)))) { if (test_bit(R5_InJournal, &dev->flags)) srcs[slot] = sh->dev[i].orig_page; else @@ -7605,8 +7606,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, newsize); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 248f60d171a5..ffea9859f5a7 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -2272,10 +2272,7 @@ static int xgbe_one_poll(struct napi_struct *napi, int budget) processed = xgbe_rx_poll(channel, budget); /* If we processed everything, we are done */ - if (processed < budget) { - /* Turn off polling */ - napi_complete_done(napi, processed); - + if ((processed < budget) && napi_complete_done(napi, processed)) { /* Enable Tx and Rx interrupts */ if (pdata->channel_irq_mode) xgbe_enable_rx_tx_int(pdata, channel); @@ -2317,10 +2314,7 @@ static int xgbe_all_poll(struct napi_struct *napi, int budget) } while ((processed < budget) && (processed != last_processed)); /* If we processed everything, we are done */ - if (processed < budget) { - /* Turn off polling */ - napi_complete_done(napi, processed); - + if ((processed < budget) && napi_complete_done(napi, processed)) { /* Enable Tx and Rx interrupts */ xgbe_enable_rx_tx_ints(pdata); } diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index 581de71a958a..4c6c882c6a1c 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -213,9 +213,9 @@ void aq_pci_func_free_irqs(struct aq_pci_func_s *self) if (!((1U << i) & self->msix_entry_mask)) continue; - free_irq(pci_irq_vector(pdev, i), self->aq_vec[i]); if (pdev->msix_enabled) irq_set_affinity_hint(pci_irq_vector(pdev, i), NULL); + free_irq(pci_irq_vector(pdev, i), self->aq_vec[i]); self->msix_entry_mask &= ~(1U << i); } } diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index d8d06fdfc42b..ac76fc251d26 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -13292,17 +13292,15 @@ static int bnx2x_init_dev(struct bnx2x *bp, struct pci_dev *pdev, dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_HIGHDMA; - /* VF with OLD Hypervisor or old PF do not support filtering */ if (IS_PF(bp)) { if (chip_is_e1x) bp->accept_any_vlan = true; else dev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; -#ifdef CONFIG_BNX2X_SRIOV - } else if (bp->acquire_resp.pfdev_info.pf_cap & PFVF_CAP_VLAN_FILTER) { - dev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; -#endif } + /* For VF we'll know whether to enable VLAN filtering after + * getting a response to CHANNEL_TLV_ACQUIRE from PF. + */ dev->features |= dev->hw_features | NETIF_F_HW_VLAN_CTAG_RX; dev->features |= NETIF_F_HIGHDMA; @@ -13738,7 +13736,7 @@ static int bnx2x_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb) if (!netif_running(bp->dev)) { DP(BNX2X_MSG_PTP, "PTP adjfreq called while the interface is down\n"); - return -EFAULT; + return -ENETDOWN; } if (ppb < 0) { @@ -13797,6 +13795,12 @@ static int bnx2x_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) { struct bnx2x *bp = container_of(ptp, struct bnx2x, ptp_clock_info); + if (!netif_running(bp->dev)) { + DP(BNX2X_MSG_PTP, + "PTP adjtime called while the interface is down\n"); + return -ENETDOWN; + } + DP(BNX2X_MSG_PTP, "PTP adjtime called, delta = %llx\n", delta); timecounter_adjtime(&bp->timecounter, delta); @@ -13809,6 +13813,12 @@ static int bnx2x_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) struct bnx2x *bp = container_of(ptp, struct bnx2x, ptp_clock_info); u64 ns; + if (!netif_running(bp->dev)) { + DP(BNX2X_MSG_PTP, + "PTP gettime called while the interface is down\n"); + return -ENETDOWN; + } + ns = timecounter_read(&bp->timecounter); DP(BNX2X_MSG_PTP, "PTP gettime called, ns = %llu\n", ns); @@ -13824,6 +13834,12 @@ static int bnx2x_ptp_settime(struct ptp_clock_info *ptp, struct bnx2x *bp = container_of(ptp, struct bnx2x, ptp_clock_info); u64 ns; + if (!netif_running(bp->dev)) { + DP(BNX2X_MSG_PTP, + "PTP settime called while the interface is down\n"); + return -ENETDOWN; + } + ns = timespec64_to_ns(ts); DP(BNX2X_MSG_PTP, "PTP settime called, ns = %llu\n", ns); @@ -13991,6 +14007,14 @@ static int bnx2x_init_one(struct pci_dev *pdev, rc = bnx2x_vfpf_acquire(bp, tx_count, rx_count); if (rc) goto init_one_freemem; + +#ifdef CONFIG_BNX2X_SRIOV + /* VF with OLD Hypervisor or old PF do not support filtering */ + if (bp->acquire_resp.pfdev_info.pf_cap & PFVF_CAP_VLAN_FILTER) { + dev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; + dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + } +#endif } /* Enable SRIOV if capability found in configuration space */ diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 6fad22adbbb9..bdfd53b46bc5 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -434,7 +434,9 @@ static int bnx2x_vf_mac_vlan_config(struct bnx2x *bp, /* Add/Remove the filter */ rc = bnx2x_config_vlan_mac(bp, &ramrod); - if (rc && rc != -EEXIST) { + if (rc == -EEXIST) + return 0; + if (rc) { BNX2X_ERR("Failed to %s %s\n", filter->add ? "add" : "delete", (filter->type == BNX2X_VF_FILTER_VLAN_MAC) ? @@ -444,6 +446,8 @@ static int bnx2x_vf_mac_vlan_config(struct bnx2x *bp, return rc; } + filter->applied = true; + return 0; } @@ -469,8 +473,10 @@ int bnx2x_vf_mac_vlan_config_list(struct bnx2x *bp, struct bnx2x_virtf *vf, /* Rollback if needed */ if (i != filters->count) { BNX2X_ERR("Managed only %d/%d filters - rolling back\n", - i, filters->count + 1); + i, filters->count); while (--i >= 0) { + if (!filters->filters[i].applied) + continue; filters->filters[i].add = !filters->filters[i].add; bnx2x_vf_mac_vlan_config(bp, vf, qid, &filters->filters[i], @@ -1899,7 +1905,8 @@ void bnx2x_iov_adjust_stats_req(struct bnx2x *bp) continue; } - DP(BNX2X_MSG_IOV, "add addresses for vf %d\n", vf->abs_vfid); + DP_AND((BNX2X_MSG_IOV | BNX2X_MSG_STATS), + "add addresses for vf %d\n", vf->abs_vfid); for_each_vfq(vf, j) { struct bnx2x_vf_queue *rxq = vfq_get(vf, j); @@ -1920,11 +1927,12 @@ void bnx2x_iov_adjust_stats_req(struct bnx2x *bp) cpu_to_le32(U64_HI(q_stats_addr)); cur_query_entry->address.lo = cpu_to_le32(U64_LO(q_stats_addr)); - DP(BNX2X_MSG_IOV, - "added address %x %x for vf %d queue %d client %d\n", - cur_query_entry->address.hi, - cur_query_entry->address.lo, cur_query_entry->funcID, - j, cur_query_entry->index); + DP_AND((BNX2X_MSG_IOV | BNX2X_MSG_STATS), + "added address %x %x for vf %d queue %d client %d\n", + cur_query_entry->address.hi, + cur_query_entry->address.lo, + cur_query_entry->funcID, + j, cur_query_entry->index); cur_query_entry++; cur_data_offset += sizeof(struct per_queue_stats); stats_count++; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h index 7a6d406f4c11..888d0b6632e8 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h @@ -114,6 +114,7 @@ struct bnx2x_vf_mac_vlan_filter { (BNX2X_VF_FILTER_MAC | BNX2X_VF_FILTER_VLAN) /*shortcut*/ bool add; + bool applied; u8 *mac; u16 vid; }; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c index bfae300cf25f..76a4668c50fe 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c @@ -868,7 +868,7 @@ int bnx2x_vfpf_set_mcast(struct net_device *dev) struct bnx2x *bp = netdev_priv(dev); struct vfpf_set_q_filters_tlv *req = &bp->vf2pf_mbox->req.set_q_filters; struct pfvf_general_resp_tlv *resp = &bp->vf2pf_mbox->resp.general_resp; - int rc, i = 0; + int rc = 0, i = 0; struct netdev_hw_addr *ha; if (bp->state != BNX2X_STATE_OPEN) { @@ -883,6 +883,15 @@ int bnx2x_vfpf_set_mcast(struct net_device *dev) /* Get Rx mode requested */ DP(NETIF_MSG_IFUP, "dev->flags = %x\n", dev->flags); + /* We support PFVF_MAX_MULTICAST_PER_VF mcast addresses tops */ + if (netdev_mc_count(dev) > PFVF_MAX_MULTICAST_PER_VF) { + DP(NETIF_MSG_IFUP, + "VF supports not more than %d multicast MAC addresses\n", + PFVF_MAX_MULTICAST_PER_VF); + rc = -EINVAL; + goto out; + } + netdev_for_each_mc_addr(ha, dev) { DP(NETIF_MSG_IFUP, "Adding mcast MAC: %pM\n", bnx2x_mc_addr(ha)); @@ -890,16 +899,6 @@ int bnx2x_vfpf_set_mcast(struct net_device *dev) i++; } - /* We support four PFVF_MAX_MULTICAST_PER_VF mcast - * addresses tops - */ - if (i >= PFVF_MAX_MULTICAST_PER_VF) { - DP(NETIF_MSG_IFUP, - "VF supports not more than %d multicast MAC addresses\n", - PFVF_MAX_MULTICAST_PER_VF); - return -EINVAL; - } - req->n_multicast = i; req->flags |= VFPF_SET_Q_FILTERS_MULTICAST_CHANGED; req->vf_qid = 0; @@ -924,7 +923,7 @@ int bnx2x_vfpf_set_mcast(struct net_device *dev) out: bnx2x_vfpf_finalize(bp, &req->first_tlv); - return 0; + return rc; } /* request pf to add a vlan for the vf */ @@ -1778,6 +1777,23 @@ static int bnx2x_vf_mbx_qfilters(struct bnx2x *bp, struct bnx2x_virtf *vf) goto op_err; } + /* build vlan list */ + fl = NULL; + + rc = bnx2x_vf_mbx_macvlan_list(bp, vf, msg, &fl, + VFPF_VLAN_FILTER); + if (rc) + goto op_err; + + if (fl) { + /* set vlan list */ + rc = bnx2x_vf_mac_vlan_config_list(bp, vf, fl, + msg->vf_qid, + false); + if (rc) + goto op_err; + } + } if (msg->flags & VFPF_SET_Q_FILTERS_RX_MASK_CHANGED) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 235733e91c79..32de4589d16a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4465,6 +4465,10 @@ static int bnxt_hwrm_func_qcfg(struct bnxt *bp) vf->vlan = le16_to_cpu(resp->vlan) & VLAN_VID_MASK; } #endif + if (BNXT_PF(bp) && (le16_to_cpu(resp->flags) & + FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED)) + bp->flags |= BNXT_FLAG_FW_LLDP_AGENT; + switch (resp->port_partition_type) { case FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_0: case FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_5: @@ -5507,8 +5511,9 @@ static int bnxt_hwrm_phy_qcaps(struct bnxt *bp) bp->lpi_tmr_hi = le32_to_cpu(resp->valid_tx_lpi_timer_high) & PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_MASK; } - link_info->support_auto_speeds = - le16_to_cpu(resp->supported_speeds_auto_mode); + if (resp->supported_speeds_auto_mode) + link_info->support_auto_speeds = + le16_to_cpu(resp->supported_speeds_auto_mode); hwrm_phy_qcaps_exit: mutex_unlock(&bp->hwrm_cmd_lock); @@ -6495,8 +6500,14 @@ static void bnxt_reset_task(struct bnxt *bp, bool silent) if (!silent) bnxt_dbg_dump_states(bp); if (netif_running(bp->dev)) { + int rc; + + if (!silent) + bnxt_ulp_stop(bp); bnxt_close_nic(bp, false, false); - bnxt_open_nic(bp, false, false); + rc = bnxt_open_nic(bp, false, false); + if (!silent && !rc) + bnxt_ulp_start(bp); } } @@ -7444,6 +7455,10 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (rc) goto init_err_pci_clean; + rc = bnxt_hwrm_func_reset(bp); + if (rc) + goto init_err_pci_clean; + bnxt_hwrm_fw_set_time(bp); dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG | @@ -7554,10 +7569,6 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (rc) goto init_err_pci_clean; - rc = bnxt_hwrm_func_reset(bp); - if (rc) - goto init_err_pci_clean; - rc = bnxt_init_int_mode(bp); if (rc) goto init_err_pci_clean; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index faf26a2f726b..c7a5b84a5cb2 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -993,6 +993,7 @@ struct bnxt { BNXT_FLAG_ROCEV2_CAP) #define BNXT_FLAG_NO_AGG_RINGS 0x20000 #define BNXT_FLAG_RX_PAGE_MODE 0x40000 + #define BNXT_FLAG_FW_LLDP_AGENT 0x80000 #define BNXT_FLAG_CHIP_NITRO_A0 0x1000000 #define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA | \ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c index fdf2d8caf7bf..03532061d211 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c @@ -474,7 +474,7 @@ void bnxt_dcb_init(struct bnxt *bp) return; bp->dcbx_cap = DCB_CAP_DCBX_VER_IEEE; - if (BNXT_PF(bp)) + if (BNXT_PF(bp) && !(bp->flags & BNXT_FLAG_FW_LLDP_AGENT)) bp->dcbx_cap |= DCB_CAP_DCBX_HOST; else bp->dcbx_cap |= DCB_CAP_DCBX_LLD_MANAGED; diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index f92896835d2a..69015fa50f20 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -1,7 +1,7 @@ /* * Broadcom GENET (Gigabit Ethernet) controller driver * - * Copyright (c) 2014 Broadcom Corporation + * Copyright (c) 2014-2017 Broadcom * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -450,6 +450,22 @@ static inline void bcmgenet_rdma_ring_writel(struct bcmgenet_priv *priv, genet_dma_ring_regs[r]); } +static int bcmgenet_begin(struct net_device *dev) +{ + struct bcmgenet_priv *priv = netdev_priv(dev); + + /* Turn on the clock */ + return clk_prepare_enable(priv->clk); +} + +static void bcmgenet_complete(struct net_device *dev) +{ + struct bcmgenet_priv *priv = netdev_priv(dev); + + /* Turn off the clock */ + clk_disable_unprepare(priv->clk); +} + static int bcmgenet_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { @@ -778,8 +794,9 @@ static const struct bcmgenet_stats bcmgenet_gstrings_stats[] = { STAT_GENET_RUNT("rx_runt_bytes", mib.rx_runt_bytes), /* Misc UniMAC counters */ STAT_GENET_MISC("rbuf_ovflow_cnt", mib.rbuf_ovflow_cnt, - UMAC_RBUF_OVFL_CNT), - STAT_GENET_MISC("rbuf_err_cnt", mib.rbuf_err_cnt, UMAC_RBUF_ERR_CNT), + UMAC_RBUF_OVFL_CNT_V1), + STAT_GENET_MISC("rbuf_err_cnt", mib.rbuf_err_cnt, + UMAC_RBUF_ERR_CNT_V1), STAT_GENET_MISC("mdf_err_cnt", mib.mdf_err_cnt, UMAC_MDF_ERR_CNT), STAT_GENET_SOFT_MIB("alloc_rx_buff_failed", mib.alloc_rx_buff_failed), STAT_GENET_SOFT_MIB("rx_dma_failed", mib.rx_dma_failed), @@ -821,6 +838,45 @@ static void bcmgenet_get_strings(struct net_device *dev, u32 stringset, } } +static u32 bcmgenet_update_stat_misc(struct bcmgenet_priv *priv, u16 offset) +{ + u16 new_offset; + u32 val; + + switch (offset) { + case UMAC_RBUF_OVFL_CNT_V1: + if (GENET_IS_V2(priv)) + new_offset = RBUF_OVFL_CNT_V2; + else + new_offset = RBUF_OVFL_CNT_V3PLUS; + + val = bcmgenet_rbuf_readl(priv, new_offset); + /* clear if overflowed */ + if (val == ~0) + bcmgenet_rbuf_writel(priv, 0, new_offset); + break; + case UMAC_RBUF_ERR_CNT_V1: + if (GENET_IS_V2(priv)) + new_offset = RBUF_ERR_CNT_V2; + else + new_offset = RBUF_ERR_CNT_V3PLUS; + + val = bcmgenet_rbuf_readl(priv, new_offset); + /* clear if overflowed */ + if (val == ~0) + bcmgenet_rbuf_writel(priv, 0, new_offset); + break; + default: + val = bcmgenet_umac_readl(priv, offset); + /* clear if overflowed */ + if (val == ~0) + bcmgenet_umac_writel(priv, 0, offset); + break; + } + + return val; +} + static void bcmgenet_update_mib_counters(struct bcmgenet_priv *priv) { int i, j = 0; @@ -836,19 +892,28 @@ static void bcmgenet_update_mib_counters(struct bcmgenet_priv *priv) case BCMGENET_STAT_NETDEV: case BCMGENET_STAT_SOFT: continue; - case BCMGENET_STAT_MIB_RX: - case BCMGENET_STAT_MIB_TX: case BCMGENET_STAT_RUNT: - if (s->type != BCMGENET_STAT_MIB_RX) - offset = BCMGENET_STAT_OFFSET; + offset += BCMGENET_STAT_OFFSET; + /* fall through */ + case BCMGENET_STAT_MIB_TX: + offset += BCMGENET_STAT_OFFSET; + /* fall through */ + case BCMGENET_STAT_MIB_RX: val = bcmgenet_umac_readl(priv, UMAC_MIB_START + j + offset); + offset = 0; /* Reset Offset */ break; case BCMGENET_STAT_MISC: - val = bcmgenet_umac_readl(priv, s->reg_offset); - /* clear if overflowed */ - if (val == ~0) - bcmgenet_umac_writel(priv, 0, s->reg_offset); + if (GENET_IS_V1(priv)) { + val = bcmgenet_umac_readl(priv, s->reg_offset); + /* clear if overflowed */ + if (val == ~0) + bcmgenet_umac_writel(priv, 0, + s->reg_offset); + } else { + val = bcmgenet_update_stat_misc(priv, + s->reg_offset); + } break; } @@ -973,6 +1038,8 @@ static int bcmgenet_set_eee(struct net_device *dev, struct ethtool_eee *e) /* standard ethtool support functions. */ static const struct ethtool_ops bcmgenet_ethtool_ops = { + .begin = bcmgenet_begin, + .complete = bcmgenet_complete, .get_strings = bcmgenet_get_strings, .get_sset_count = bcmgenet_get_sset_count, .get_ethtool_stats = bcmgenet_get_ethtool_stats, @@ -1167,7 +1234,6 @@ static unsigned int __bcmgenet_tx_reclaim(struct net_device *dev, struct bcmgenet_priv *priv = netdev_priv(dev); struct device *kdev = &priv->pdev->dev; struct enet_cb *tx_cb_ptr; - struct netdev_queue *txq; unsigned int pkts_compl = 0; unsigned int bytes_compl = 0; unsigned int c_index; @@ -1219,13 +1285,8 @@ static unsigned int __bcmgenet_tx_reclaim(struct net_device *dev, dev->stats.tx_packets += pkts_compl; dev->stats.tx_bytes += bytes_compl; - txq = netdev_get_tx_queue(dev, ring->queue); - netdev_tx_completed_queue(txq, pkts_compl, bytes_compl); - - if (ring->free_bds > (MAX_SKB_FRAGS + 1)) { - if (netif_tx_queue_stopped(txq)) - netif_tx_wake_queue(txq); - } + netdev_tx_completed_queue(netdev_get_tx_queue(dev, ring->queue), + pkts_compl, bytes_compl); return pkts_compl; } @@ -1248,8 +1309,16 @@ static int bcmgenet_tx_poll(struct napi_struct *napi, int budget) struct bcmgenet_tx_ring *ring = container_of(napi, struct bcmgenet_tx_ring, napi); unsigned int work_done = 0; + struct netdev_queue *txq; + unsigned long flags; - work_done = bcmgenet_tx_reclaim(ring->priv->dev, ring); + spin_lock_irqsave(&ring->lock, flags); + work_done = __bcmgenet_tx_reclaim(ring->priv->dev, ring); + if (ring->free_bds > (MAX_SKB_FRAGS + 1)) { + txq = netdev_get_tx_queue(ring->priv->dev, ring->queue); + netif_tx_wake_queue(txq); + } + spin_unlock_irqrestore(&ring->lock, flags); if (work_done == 0) { napi_complete(napi); @@ -2457,24 +2526,28 @@ static int bcmgenet_init_dma(struct bcmgenet_priv *priv) /* Interrupt bottom half */ static void bcmgenet_irq_task(struct work_struct *work) { + unsigned long flags; + unsigned int status; struct bcmgenet_priv *priv = container_of( work, struct bcmgenet_priv, bcmgenet_irq_work); netif_dbg(priv, intr, priv->dev, "%s\n", __func__); - if (priv->irq0_stat & UMAC_IRQ_MPD_R) { - priv->irq0_stat &= ~UMAC_IRQ_MPD_R; + spin_lock_irqsave(&priv->lock, flags); + status = priv->irq0_stat; + priv->irq0_stat = 0; + spin_unlock_irqrestore(&priv->lock, flags); + + if (status & UMAC_IRQ_MPD_R) { netif_dbg(priv, wol, priv->dev, "magic packet detected, waking up\n"); bcmgenet_power_up(priv, GENET_POWER_WOL_MAGIC); } /* Link UP/DOWN event */ - if (priv->irq0_stat & UMAC_IRQ_LINK_EVENT) { + if (status & UMAC_IRQ_LINK_EVENT) phy_mac_interrupt(priv->phydev, - !!(priv->irq0_stat & UMAC_IRQ_LINK_UP)); - priv->irq0_stat &= ~UMAC_IRQ_LINK_EVENT; - } + !!(status & UMAC_IRQ_LINK_UP)); } /* bcmgenet_isr1: handle Rx and Tx priority queues */ @@ -2483,22 +2556,21 @@ static irqreturn_t bcmgenet_isr1(int irq, void *dev_id) struct bcmgenet_priv *priv = dev_id; struct bcmgenet_rx_ring *rx_ring; struct bcmgenet_tx_ring *tx_ring; - unsigned int index; + unsigned int index, status; - /* Save irq status for bottom-half processing. */ - priv->irq1_stat = - bcmgenet_intrl2_1_readl(priv, INTRL2_CPU_STAT) & + /* Read irq status */ + status = bcmgenet_intrl2_1_readl(priv, INTRL2_CPU_STAT) & ~bcmgenet_intrl2_1_readl(priv, INTRL2_CPU_MASK_STATUS); /* clear interrupts */ - bcmgenet_intrl2_1_writel(priv, priv->irq1_stat, INTRL2_CPU_CLEAR); + bcmgenet_intrl2_1_writel(priv, status, INTRL2_CPU_CLEAR); netif_dbg(priv, intr, priv->dev, - "%s: IRQ=0x%x\n", __func__, priv->irq1_stat); + "%s: IRQ=0x%x\n", __func__, status); /* Check Rx priority queue interrupts */ for (index = 0; index < priv->hw_params->rx_queues; index++) { - if (!(priv->irq1_stat & BIT(UMAC_IRQ1_RX_INTR_SHIFT + index))) + if (!(status & BIT(UMAC_IRQ1_RX_INTR_SHIFT + index))) continue; rx_ring = &priv->rx_rings[index]; @@ -2511,7 +2583,7 @@ static irqreturn_t bcmgenet_isr1(int irq, void *dev_id) /* Check Tx priority queue interrupts */ for (index = 0; index < priv->hw_params->tx_queues; index++) { - if (!(priv->irq1_stat & BIT(index))) + if (!(status & BIT(index))) continue; tx_ring = &priv->tx_rings[index]; @@ -2531,19 +2603,20 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id) struct bcmgenet_priv *priv = dev_id; struct bcmgenet_rx_ring *rx_ring; struct bcmgenet_tx_ring *tx_ring; + unsigned int status; + unsigned long flags; - /* Save irq status for bottom-half processing. */ - priv->irq0_stat = - bcmgenet_intrl2_0_readl(priv, INTRL2_CPU_STAT) & + /* Read irq status */ + status = bcmgenet_intrl2_0_readl(priv, INTRL2_CPU_STAT) & ~bcmgenet_intrl2_0_readl(priv, INTRL2_CPU_MASK_STATUS); /* clear interrupts */ - bcmgenet_intrl2_0_writel(priv, priv->irq0_stat, INTRL2_CPU_CLEAR); + bcmgenet_intrl2_0_writel(priv, status, INTRL2_CPU_CLEAR); netif_dbg(priv, intr, priv->dev, - "IRQ=0x%x\n", priv->irq0_stat); + "IRQ=0x%x\n", status); - if (priv->irq0_stat & UMAC_IRQ_RXDMA_DONE) { + if (status & UMAC_IRQ_RXDMA_DONE) { rx_ring = &priv->rx_rings[DESC_INDEX]; if (likely(napi_schedule_prep(&rx_ring->napi))) { @@ -2552,7 +2625,7 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id) } } - if (priv->irq0_stat & UMAC_IRQ_TXDMA_DONE) { + if (status & UMAC_IRQ_TXDMA_DONE) { tx_ring = &priv->tx_rings[DESC_INDEX]; if (likely(napi_schedule_prep(&tx_ring->napi))) { @@ -2561,22 +2634,23 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id) } } - if (priv->irq0_stat & (UMAC_IRQ_PHY_DET_R | - UMAC_IRQ_PHY_DET_F | - UMAC_IRQ_LINK_EVENT | - UMAC_IRQ_HFB_SM | - UMAC_IRQ_HFB_MM | - UMAC_IRQ_MPD_R)) { - /* all other interested interrupts handled in bottom half */ - schedule_work(&priv->bcmgenet_irq_work); - } - if ((priv->hw_params->flags & GENET_HAS_MDIO_INTR) && - priv->irq0_stat & (UMAC_IRQ_MDIO_DONE | UMAC_IRQ_MDIO_ERROR)) { - priv->irq0_stat &= ~(UMAC_IRQ_MDIO_DONE | UMAC_IRQ_MDIO_ERROR); + status & (UMAC_IRQ_MDIO_DONE | UMAC_IRQ_MDIO_ERROR)) { wake_up(&priv->wq); } + /* all other interested interrupts handled in bottom half */ + status &= (UMAC_IRQ_LINK_EVENT | + UMAC_IRQ_MPD_R); + if (status) { + /* Save irq status for bottom-half processing. */ + spin_lock_irqsave(&priv->lock, flags); + priv->irq0_stat |= status; + spin_unlock_irqrestore(&priv->lock, flags); + + schedule_work(&priv->bcmgenet_irq_work); + } + return IRQ_HANDLED; } @@ -2801,6 +2875,8 @@ err_irq0: err_fini_dma: bcmgenet_fini_dma(priv); err_clk_disable: + if (priv->internal_phy) + bcmgenet_power_down(priv, GENET_POWER_PASSIVE); clk_disable_unprepare(priv->clk); return ret; } @@ -3177,6 +3253,12 @@ static void bcmgenet_set_hw_params(struct bcmgenet_priv *priv) */ gphy_rev = reg & 0xffff; + /* This is reserved so should require special treatment */ + if (gphy_rev == 0 || gphy_rev == 0x01ff) { + pr_warn("Invalid GPHY revision detected: 0x%04x\n", gphy_rev); + return; + } + /* This is the good old scheme, just GPHY major, no minor nor patch */ if ((gphy_rev & 0xf0) != 0) priv->gphy_rev = gphy_rev << 8; @@ -3185,12 +3267,6 @@ static void bcmgenet_set_hw_params(struct bcmgenet_priv *priv) else if ((gphy_rev & 0xff00) != 0) priv->gphy_rev = gphy_rev; - /* This is reserved so should require special treatment */ - else if (gphy_rev == 0 || gphy_rev == 0x01ff) { - pr_warn("Invalid GPHY revision detected: 0x%04x\n", gphy_rev); - return; - } - #ifdef CONFIG_PHYS_ADDR_T_64BIT if (!(params->flags & GENET_HAS_40BITS)) pr_warn("GENET does not support 40-bits PA\n"); @@ -3233,6 +3309,7 @@ static int bcmgenet_probe(struct platform_device *pdev) const void *macaddr; struct resource *r; int err = -EIO; + const char *phy_mode_str; /* Up to GENET_MAX_MQ_CNT + 1 TX queues and RX queues */ dev = alloc_etherdev_mqs(sizeof(*priv), GENET_MAX_MQ_CNT + 1, @@ -3276,6 +3353,8 @@ static int bcmgenet_probe(struct platform_device *pdev) goto err; } + spin_lock_init(&priv->lock); + SET_NETDEV_DEV(dev, &pdev->dev); dev_set_drvdata(&pdev->dev, dev); ether_addr_copy(dev->dev_addr, macaddr); @@ -3338,6 +3417,13 @@ static int bcmgenet_probe(struct platform_device *pdev) priv->clk_eee = NULL; } + /* If this is an internal GPHY, power it on now, before UniMAC is + * brought out of reset as absolutely no UniMAC activity is allowed + */ + if (dn && !of_property_read_string(dn, "phy-mode", &phy_mode_str) && + !strcasecmp(phy_mode_str, "internal")) + bcmgenet_power_up(priv, GENET_POWER_PASSIVE); + err = reset_umac(priv); if (err) goto err_clk_disable; @@ -3502,6 +3588,8 @@ static int bcmgenet_resume(struct device *d) return 0; out_clk_disable: + if (priv->internal_phy) + bcmgenet_power_down(priv, GENET_POWER_PASSIVE); clk_disable_unprepare(priv->clk); return ret; } diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h index 1e2dc34d331a..db7f289d65ae 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Broadcom Corporation + * Copyright (c) 2014-2017 Broadcom * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -214,7 +214,9 @@ struct bcmgenet_mib_counters { #define MDIO_REG_SHIFT 16 #define MDIO_REG_MASK 0x1F -#define UMAC_RBUF_OVFL_CNT 0x61C +#define UMAC_RBUF_OVFL_CNT_V1 0x61C +#define RBUF_OVFL_CNT_V2 0x80 +#define RBUF_OVFL_CNT_V3PLUS 0x94 #define UMAC_MPD_CTRL 0x620 #define MPD_EN (1 << 0) @@ -224,7 +226,9 @@ struct bcmgenet_mib_counters { #define UMAC_MPD_PW_MS 0x624 #define UMAC_MPD_PW_LS 0x628 -#define UMAC_RBUF_ERR_CNT 0x634 +#define UMAC_RBUF_ERR_CNT_V1 0x634 +#define RBUF_ERR_CNT_V2 0x84 +#define RBUF_ERR_CNT_V3PLUS 0x98 #define UMAC_MDF_ERR_CNT 0x638 #define UMAC_MDF_CTRL 0x650 #define UMAC_MDF_ADDR 0x654 @@ -619,11 +623,13 @@ struct bcmgenet_priv { struct work_struct bcmgenet_irq_work; int irq0; int irq1; - unsigned int irq0_stat; - unsigned int irq1_stat; int wol_irq; bool wol_irq_disabled; + /* shared status */ + spinlock_t lock; + unsigned int irq0_stat; + /* HW descriptors/checksum variables */ bool desc_64b_en; bool desc_rxchk_en; diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index be9c0e3f5ade..92f46b1375c3 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -152,7 +152,7 @@ struct octnic_gather { */ struct octeon_sg_entry *sg; - u64 sg_dma_ptr; + dma_addr_t sg_dma_ptr; }; struct handshake { @@ -734,6 +734,9 @@ static void delete_glists(struct lio *lio) struct octnic_gather *g; int i; + kfree(lio->glist_lock); + lio->glist_lock = NULL; + if (!lio->glist) return; @@ -741,23 +744,26 @@ static void delete_glists(struct lio *lio) do { g = (struct octnic_gather *) list_delete_head(&lio->glist[i]); - if (g) { - if (g->sg) { - dma_unmap_single(&lio->oct_dev-> - pci_dev->dev, - g->sg_dma_ptr, - g->sg_size, - DMA_TO_DEVICE); - kfree((void *)((unsigned long)g->sg - - g->adjust)); - } + if (g) kfree(g); - } } while (g); + + if (lio->glists_virt_base && lio->glists_virt_base[i]) { + lio_dma_free(lio->oct_dev, + lio->glist_entry_size * lio->tx_qsize, + lio->glists_virt_base[i], + lio->glists_dma_base[i]); + } } - kfree((void *)lio->glist); - kfree((void *)lio->glist_lock); + kfree(lio->glists_virt_base); + lio->glists_virt_base = NULL; + + kfree(lio->glists_dma_base); + lio->glists_dma_base = NULL; + + kfree(lio->glist); + lio->glist = NULL; } /** @@ -772,13 +778,30 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs) lio->glist_lock = kcalloc(num_iqs, sizeof(*lio->glist_lock), GFP_KERNEL); if (!lio->glist_lock) - return 1; + return -ENOMEM; lio->glist = kcalloc(num_iqs, sizeof(*lio->glist), GFP_KERNEL); if (!lio->glist) { - kfree((void *)lio->glist_lock); - return 1; + kfree(lio->glist_lock); + lio->glist_lock = NULL; + return -ENOMEM; + } + + lio->glist_entry_size = + ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE); + + /* allocate memory to store virtual and dma base address of + * per glist consistent memory + */ + lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base), + GFP_KERNEL); + lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base), + GFP_KERNEL); + + if (!lio->glists_virt_base || !lio->glists_dma_base) { + delete_glists(lio); + return -ENOMEM; } for (i = 0; i < num_iqs; i++) { @@ -788,6 +811,16 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs) INIT_LIST_HEAD(&lio->glist[i]); + lio->glists_virt_base[i] = + lio_dma_alloc(oct, + lio->glist_entry_size * lio->tx_qsize, + &lio->glists_dma_base[i]); + + if (!lio->glists_virt_base[i]) { + delete_glists(lio); + return -ENOMEM; + } + for (j = 0; j < lio->tx_qsize; j++) { g = kzalloc_node(sizeof(*g), GFP_KERNEL, numa_node); @@ -796,43 +829,18 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs) if (!g) break; - g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * - OCT_SG_ENTRY_SIZE); + g->sg = lio->glists_virt_base[i] + + (j * lio->glist_entry_size); - g->sg = kmalloc_node(g->sg_size + 8, - GFP_KERNEL, numa_node); - if (!g->sg) - g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL); - if (!g->sg) { - kfree(g); - break; - } - - /* The gather component should be aligned on 64-bit - * boundary - */ - if (((unsigned long)g->sg) & 7) { - g->adjust = 8 - (((unsigned long)g->sg) & 7); - g->sg = (struct octeon_sg_entry *) - ((unsigned long)g->sg + g->adjust); - } - g->sg_dma_ptr = dma_map_single(&oct->pci_dev->dev, - g->sg, g->sg_size, - DMA_TO_DEVICE); - if (dma_mapping_error(&oct->pci_dev->dev, - g->sg_dma_ptr)) { - kfree((void *)((unsigned long)g->sg - - g->adjust)); - kfree(g); - break; - } + g->sg_dma_ptr = lio->glists_dma_base[i] + + (j * lio->glist_entry_size); list_add_tail(&g->list, &lio->glist[i]); } if (j != lio->tx_qsize) { delete_glists(lio); - return 1; + return -ENOMEM; } } @@ -1885,9 +1893,6 @@ static void free_netsgbuf(void *buf) i++; } - dma_sync_single_for_cpu(&lio->oct_dev->pci_dev->dev, - g->sg_dma_ptr, g->sg_size, DMA_TO_DEVICE); - iq = skb_iq(lio, skb); spin_lock(&lio->glist_lock[iq]); list_add_tail(&g->list, &lio->glist[iq]); @@ -1933,9 +1938,6 @@ static void free_netsgbuf_with_resp(void *buf) i++; } - dma_sync_single_for_cpu(&lio->oct_dev->pci_dev->dev, - g->sg_dma_ptr, g->sg_size, DMA_TO_DEVICE); - iq = skb_iq(lio, skb); spin_lock(&lio->glist_lock[iq]); @@ -3273,8 +3275,6 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) i++; } - dma_sync_single_for_device(&oct->pci_dev->dev, g->sg_dma_ptr, - g->sg_size, DMA_TO_DEVICE); dptr = g->sg_dma_ptr; if (OCTEON_CN23XX_PF(oct)) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c index 9d5e03502c76..7b83be4ce1fe 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c @@ -108,6 +108,8 @@ struct octnic_gather { * received from the IP layer. */ struct octeon_sg_entry *sg; + + dma_addr_t sg_dma_ptr; }; struct octeon_device_priv { @@ -490,6 +492,9 @@ static void delete_glists(struct lio *lio) struct octnic_gather *g; int i; + kfree(lio->glist_lock); + lio->glist_lock = NULL; + if (!lio->glist) return; @@ -497,17 +502,26 @@ static void delete_glists(struct lio *lio) do { g = (struct octnic_gather *) list_delete_head(&lio->glist[i]); - if (g) { - if (g->sg) - kfree((void *)((unsigned long)g->sg - - g->adjust)); + if (g) kfree(g); - } } while (g); + + if (lio->glists_virt_base && lio->glists_virt_base[i]) { + lio_dma_free(lio->oct_dev, + lio->glist_entry_size * lio->tx_qsize, + lio->glists_virt_base[i], + lio->glists_dma_base[i]); + } } + kfree(lio->glists_virt_base); + lio->glists_virt_base = NULL; + + kfree(lio->glists_dma_base); + lio->glists_dma_base = NULL; + kfree(lio->glist); - kfree(lio->glist_lock); + lio->glist = NULL; } /** @@ -522,13 +536,30 @@ static int setup_glists(struct lio *lio, int num_iqs) lio->glist_lock = kzalloc(sizeof(*lio->glist_lock) * num_iqs, GFP_KERNEL); if (!lio->glist_lock) - return 1; + return -ENOMEM; lio->glist = kzalloc(sizeof(*lio->glist) * num_iqs, GFP_KERNEL); if (!lio->glist) { kfree(lio->glist_lock); - return 1; + lio->glist_lock = NULL; + return -ENOMEM; + } + + lio->glist_entry_size = + ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE); + + /* allocate memory to store virtual and dma base address of + * per glist consistent memory + */ + lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base), + GFP_KERNEL); + lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base), + GFP_KERNEL); + + if (!lio->glists_virt_base || !lio->glists_dma_base) { + delete_glists(lio); + return -ENOMEM; } for (i = 0; i < num_iqs; i++) { @@ -536,34 +567,33 @@ static int setup_glists(struct lio *lio, int num_iqs) INIT_LIST_HEAD(&lio->glist[i]); + lio->glists_virt_base[i] = + lio_dma_alloc(lio->oct_dev, + lio->glist_entry_size * lio->tx_qsize, + &lio->glists_dma_base[i]); + + if (!lio->glists_virt_base[i]) { + delete_glists(lio); + return -ENOMEM; + } + for (j = 0; j < lio->tx_qsize; j++) { g = kzalloc(sizeof(*g), GFP_KERNEL); if (!g) break; - g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * - OCT_SG_ENTRY_SIZE); + g->sg = lio->glists_virt_base[i] + + (j * lio->glist_entry_size); - g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL); - if (!g->sg) { - kfree(g); - break; - } + g->sg_dma_ptr = lio->glists_dma_base[i] + + (j * lio->glist_entry_size); - /* The gather component should be aligned on 64-bit - * boundary - */ - if (((unsigned long)g->sg) & 7) { - g->adjust = 8 - (((unsigned long)g->sg) & 7); - g->sg = (struct octeon_sg_entry *) - ((unsigned long)g->sg + g->adjust); - } list_add_tail(&g->list, &lio->glist[i]); } if (j != lio->tx_qsize) { delete_glists(lio); - return 1; + return -ENOMEM; } } @@ -1324,10 +1354,6 @@ static void free_netsgbuf(void *buf) i++; } - dma_unmap_single(&lio->oct_dev->pci_dev->dev, - finfo->dptr, g->sg_size, - DMA_TO_DEVICE); - iq = skb_iq(lio, skb); spin_lock(&lio->glist_lock[iq]); @@ -1374,10 +1400,6 @@ static void free_netsgbuf_with_resp(void *buf) i++; } - dma_unmap_single(&lio->oct_dev->pci_dev->dev, - finfo->dptr, g->sg_size, - DMA_TO_DEVICE); - iq = skb_iq(lio, skb); spin_lock(&lio->glist_lock[iq]); @@ -2382,23 +2404,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) i++; } - dptr = dma_map_single(&oct->pci_dev->dev, - g->sg, g->sg_size, - DMA_TO_DEVICE); - if (dma_mapping_error(&oct->pci_dev->dev, dptr)) { - dev_err(&oct->pci_dev->dev, "%s DMA mapping error 4\n", - __func__); - dma_unmap_single(&oct->pci_dev->dev, g->sg[0].ptr[0], - skb->len - skb->data_len, - DMA_TO_DEVICE); - for (j = 1; j <= frags; j++) { - frag = &skb_shinfo(skb)->frags[j - 1]; - dma_unmap_page(&oct->pci_dev->dev, - g->sg[j >> 2].ptr[j & 3], - frag->size, DMA_TO_DEVICE); - } - return NETDEV_TX_BUSY; - } + dptr = g->sg_dma_ptr; ndata.cmd.cmd3.dptr = dptr; finfo->dptr = dptr; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_config.h b/drivers/net/ethernet/cavium/liquidio/octeon_config.h index b3dc2e9651a8..d29ebc531151 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_config.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_config.h @@ -71,17 +71,17 @@ #define CN23XX_MAX_RINGS_PER_VF 8 #define CN23XX_MAX_INPUT_QUEUES CN23XX_MAX_RINGS_PER_PF -#define CN23XX_MAX_IQ_DESCRIPTORS 2048 +#define CN23XX_MAX_IQ_DESCRIPTORS 512 #define CN23XX_DB_MIN 1 #define CN23XX_DB_MAX 8 #define CN23XX_DB_TIMEOUT 1 #define CN23XX_MAX_OUTPUT_QUEUES CN23XX_MAX_RINGS_PER_PF -#define CN23XX_MAX_OQ_DESCRIPTORS 2048 +#define CN23XX_MAX_OQ_DESCRIPTORS 512 #define CN23XX_OQ_BUF_SIZE 1536 #define CN23XX_OQ_PKTSPER_INTR 128 /*#define CAVIUM_ONLY_CN23XX_RX_PERF*/ -#define CN23XX_OQ_REFIL_THRESHOLD 128 +#define CN23XX_OQ_REFIL_THRESHOLD 16 #define CN23XX_OQ_INTR_PKT 64 #define CN23XX_OQ_INTR_TIME 100 diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c index 0be87d119a97..79f809479af6 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c @@ -155,11 +155,6 @@ octeon_droq_destroy_ring_buffers(struct octeon_device *oct, recv_buffer_destroy(droq->recv_buf_list[i].buffer, pg_info); - if (droq->desc_ring && droq->desc_ring[i].info_ptr) - lio_unmap_ring_info(oct->pci_dev, - (u64)droq-> - desc_ring[i].info_ptr, - OCT_DROQ_INFO_SIZE); droq->recv_buf_list[i].buffer = NULL; } @@ -211,10 +206,7 @@ int octeon_delete_droq(struct octeon_device *oct, u32 q_no) vfree(droq->recv_buf_list); if (droq->info_base_addr) - cnnic_free_aligned_dma(oct->pci_dev, droq->info_list, - droq->info_alloc_size, - droq->info_base_addr, - droq->info_list_dma); + lio_free_info_buffer(oct, droq); if (droq->desc_ring) lio_dma_free(oct, (droq->max_count * OCT_DROQ_DESC_SIZE), @@ -294,12 +286,7 @@ int octeon_init_droq(struct octeon_device *oct, dev_dbg(&oct->pci_dev->dev, "droq[%d]: num_desc: %d\n", q_no, droq->max_count); - droq->info_list = - cnnic_numa_alloc_aligned_dma((droq->max_count * - OCT_DROQ_INFO_SIZE), - &droq->info_alloc_size, - &droq->info_base_addr, - numa_node); + droq->info_list = lio_alloc_info_buffer(oct, droq); if (!droq->info_list) { dev_err(&oct->pci_dev->dev, "Cannot allocate memory for info list.\n"); lio_dma_free(oct, (droq->max_count * OCT_DROQ_DESC_SIZE), diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h index e62074090681..6982c0af5ecc 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h @@ -325,10 +325,10 @@ struct octeon_droq { size_t desc_ring_dma; /** Info ptr list are allocated at this virtual address. */ - size_t info_base_addr; + void *info_base_addr; /** DMA mapped address of the info list */ - size_t info_list_dma; + dma_addr_t info_list_dma; /** Allocated size of info list. */ u32 info_alloc_size; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_main.h b/drivers/net/ethernet/cavium/liquidio/octeon_main.h index aa36e9ae7676..bed9ef17bc26 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_main.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_main.h @@ -140,48 +140,6 @@ err_release_region: return 1; } -static inline void * -cnnic_numa_alloc_aligned_dma(u32 size, - u32 *alloc_size, - size_t *orig_ptr, - int numa_node) -{ - int retries = 0; - void *ptr = NULL; - -#define OCTEON_MAX_ALLOC_RETRIES 1 - do { - struct page *page = NULL; - - page = alloc_pages_node(numa_node, - GFP_KERNEL, - get_order(size)); - if (!page) - page = alloc_pages(GFP_KERNEL, - get_order(size)); - ptr = (void *)page_address(page); - if ((unsigned long)ptr & 0x07) { - __free_pages(page, get_order(size)); - ptr = NULL; - /* Increment the size required if the first - * attempt failed. - */ - if (!retries) - size += 7; - } - retries++; - } while ((retries <= OCTEON_MAX_ALLOC_RETRIES) && !ptr); - - *alloc_size = size; - *orig_ptr = (unsigned long)ptr; - if ((unsigned long)ptr & 0x07) - ptr = (void *)(((unsigned long)ptr + 7) & ~(7UL)); - return ptr; -} - -#define cnnic_free_aligned_dma(pci_dev, ptr, size, orig_ptr, dma_addr) \ - free_pages(orig_ptr, get_order(size)) - static inline int sleep_cond(wait_queue_head_t *wait_queue, int *condition) { diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h index 6bb89419006e..eef2a1e8a7e3 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h @@ -62,6 +62,9 @@ struct lio { /** Array of gather component linked lists */ struct list_head *glist; + void **glists_virt_base; + dma_addr_t *glists_dma_base; + u32 glist_entry_size; /** Pointer to the NIC properties for the Octeon device this network * interface is associated with. @@ -344,6 +347,29 @@ static inline void tx_buffer_free(void *buffer) #define lio_dma_free(oct, size, virt_addr, dma_addr) \ dma_free_coherent(&(oct)->pci_dev->dev, size, virt_addr, dma_addr) +static inline void * +lio_alloc_info_buffer(struct octeon_device *oct, + struct octeon_droq *droq) +{ + void *virt_ptr; + + virt_ptr = lio_dma_alloc(oct, (droq->max_count * OCT_DROQ_INFO_SIZE), + &droq->info_list_dma); + if (virt_ptr) { + droq->info_alloc_size = droq->max_count * OCT_DROQ_INFO_SIZE; + droq->info_base_addr = virt_ptr; + } + + return virt_ptr; +} + +static inline void lio_free_info_buffer(struct octeon_device *oct, + struct octeon_droq *droq) +{ + lio_dma_free(oct, droq->info_alloc_size, droq->info_base_addr, + droq->info_list_dma); +} + static inline void *get_rbd(struct sk_buff *skb) { @@ -359,22 +385,7 @@ void *get_rbd(struct sk_buff *skb) static inline u64 lio_map_ring_info(struct octeon_droq *droq, u32 i) { - dma_addr_t dma_addr; - struct octeon_device *oct = droq->oct_dev; - - dma_addr = dma_map_single(&oct->pci_dev->dev, &droq->info_list[i], - OCT_DROQ_INFO_SIZE, DMA_FROM_DEVICE); - - WARN_ON(dma_mapping_error(&oct->pci_dev->dev, dma_addr)); - - return (u64)dma_addr; -} - -static inline void -lio_unmap_ring_info(struct pci_dev *pci_dev, - u64 info_ptr, u32 size) -{ - dma_unmap_single(&pci_dev->dev, info_ptr, size, DMA_FROM_DEVICE); + return droq->info_list_dma + (i * sizeof(struct octeon_droq_info)); } static inline u64 diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h index e739c7153562..2269ff562d95 100644 --- a/drivers/net/ethernet/cavium/thunder/nic.h +++ b/drivers/net/ethernet/cavium/thunder/nic.h @@ -269,6 +269,7 @@ struct nicvf { #define MAX_QUEUES_PER_QSET 8 struct queue_set *qs; struct nicvf_cq_poll *napi[8]; + void *iommu_domain; u8 vf_id; u8 sqs_id; bool sqs_mode; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 6feaa24bcfd4..24017588f531 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -16,6 +16,7 @@ #include <linux/log2.h> #include <linux/prefetch.h> #include <linux/irq.h> +#include <linux/iommu.h> #include "nic_reg.h" #include "nic.h" @@ -525,7 +526,12 @@ static void nicvf_snd_pkt_handler(struct net_device *netdev, /* Get actual TSO descriptors and free them */ tso_sqe = (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, hdr->rsvd2); + nicvf_unmap_sndq_buffers(nic, sq, hdr->rsvd2, + tso_sqe->subdesc_cnt); nicvf_put_sq_desc(sq, tso_sqe->subdesc_cnt + 1); + } else { + nicvf_unmap_sndq_buffers(nic, sq, cqe_tx->sqe_ptr, + hdr->subdesc_cnt); } nicvf_put_sq_desc(sq, hdr->subdesc_cnt + 1); prefetch(skb); @@ -576,6 +582,7 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev, { struct sk_buff *skb; struct nicvf *nic = netdev_priv(netdev); + struct nicvf *snic = nic; int err = 0; int rq_idx; @@ -592,7 +599,7 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev, if (err && !cqe_rx->rb_cnt) return; - skb = nicvf_get_rcv_skb(nic, cqe_rx); + skb = nicvf_get_rcv_skb(snic, cqe_rx); if (!skb) { netdev_dbg(nic->netdev, "Packet not received\n"); return; @@ -1643,6 +1650,9 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (!pass1_silicon(nic->pdev)) nic->hw_tso = true; + /* Get iommu domain for iova to physical addr conversion */ + nic->iommu_domain = iommu_get_domain_for_dev(dev); + pci_read_config_word(nic->pdev, PCI_SUBSYSTEM_ID, &sdevid); if (sdevid == 0xA134) nic->t88 = true; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c index ac0390be3b12..f13289f0d238 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c @@ -10,6 +10,7 @@ #include <linux/netdevice.h> #include <linux/ip.h> #include <linux/etherdevice.h> +#include <linux/iommu.h> #include <net/ip.h> #include <net/tso.h> @@ -18,6 +19,16 @@ #include "q_struct.h" #include "nicvf_queues.h" +#define NICVF_PAGE_ORDER ((PAGE_SIZE <= 4096) ? PAGE_ALLOC_COSTLY_ORDER : 0) + +static inline u64 nicvf_iova_to_phys(struct nicvf *nic, dma_addr_t dma_addr) +{ + /* Translation is installed only when IOMMU is present */ + if (nic->iommu_domain) + return iommu_iova_to_phys(nic->iommu_domain, dma_addr); + return dma_addr; +} + static void nicvf_get_page(struct nicvf *nic) { if (!nic->rb_pageref || !nic->rb_page) @@ -87,7 +98,7 @@ static void nicvf_free_q_desc_mem(struct nicvf *nic, struct q_desc_mem *dmem) static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, gfp_t gfp, u32 buf_len, u64 **rbuf) { - int order = (PAGE_SIZE <= 4096) ? PAGE_ALLOC_COSTLY_ORDER : 0; + int order = NICVF_PAGE_ORDER; /* Check if request can be accomodated in previous allocated page */ if (nic->rb_page && @@ -97,22 +108,27 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, gfp_t gfp, } nicvf_get_page(nic); - nic->rb_page = NULL; /* Allocate a new page */ + nic->rb_page = alloc_pages(gfp | __GFP_COMP | __GFP_NOWARN, + order); if (!nic->rb_page) { - nic->rb_page = alloc_pages(gfp | __GFP_COMP | __GFP_NOWARN, - order); - if (!nic->rb_page) { - this_cpu_inc(nic->pnicvf->drv_stats-> - rcv_buffer_alloc_failures); - return -ENOMEM; - } - nic->rb_page_offset = 0; + this_cpu_inc(nic->pnicvf->drv_stats->rcv_buffer_alloc_failures); + return -ENOMEM; } - + nic->rb_page_offset = 0; ret: - *rbuf = (u64 *)((u64)page_address(nic->rb_page) + nic->rb_page_offset); + /* HW will ensure data coherency, CPU sync not required */ + *rbuf = (u64 *)((u64)dma_map_page_attrs(&nic->pdev->dev, nic->rb_page, + nic->rb_page_offset, buf_len, + DMA_FROM_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC)); + if (dma_mapping_error(&nic->pdev->dev, (dma_addr_t)*rbuf)) { + if (!nic->rb_page_offset) + __free_pages(nic->rb_page, order); + nic->rb_page = NULL; + return -ENOMEM; + } nic->rb_page_offset += buf_len; return 0; @@ -158,16 +174,21 @@ static int nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr, rbdr->dma_size = buf_size; rbdr->enable = true; rbdr->thresh = RBDR_THRESH; + rbdr->head = 0; + rbdr->tail = 0; nic->rb_page = NULL; for (idx = 0; idx < ring_len; idx++) { err = nicvf_alloc_rcv_buffer(nic, GFP_KERNEL, RCV_FRAG_LEN, &rbuf); - if (err) + if (err) { + /* To free already allocated and mapped ones */ + rbdr->tail = idx - 1; return err; + } desc = GET_RBDR_DESC(rbdr, idx); - desc->buf_addr = virt_to_phys(rbuf) >> NICVF_RCV_BUF_ALIGN; + desc->buf_addr = (u64)rbuf >> NICVF_RCV_BUF_ALIGN; } nicvf_get_page(nic); @@ -179,7 +200,7 @@ static int nicvf_init_rbdr(struct nicvf *nic, struct rbdr *rbdr, static void nicvf_free_rbdr(struct nicvf *nic, struct rbdr *rbdr) { int head, tail; - u64 buf_addr; + u64 buf_addr, phys_addr; struct rbdr_entry_t *desc; if (!rbdr) @@ -192,18 +213,26 @@ static void nicvf_free_rbdr(struct nicvf *nic, struct rbdr *rbdr) head = rbdr->head; tail = rbdr->tail; - /* Free SKBs */ + /* Release page references */ while (head != tail) { desc = GET_RBDR_DESC(rbdr, head); - buf_addr = desc->buf_addr << NICVF_RCV_BUF_ALIGN; - put_page(virt_to_page(phys_to_virt(buf_addr))); + buf_addr = ((u64)desc->buf_addr) << NICVF_RCV_BUF_ALIGN; + phys_addr = nicvf_iova_to_phys(nic, buf_addr); + dma_unmap_page_attrs(&nic->pdev->dev, buf_addr, RCV_FRAG_LEN, + DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); + if (phys_addr) + put_page(virt_to_page(phys_to_virt(phys_addr))); head++; head &= (rbdr->dmem.q_len - 1); } - /* Free SKB of tail desc */ + /* Release buffer of tail desc */ desc = GET_RBDR_DESC(rbdr, tail); - buf_addr = desc->buf_addr << NICVF_RCV_BUF_ALIGN; - put_page(virt_to_page(phys_to_virt(buf_addr))); + buf_addr = ((u64)desc->buf_addr) << NICVF_RCV_BUF_ALIGN; + phys_addr = nicvf_iova_to_phys(nic, buf_addr); + dma_unmap_page_attrs(&nic->pdev->dev, buf_addr, RCV_FRAG_LEN, + DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); + if (phys_addr) + put_page(virt_to_page(phys_to_virt(phys_addr))); /* Free RBDR ring */ nicvf_free_q_desc_mem(nic, &rbdr->dmem); @@ -250,7 +279,7 @@ refill: break; desc = GET_RBDR_DESC(rbdr, tail); - desc->buf_addr = virt_to_phys(rbuf) >> NICVF_RCV_BUF_ALIGN; + desc->buf_addr = (u64)rbuf >> NICVF_RCV_BUF_ALIGN; refill_rb_cnt--; new_rb++; } @@ -361,9 +390,29 @@ static int nicvf_init_snd_queue(struct nicvf *nic, return 0; } +void nicvf_unmap_sndq_buffers(struct nicvf *nic, struct snd_queue *sq, + int hdr_sqe, u8 subdesc_cnt) +{ + u8 idx; + struct sq_gather_subdesc *gather; + + /* Unmap DMA mapped skb data buffers */ + for (idx = 0; idx < subdesc_cnt; idx++) { + hdr_sqe++; + hdr_sqe &= (sq->dmem.q_len - 1); + gather = (struct sq_gather_subdesc *)GET_SQ_DESC(sq, hdr_sqe); + /* HW will ensure data coherency, CPU sync not required */ + dma_unmap_page_attrs(&nic->pdev->dev, gather->addr, + gather->size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); + } +} + static void nicvf_free_snd_queue(struct nicvf *nic, struct snd_queue *sq) { struct sk_buff *skb; + struct sq_hdr_subdesc *hdr; + struct sq_hdr_subdesc *tso_sqe; if (!sq) return; @@ -379,8 +428,22 @@ static void nicvf_free_snd_queue(struct nicvf *nic, struct snd_queue *sq) smp_rmb(); while (sq->head != sq->tail) { skb = (struct sk_buff *)sq->skbuff[sq->head]; - if (skb) - dev_kfree_skb_any(skb); + if (!skb) + goto next; + hdr = (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, sq->head); + /* Check for dummy descriptor used for HW TSO offload on 88xx */ + if (hdr->dont_send) { + /* Get actual TSO descriptors and unmap them */ + tso_sqe = + (struct sq_hdr_subdesc *)GET_SQ_DESC(sq, hdr->rsvd2); + nicvf_unmap_sndq_buffers(nic, sq, hdr->rsvd2, + tso_sqe->subdesc_cnt); + } else { + nicvf_unmap_sndq_buffers(nic, sq, sq->head, + hdr->subdesc_cnt); + } + dev_kfree_skb_any(skb); +next: sq->head++; sq->head &= (sq->dmem.q_len - 1); } @@ -559,9 +622,11 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs, nicvf_send_msg_to_pf(nic, &mbx); if (!nic->sqs_mode && (qidx == 0)) { - /* Enable checking L3/L4 length and TCP/UDP checksums */ + /* Enable checking L3/L4 length and TCP/UDP checksums + * Also allow IPv6 pkts with zero UDP checksum. + */ nicvf_queue_reg_write(nic, NIC_QSET_RQ_GEN_CFG, 0, - (BIT(24) | BIT(23) | BIT(21))); + (BIT(24) | BIT(23) | BIT(21) | BIT(20))); nicvf_config_vlan_stripping(nic, nic->netdev->features); } @@ -882,6 +947,14 @@ static inline int nicvf_get_sq_desc(struct snd_queue *sq, int desc_cnt) return qentry; } +/* Rollback to previous tail pointer when descriptors not used */ +static inline void nicvf_rollback_sq_desc(struct snd_queue *sq, + int qentry, int desc_cnt) +{ + sq->tail = qentry; + atomic_add(desc_cnt, &sq->free_cnt); +} + /* Free descriptor back to SQ for future use */ void nicvf_put_sq_desc(struct snd_queue *sq, int desc_cnt) { @@ -1207,8 +1280,9 @@ int nicvf_sq_append_skb(struct nicvf *nic, struct snd_queue *sq, struct sk_buff *skb, u8 sq_num) { int i, size; - int subdesc_cnt, tso_sqe = 0; + int subdesc_cnt, hdr_sqe = 0; int qentry; + u64 dma_addr; subdesc_cnt = nicvf_sq_subdesc_required(nic, skb); if (subdesc_cnt > atomic_read(&sq->free_cnt)) @@ -1223,12 +1297,21 @@ int nicvf_sq_append_skb(struct nicvf *nic, struct snd_queue *sq, /* Add SQ header subdesc */ nicvf_sq_add_hdr_subdesc(nic, sq, qentry, subdesc_cnt - 1, skb, skb->len); - tso_sqe = qentry; + hdr_sqe = qentry; /* Add SQ gather subdescs */ qentry = nicvf_get_nxt_sqentry(sq, qentry); size = skb_is_nonlinear(skb) ? skb_headlen(skb) : skb->len; - nicvf_sq_add_gather_subdesc(sq, qentry, size, virt_to_phys(skb->data)); + /* HW will ensure data coherency, CPU sync not required */ + dma_addr = dma_map_page_attrs(&nic->pdev->dev, virt_to_page(skb->data), + offset_in_page(skb->data), size, + DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); + if (dma_mapping_error(&nic->pdev->dev, dma_addr)) { + nicvf_rollback_sq_desc(sq, qentry, subdesc_cnt); + return 0; + } + + nicvf_sq_add_gather_subdesc(sq, qentry, size, dma_addr); /* Check for scattered buffer */ if (!skb_is_nonlinear(skb)) @@ -1241,15 +1324,26 @@ int nicvf_sq_append_skb(struct nicvf *nic, struct snd_queue *sq, qentry = nicvf_get_nxt_sqentry(sq, qentry); size = skb_frag_size(frag); - nicvf_sq_add_gather_subdesc(sq, qentry, size, - virt_to_phys( - skb_frag_address(frag))); + dma_addr = dma_map_page_attrs(&nic->pdev->dev, + skb_frag_page(frag), + frag->page_offset, size, + DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); + if (dma_mapping_error(&nic->pdev->dev, dma_addr)) { + /* Free entire chain of mapped buffers + * here 'i' = frags mapped + above mapped skb->data + */ + nicvf_unmap_sndq_buffers(nic, sq, hdr_sqe, i); + nicvf_rollback_sq_desc(sq, qentry, subdesc_cnt); + return 0; + } + nicvf_sq_add_gather_subdesc(sq, qentry, size, dma_addr); } doorbell: if (nic->t88 && skb_shinfo(skb)->gso_size) { qentry = nicvf_get_nxt_sqentry(sq, qentry); - nicvf_sq_add_cqe_subdesc(sq, qentry, tso_sqe, skb); + nicvf_sq_add_cqe_subdesc(sq, qentry, hdr_sqe, skb); } nicvf_sq_doorbell(nic, skb, sq_num, subdesc_cnt); @@ -1282,6 +1376,7 @@ struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic, struct cqe_rx_t *cqe_rx) int offset; u16 *rb_lens = NULL; u64 *rb_ptrs = NULL; + u64 phys_addr; rb_lens = (void *)cqe_rx + (3 * sizeof(u64)); /* Except 88xx pass1 on all other chips CQE_RX2_S is added to @@ -1296,15 +1391,23 @@ struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic, struct cqe_rx_t *cqe_rx) else rb_ptrs = (void *)cqe_rx + (7 * sizeof(u64)); - netdev_dbg(nic->netdev, "%s rb_cnt %d rb0_ptr %llx rb0_sz %d\n", - __func__, cqe_rx->rb_cnt, cqe_rx->rb0_ptr, cqe_rx->rb0_sz); - for (frag = 0; frag < cqe_rx->rb_cnt; frag++) { payload_len = rb_lens[frag_num(frag)]; + phys_addr = nicvf_iova_to_phys(nic, *rb_ptrs); + if (!phys_addr) { + if (skb) + dev_kfree_skb_any(skb); + return NULL; + } + if (!frag) { /* First fragment */ + dma_unmap_page_attrs(&nic->pdev->dev, + *rb_ptrs - cqe_rx->align_pad, + RCV_FRAG_LEN, DMA_FROM_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); skb = nicvf_rb_ptr_to_skb(nic, - *rb_ptrs - cqe_rx->align_pad, + phys_addr - cqe_rx->align_pad, payload_len); if (!skb) return NULL; @@ -1312,8 +1415,11 @@ struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic, struct cqe_rx_t *cqe_rx) skb_put(skb, payload_len); } else { /* Add fragments */ - page = virt_to_page(phys_to_virt(*rb_ptrs)); - offset = phys_to_virt(*rb_ptrs) - page_address(page); + dma_unmap_page_attrs(&nic->pdev->dev, *rb_ptrs, + RCV_FRAG_LEN, DMA_FROM_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); + page = virt_to_page(phys_to_virt(phys_addr)); + offset = phys_to_virt(phys_addr) - page_address(page); skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, payload_len, RCV_FRAG_LEN); } diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h index 5cb84da99a2d..10cb4b84625b 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h @@ -87,7 +87,7 @@ #define RCV_BUF_COUNT (1ULL << (RBDR_SIZE + 13)) #define MAX_RCV_BUF_COUNT (1ULL << (RBDR_SIZE6 + 13)) #define RBDR_THRESH (RCV_BUF_COUNT / 2) -#define DMA_BUFFER_LEN 2048 /* In multiples of 128bytes */ +#define DMA_BUFFER_LEN 1536 /* In multiples of 128bytes */ #define RCV_FRAG_LEN (SKB_DATA_ALIGN(DMA_BUFFER_LEN + NET_SKB_PAD) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) @@ -301,6 +301,8 @@ struct queue_set { #define CQ_ERR_MASK (CQ_WR_FULL | CQ_WR_DISABLE | CQ_WR_FAULT) +void nicvf_unmap_sndq_buffers(struct nicvf *nic, struct snd_queue *sq, + int hdr_sqe, u8 subdesc_cnt); void nicvf_config_vlan_stripping(struct nicvf *nic, netdev_features_t features); int nicvf_set_qset_resources(struct nicvf *nic); diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index 4c8e8cf730bb..64a1095e4d14 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -123,14 +123,44 @@ static int bgx_poll_reg(struct bgx *bgx, u8 lmac, u64 reg, u64 mask, bool zero) return 1; } +static int max_bgx_per_node; +static void set_max_bgx_per_node(struct pci_dev *pdev) +{ + u16 sdevid; + + if (max_bgx_per_node) + return; + + pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &sdevid); + switch (sdevid) { + case PCI_SUBSYS_DEVID_81XX_BGX: + max_bgx_per_node = MAX_BGX_PER_CN81XX; + break; + case PCI_SUBSYS_DEVID_83XX_BGX: + max_bgx_per_node = MAX_BGX_PER_CN83XX; + break; + case PCI_SUBSYS_DEVID_88XX_BGX: + default: + max_bgx_per_node = MAX_BGX_PER_CN88XX; + break; + } +} + +static struct bgx *get_bgx(int node, int bgx_idx) +{ + int idx = (node * max_bgx_per_node) + bgx_idx; + + return bgx_vnic[idx]; +} + /* Return number of BGX present in HW */ unsigned bgx_get_map(int node) { int i; unsigned map = 0; - for (i = 0; i < MAX_BGX_PER_NODE; i++) { - if (bgx_vnic[(node * MAX_BGX_PER_NODE) + i]) + for (i = 0; i < max_bgx_per_node; i++) { + if (bgx_vnic[(node * max_bgx_per_node) + i]) map |= (1 << i); } @@ -143,7 +173,7 @@ int bgx_get_lmac_count(int node, int bgx_idx) { struct bgx *bgx; - bgx = bgx_vnic[(node * MAX_BGX_PER_NODE) + bgx_idx]; + bgx = get_bgx(node, bgx_idx); if (bgx) return bgx->lmac_count; @@ -158,7 +188,7 @@ void bgx_get_lmac_link_state(int node, int bgx_idx, int lmacid, void *status) struct bgx *bgx; struct lmac *lmac; - bgx = bgx_vnic[(node * MAX_BGX_PER_NODE) + bgx_idx]; + bgx = get_bgx(node, bgx_idx); if (!bgx) return; @@ -172,7 +202,7 @@ EXPORT_SYMBOL(bgx_get_lmac_link_state); const u8 *bgx_get_lmac_mac(int node, int bgx_idx, int lmacid) { - struct bgx *bgx = bgx_vnic[(node * MAX_BGX_PER_NODE) + bgx_idx]; + struct bgx *bgx = get_bgx(node, bgx_idx); if (bgx) return bgx->lmac[lmacid].mac; @@ -183,7 +213,7 @@ EXPORT_SYMBOL(bgx_get_lmac_mac); void bgx_set_lmac_mac(int node, int bgx_idx, int lmacid, const u8 *mac) { - struct bgx *bgx = bgx_vnic[(node * MAX_BGX_PER_NODE) + bgx_idx]; + struct bgx *bgx = get_bgx(node, bgx_idx); if (!bgx) return; @@ -194,7 +224,7 @@ EXPORT_SYMBOL(bgx_set_lmac_mac); void bgx_lmac_rx_tx_enable(int node, int bgx_idx, int lmacid, bool enable) { - struct bgx *bgx = bgx_vnic[(node * MAX_BGX_PER_NODE) + bgx_idx]; + struct bgx *bgx = get_bgx(node, bgx_idx); struct lmac *lmac; u64 cfg; @@ -217,7 +247,7 @@ EXPORT_SYMBOL(bgx_lmac_rx_tx_enable); void bgx_lmac_get_pfc(int node, int bgx_idx, int lmacid, void *pause) { struct pfc *pfc = (struct pfc *)pause; - struct bgx *bgx = bgx_vnic[(node * MAX_BGX_PER_CN88XX) + bgx_idx]; + struct bgx *bgx = get_bgx(node, bgx_idx); struct lmac *lmac; u64 cfg; @@ -237,7 +267,7 @@ EXPORT_SYMBOL(bgx_lmac_get_pfc); void bgx_lmac_set_pfc(int node, int bgx_idx, int lmacid, void *pause) { struct pfc *pfc = (struct pfc *)pause; - struct bgx *bgx = bgx_vnic[(node * MAX_BGX_PER_CN88XX) + bgx_idx]; + struct bgx *bgx = get_bgx(node, bgx_idx); struct lmac *lmac; u64 cfg; @@ -369,7 +399,7 @@ u64 bgx_get_rx_stats(int node, int bgx_idx, int lmac, int idx) { struct bgx *bgx; - bgx = bgx_vnic[(node * MAX_BGX_PER_NODE) + bgx_idx]; + bgx = get_bgx(node, bgx_idx); if (!bgx) return 0; @@ -383,7 +413,7 @@ u64 bgx_get_tx_stats(int node, int bgx_idx, int lmac, int idx) { struct bgx *bgx; - bgx = bgx_vnic[(node * MAX_BGX_PER_NODE) + bgx_idx]; + bgx = get_bgx(node, bgx_idx); if (!bgx) return 0; @@ -411,7 +441,7 @@ void bgx_lmac_internal_loopback(int node, int bgx_idx, struct lmac *lmac; u64 cfg; - bgx = bgx_vnic[(node * MAX_BGX_PER_NODE) + bgx_idx]; + bgx = get_bgx(node, bgx_idx); if (!bgx) return; @@ -1011,12 +1041,6 @@ static void bgx_print_qlm_mode(struct bgx *bgx, u8 lmacid) dev_info(dev, "%s: 40G_KR4\n", (char *)str); break; case BGX_MODE_QSGMII: - if ((lmacid == 0) && - (bgx_get_lane2sds_cfg(bgx, lmac) != lmacid)) - return; - if ((lmacid == 2) && - (bgx_get_lane2sds_cfg(bgx, lmac) == lmacid)) - return; dev_info(dev, "%s: QSGMII\n", (char *)str); break; case BGX_MODE_RGMII: @@ -1334,11 +1358,13 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_release_regions; } + set_max_bgx_per_node(pdev); + pci_read_config_word(pdev, PCI_DEVICE_ID, &sdevid); if (sdevid != PCI_DEVICE_ID_THUNDER_RGX) { bgx->bgx_id = (pci_resource_start(pdev, PCI_CFG_REG_BAR_NUM) >> 24) & BGX_ID_MASK; - bgx->bgx_id += nic_get_node_id(pdev) * MAX_BGX_PER_NODE; + bgx->bgx_id += nic_get_node_id(pdev) * max_bgx_per_node; bgx->max_lmac = MAX_LMAC_PER_BGX; bgx_vnic[bgx->bgx_id] = bgx; } else { diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h index a60f189429bb..c5080f2cead5 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h @@ -22,7 +22,6 @@ #define MAX_BGX_PER_CN88XX 2 #define MAX_BGX_PER_CN81XX 3 /* 2 BGXs + 1 RGX */ #define MAX_BGX_PER_CN83XX 4 -#define MAX_BGX_PER_NODE 4 #define MAX_LMAC_PER_BGX 4 #define MAX_BGX_CHANS_PER_LMAC 16 #define MAX_DMAC_PER_LMAC 8 diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 275c2e2349ad..c44036d5761a 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -2589,8 +2589,6 @@ static int emac_dt_mdio_probe(struct emac_instance *dev) static int emac_dt_phy_connect(struct emac_instance *dev, struct device_node *phy_handle) { - int res; - dev->phy.def = devm_kzalloc(&dev->ofdev->dev, sizeof(*dev->phy.def), GFP_KERNEL); if (!dev->phy.def) @@ -2617,7 +2615,7 @@ static int emac_dt_phy_probe(struct emac_instance *dev) { struct device_node *np = dev->ofdev->dev.of_node; struct device_node *phy_handle; - int res = 0; + int res = 1; phy_handle = of_parse_phandle(np, "phy-handle", 0); @@ -2714,13 +2712,24 @@ static int emac_init_phy(struct emac_instance *dev) if (emac_has_feature(dev, EMAC_FTR_HAS_RGMII)) { int res = emac_dt_phy_probe(dev); - mutex_unlock(&emac_phy_map_lock); - if (!res) + switch (res) { + case 1: + /* No phy-handle property configured. + * Continue with the existing phy probe + * and setup code. + */ + break; + + case 0: + mutex_unlock(&emac_phy_map_lock); goto init_phy; - dev_err(&dev->ofdev->dev, "failed to attach dt phy (%d).\n", - res); - return res; + default: + mutex_unlock(&emac_phy_map_lock); + dev_err(&dev->ofdev->dev, "failed to attach dt phy (%d).\n", + res); + return res; + } } if (dev->phy_address != 0xffffffff) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 9198e6bd5160..5f11b4dc95d2 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -404,7 +404,7 @@ static int ibmvnic_open(struct net_device *netdev) send_map_query(adapter); for (i = 0; i < rxadd_subcrqs; i++) { init_rx_pool(adapter, &adapter->rx_pool[i], - IBMVNIC_BUFFS_PER_POOL, i, + adapter->req_rx_add_entries_per_subcrq, i, be64_to_cpu(size_array[i]), 1); if (alloc_rx_pool(adapter, &adapter->rx_pool[i])) { dev_err(dev, "Couldn't alloc rx pool\n"); @@ -419,23 +419,23 @@ static int ibmvnic_open(struct net_device *netdev) for (i = 0; i < tx_subcrqs; i++) { tx_pool = &adapter->tx_pool[i]; tx_pool->tx_buff = - kcalloc(adapter->max_tx_entries_per_subcrq, + kcalloc(adapter->req_tx_entries_per_subcrq, sizeof(struct ibmvnic_tx_buff), GFP_KERNEL); if (!tx_pool->tx_buff) goto tx_pool_alloc_failed; if (alloc_long_term_buff(adapter, &tx_pool->long_term_buff, - adapter->max_tx_entries_per_subcrq * + adapter->req_tx_entries_per_subcrq * adapter->req_mtu)) goto tx_ltb_alloc_failed; tx_pool->free_map = - kcalloc(adapter->max_tx_entries_per_subcrq, + kcalloc(adapter->req_tx_entries_per_subcrq, sizeof(int), GFP_KERNEL); if (!tx_pool->free_map) goto tx_fm_alloc_failed; - for (j = 0; j < adapter->max_tx_entries_per_subcrq; j++) + for (j = 0; j < adapter->req_tx_entries_per_subcrq; j++) tx_pool->free_map[j] = j; tx_pool->consumer_index = 0; @@ -705,6 +705,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) u8 *hdrs = (u8 *)&adapter->tx_rx_desc_req; struct device *dev = &adapter->vdev->dev; struct ibmvnic_tx_buff *tx_buff = NULL; + struct ibmvnic_sub_crq_queue *tx_scrq; struct ibmvnic_tx_pool *tx_pool; unsigned int tx_send_failed = 0; unsigned int tx_map_failed = 0; @@ -724,6 +725,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) int ret = 0; tx_pool = &adapter->tx_pool[queue_num]; + tx_scrq = adapter->tx_scrq[queue_num]; txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb)); handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + be32_to_cpu(adapter->login_rsp_buf-> @@ -744,7 +746,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_pool->consumer_index = (tx_pool->consumer_index + 1) % - adapter->max_tx_entries_per_subcrq; + adapter->req_tx_entries_per_subcrq; tx_buff = &tx_pool->tx_buff[index]; tx_buff->skb = skb; @@ -817,7 +819,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) if (tx_pool->consumer_index == 0) tx_pool->consumer_index = - adapter->max_tx_entries_per_subcrq - 1; + adapter->req_tx_entries_per_subcrq - 1; else tx_pool->consumer_index--; @@ -826,6 +828,14 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) ret = NETDEV_TX_BUSY; goto out; } + + atomic_inc(&tx_scrq->used); + + if (atomic_read(&tx_scrq->used) >= adapter->req_tx_entries_per_subcrq) { + netdev_info(netdev, "Stopping queue %d\n", queue_num); + netif_stop_subqueue(netdev, queue_num); + } + tx_packets++; tx_bytes += skb->len; txq->trans_start = jiffies; @@ -1213,6 +1223,7 @@ static struct ibmvnic_sub_crq_queue *init_sub_crq_queue(struct ibmvnic_adapter scrq->adapter = adapter; scrq->size = 4 * PAGE_SIZE / sizeof(*scrq->msgs); scrq->cur = 0; + atomic_set(&scrq->used, 0); scrq->rx_skb_top = NULL; spin_lock_init(&scrq->lock); @@ -1355,14 +1366,28 @@ restart_loop: DMA_TO_DEVICE); } - if (txbuff->last_frag) + if (txbuff->last_frag) { + atomic_dec(&scrq->used); + + if (atomic_read(&scrq->used) <= + (adapter->req_tx_entries_per_subcrq / 2) && + netif_subqueue_stopped(adapter->netdev, + txbuff->skb)) { + netif_wake_subqueue(adapter->netdev, + scrq->pool_index); + netdev_dbg(adapter->netdev, + "Started queue %d\n", + scrq->pool_index); + } + dev_kfree_skb_any(txbuff->skb); + } adapter->tx_pool[pool].free_map[adapter->tx_pool[pool]. producer_index] = index; adapter->tx_pool[pool].producer_index = (adapter->tx_pool[pool].producer_index + 1) % - adapter->max_tx_entries_per_subcrq; + adapter->req_tx_entries_per_subcrq; } /* remove tx_comp scrq*/ next->tx_comp.first = 0; diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 422824f1f42a..1993b42666f7 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -863,6 +863,7 @@ struct ibmvnic_sub_crq_queue { spinlock_t lock; struct sk_buff *rx_skb_top; struct ibmvnic_adapter *adapter; + atomic_t used; }; struct ibmvnic_long_term_buff { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index ddb4ca4ff930..117170014e88 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -14,6 +14,7 @@ config MLX5_CORE config MLX5_CORE_EN bool "Mellanox Technologies ConnectX-4 Ethernet support" depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE + depends on IPV6=y || IPV6=n || MLX5_CORE=m imply PTP_1588_CLOCK default n ---help--- diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 0523ed47f597..8fa23f6a1f67 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -302,6 +302,9 @@ static u8 mlx5e_dcbnl_setdcbx(struct net_device *dev, u8 mode) struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5e_dcbx *dcbx = &priv->dcbx; + if (mode & DCB_CAP_DCBX_LLD_MANAGED) + return 1; + if ((!mode) && MLX5_CAP_GEN(priv->mdev, dcbx)) { if (dcbx->mode == MLX5E_DCBX_PARAM_VER_OPER_AUTO) return 0; @@ -315,13 +318,10 @@ static u8 mlx5e_dcbnl_setdcbx(struct net_device *dev, u8 mode) return 1; } - if (mlx5e_dcbnl_switch_to_host_mode(netdev_priv(dev))) + if (!(mode & DCB_CAP_DCBX_HOST)) return 1; - if ((mode & DCB_CAP_DCBX_LLD_MANAGED) || - !(mode & DCB_CAP_DCBX_VER_CEE) || - !(mode & DCB_CAP_DCBX_VER_IEEE) || - !(mode & DCB_CAP_DCBX_HOST)) + if (mlx5e_dcbnl_switch_to_host_mode(netdev_priv(dev))) return 1; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 31e3cb7ee5fe..5621dcfda4f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -204,9 +204,6 @@ mlx5e_test_loopback_validate(struct sk_buff *skb, struct iphdr *iph; /* We are only going to peek, no need to clone the SKB */ - if (skb->protocol != htons(ETH_P_IP)) - goto out; - if (MLX5E_TEST_PKT_SIZE - ETH_HLEN > skb_headlen(skb)) goto out; @@ -249,7 +246,7 @@ static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv, lbtp->loopback_ok = false; init_completion(&lbtp->comp); - lbtp->pt.type = htons(ETH_P_ALL); + lbtp->pt.type = htons(ETH_P_IP); lbtp->pt.func = mlx5e_test_loopback_validate; lbtp->pt.dev = priv->netdev; lbtp->pt.af_packet_priv = lbtp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 44406a5ec15d..79481f4cf264 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -48,9 +48,14 @@ #include "eswitch.h" #include "vxlan.h" +enum { + MLX5E_TC_FLOW_ESWITCH = BIT(0), +}; + struct mlx5e_tc_flow { struct rhash_head node; u64 cookie; + u8 flags; struct mlx5_flow_handle *rule; struct list_head encap; /* flows sharing the same encap */ struct mlx5_esw_flow_attr *attr; @@ -177,7 +182,7 @@ static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, mlx5_fc_destroy(priv->mdev, counter); } - if (esw && esw->mode == SRIOV_OFFLOADS) { + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) { mlx5_eswitch_del_vlan_action(esw, flow->attr); if (flow->attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) mlx5e_detach_encap(priv, flow); @@ -598,6 +603,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, } static int parse_cls_flower(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, struct mlx5_flow_spec *spec, struct tc_cls_flower_offload *f) { @@ -609,7 +615,7 @@ static int parse_cls_flower(struct mlx5e_priv *priv, err = __parse_cls_flower(priv, spec, f, &min_inline); - if (!err && esw->mode == SRIOV_OFFLOADS && + if (!err && (flow->flags & MLX5E_TC_FLOW_ESWITCH) && rep->vport != FDB_UPLINK_VPORT) { if (min_inline > esw->offloads.inline_mode) { netdev_warn(priv->netdev, @@ -1132,23 +1138,19 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol, struct tc_cls_flower_offload *f) { struct mlx5e_tc_table *tc = &priv->fs.tc; - int err = 0; - bool fdb_flow = false; + int err, attr_size = 0; u32 flow_tag, action; struct mlx5e_tc_flow *flow; struct mlx5_flow_spec *spec; struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + u8 flow_flags = 0; - if (esw && esw->mode == SRIOV_OFFLOADS) - fdb_flow = true; - - if (fdb_flow) - flow = kzalloc(sizeof(*flow) + - sizeof(struct mlx5_esw_flow_attr), - GFP_KERNEL); - else - flow = kzalloc(sizeof(*flow), GFP_KERNEL); + if (esw && esw->mode == SRIOV_OFFLOADS) { + flow_flags = MLX5E_TC_FLOW_ESWITCH; + attr_size = sizeof(struct mlx5_esw_flow_attr); + } + flow = kzalloc(sizeof(*flow) + attr_size, GFP_KERNEL); spec = mlx5_vzalloc(sizeof(*spec)); if (!spec || !flow) { err = -ENOMEM; @@ -1156,12 +1158,13 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol, } flow->cookie = f->cookie; + flow->flags = flow_flags; - err = parse_cls_flower(priv, spec, f); + err = parse_cls_flower(priv, flow, spec, f); if (err < 0) goto err_free; - if (fdb_flow) { + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) { flow->attr = (struct mlx5_esw_flow_attr *)(flow + 1); err = parse_tc_fdb_actions(priv, f->exts, flow); if (err < 0) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2478516a61e2..ded27bb9a3b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1136,7 +1136,7 @@ static struct mlx5_flow_group *create_autogroup(struct mlx5_flow_table *ft, u32 *match_criteria) { int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); - struct list_head *prev = ft->node.children.prev; + struct list_head *prev = &ft->node.children; unsigned int candidate_index = 0; struct mlx5_flow_group *fg; void *match_criteria_addr; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c4242a4e8130..e2bd600d19de 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1352,6 +1352,7 @@ static int init_one(struct pci_dev *pdev, if (err) goto clean_load; + pci_save_state(pdev); return 0; clean_load: @@ -1407,9 +1408,8 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, mlx5_enter_error_state(dev); mlx5_unload_one(dev, priv, false); - /* In case of kernel call save the pci state and drain the health wq */ + /* In case of kernel call drain the health wq */ if (state) { - pci_save_state(pdev); mlx5_drain_health_wq(dev); mlx5_pci_disable_device(dev); } @@ -1461,6 +1461,7 @@ static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev) pci_set_master(pdev); pci_restore_state(pdev); + pci_save_state(pdev); if (wait_vital(pdev)) { dev_err(&pdev->dev, "%s: wait_vital timed out\n", __func__); diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 0899e2d310e2..d9616daf8a70 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -769,7 +769,7 @@ static inline void mlxsw_reg_spvid_pack(char *payload, u8 local_port, u16 pvid) #define MLXSW_REG_SPVM_ID 0x200F #define MLXSW_REG_SPVM_BASE_LEN 0x04 /* base length, without records */ #define MLXSW_REG_SPVM_REC_LEN 0x04 /* record length */ -#define MLXSW_REG_SPVM_REC_MAX_COUNT 256 +#define MLXSW_REG_SPVM_REC_MAX_COUNT 255 #define MLXSW_REG_SPVM_LEN (MLXSW_REG_SPVM_BASE_LEN + \ MLXSW_REG_SPVM_REC_LEN * MLXSW_REG_SPVM_REC_MAX_COUNT) @@ -1702,7 +1702,7 @@ static inline void mlxsw_reg_sfmr_pack(char *payload, #define MLXSW_REG_SPVMLR_ID 0x2020 #define MLXSW_REG_SPVMLR_BASE_LEN 0x04 /* base length, without records */ #define MLXSW_REG_SPVMLR_REC_LEN 0x04 /* record length */ -#define MLXSW_REG_SPVMLR_REC_MAX_COUNT 256 +#define MLXSW_REG_SPVMLR_REC_MAX_COUNT 255 #define MLXSW_REG_SPVMLR_LEN (MLXSW_REG_SPVMLR_BASE_LEN + \ MLXSW_REG_SPVMLR_REC_LEN * \ MLXSW_REG_SPVMLR_REC_MAX_COUNT) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 22ab42925377..ae6cccc666e4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -303,11 +303,11 @@ void mlxsw_sp_flower_destroy(struct mlxsw_sp_port *mlxsw_sp_port, bool ingress, ruleset = mlxsw_sp_acl_ruleset_get(mlxsw_sp, mlxsw_sp_port->dev, ingress, MLXSW_SP_ACL_PROFILE_FLOWER); - if (WARN_ON(IS_ERR(ruleset))) + if (IS_ERR(ruleset)) return; rule = mlxsw_sp_acl_rule_lookup(mlxsw_sp, ruleset, f->cookie); - if (!WARN_ON(!rule)) { + if (rule) { mlxsw_sp_acl_rule_del(mlxsw_sp, rule); mlxsw_sp_acl_rule_destroy(mlxsw_sp, rule); } diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index d42d03df751a..7e3a6fed3da6 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -422,8 +422,9 @@ static void qed_cxt_set_proto_cid_count(struct qed_hwfn *p_hwfn, u32 page_sz = p_mgr->clients[ILT_CLI_CDUC].p_size.val; u32 cxt_size = CONN_CXT_SIZE(p_hwfn); u32 elems_per_page = ILT_PAGE_IN_BYTES(page_sz) / cxt_size; + u32 align = elems_per_page * DQ_RANGE_ALIGN; - p_conn->cid_count = roundup(p_conn->cid_count, elems_per_page); + p_conn->cid_count = roundup(p_conn->cid_count, align); } } diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index e2a081ceaf52..e518f914eab1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -2389,9 +2389,8 @@ qed_chain_alloc_sanity_check(struct qed_dev *cdev, * size/capacity fields are of a u32 type. */ if ((cnt_type == QED_CHAIN_CNT_TYPE_U16 && - chain_size > 0x10000) || - (cnt_type == QED_CHAIN_CNT_TYPE_U32 && - chain_size > 0x100000000ULL)) { + chain_size > ((u32)U16_MAX + 1)) || + (cnt_type == QED_CHAIN_CNT_TYPE_U32 && chain_size > U32_MAX)) { DP_NOTICE(cdev, "The actual chain size (0x%llx) is larger than the maximal possible value\n", chain_size); diff --git a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c index 3a44d6b395fa..098766f7fe88 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c @@ -190,6 +190,9 @@ qed_sp_iscsi_func_start(struct qed_hwfn *p_hwfn, p_init->num_sq_pages_in_ring = p_params->num_sq_pages_in_ring; p_init->num_r2tq_pages_in_ring = p_params->num_r2tq_pages_in_ring; p_init->num_uhq_pages_in_ring = p_params->num_uhq_pages_in_ring; + p_init->ooo_enable = p_params->ooo_enable; + p_init->ll2_rx_queue_id = p_hwfn->hw_info.resc_start[QED_LL2_QUEUE] + + p_params->ll2_ooo_queue_id; p_init->func_params.log_page_size = p_params->log_page_size; val = p_params->num_tasks; p_init->func_params.num_tasks = cpu_to_le16(val); @@ -786,6 +789,23 @@ static void qed_iscsi_release_connection(struct qed_hwfn *p_hwfn, spin_unlock_bh(&p_hwfn->p_iscsi_info->lock); } +void qed_iscsi_free_connection(struct qed_hwfn *p_hwfn, + struct qed_iscsi_conn *p_conn) +{ + qed_chain_free(p_hwfn->cdev, &p_conn->xhq); + qed_chain_free(p_hwfn->cdev, &p_conn->uhq); + qed_chain_free(p_hwfn->cdev, &p_conn->r2tq); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct tcp_upload_params), + p_conn->tcp_upload_params_virt_addr, + p_conn->tcp_upload_params_phys_addr); + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + sizeof(struct scsi_terminate_extra_params), + p_conn->queue_cnts_virt_addr, + p_conn->queue_cnts_phys_addr); + kfree(p_conn); +} + struct qed_iscsi_info *qed_iscsi_alloc(struct qed_hwfn *p_hwfn) { struct qed_iscsi_info *p_iscsi_info; @@ -807,6 +827,17 @@ void qed_iscsi_setup(struct qed_hwfn *p_hwfn, void qed_iscsi_free(struct qed_hwfn *p_hwfn, struct qed_iscsi_info *p_iscsi_info) { + struct qed_iscsi_conn *p_conn = NULL; + + while (!list_empty(&p_hwfn->p_iscsi_info->free_list)) { + p_conn = list_first_entry(&p_hwfn->p_iscsi_info->free_list, + struct qed_iscsi_conn, list_entry); + if (p_conn) { + list_del(&p_conn->list_entry); + qed_iscsi_free_connection(p_hwfn, p_conn); + } + } + kfree(p_iscsi_info); } diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 9a0b9af10a57..0d3cef409c96 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -211,6 +211,8 @@ static void qed_ll2b_complete_rx_packet(struct qed_hwfn *p_hwfn, /* If need to reuse or there's no replacement buffer, repost this */ if (rc) goto out_post; + dma_unmap_single(&cdev->pdev->dev, buffer->phys_addr, + cdev->ll2->rx_size, DMA_FROM_DEVICE); skb = build_skb(buffer->data, 0); if (!skb) { @@ -474,7 +476,7 @@ qed_ll2_rxq_completion_gsi(struct qed_hwfn *p_hwfn, static int qed_ll2_rxq_completion_reg(struct qed_hwfn *p_hwfn, struct qed_ll2_info *p_ll2_conn, union core_rx_cqe_union *p_cqe, - unsigned long lock_flags, + unsigned long *p_lock_flags, bool b_last_cqe) { struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; @@ -495,10 +497,10 @@ static int qed_ll2_rxq_completion_reg(struct qed_hwfn *p_hwfn, "Mismatch between active_descq and the LL2 Rx chain\n"); list_add_tail(&p_pkt->list_entry, &p_rx->free_descq); - spin_unlock_irqrestore(&p_rx->lock, lock_flags); + spin_unlock_irqrestore(&p_rx->lock, *p_lock_flags); qed_ll2b_complete_rx_packet(p_hwfn, p_ll2_conn->my_id, p_pkt, &p_cqe->rx_cqe_fp, b_last_cqe); - spin_lock_irqsave(&p_rx->lock, lock_flags); + spin_lock_irqsave(&p_rx->lock, *p_lock_flags); return 0; } @@ -538,7 +540,8 @@ static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie) break; case CORE_RX_CQE_TYPE_REGULAR: rc = qed_ll2_rxq_completion_reg(p_hwfn, p_ll2_conn, - cqe, flags, b_last_cqe); + cqe, &flags, + b_last_cqe); break; default: rc = -EIO; @@ -968,7 +971,7 @@ static int qed_ll2_start_ooo(struct qed_dev *cdev, { struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); u8 *handle = &hwfn->pf_params.iscsi_pf_params.ll2_ooo_queue_id; - struct qed_ll2_conn ll2_info; + struct qed_ll2_conn ll2_info = { 0 }; int rc; ll2_info.conn_type = QED_LL2_TYPE_ISCSI_OOO; diff --git a/drivers/net/ethernet/qlogic/qed/qed_ooo.c b/drivers/net/ethernet/qlogic/qed/qed_ooo.c index 7d731c6cb892..378afce58b3f 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ooo.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ooo.c @@ -159,6 +159,8 @@ struct qed_ooo_info *qed_ooo_alloc(struct qed_hwfn *p_hwfn) if (!p_ooo_info->ooo_history.p_cqes) goto no_history_mem; + p_ooo_info->ooo_history.num_of_cqes = QED_MAX_NUM_OOO_HISTORY_ENTRIES; + return p_ooo_info; no_history_mem: diff --git a/drivers/net/ethernet/smsc/smc91x.c b/drivers/net/ethernet/smsc/smc91x.c index 65077c77082a..91e9bd7159ab 100644 --- a/drivers/net/ethernet/smsc/smc91x.c +++ b/drivers/net/ethernet/smsc/smc91x.c @@ -1535,32 +1535,33 @@ static int smc_close(struct net_device *dev) * Ethtool support */ static int -smc_ethtool_getsettings(struct net_device *dev, struct ethtool_cmd *cmd) +smc_ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) { struct smc_local *lp = netdev_priv(dev); int ret; - cmd->maxtxpkt = 1; - cmd->maxrxpkt = 1; - if (lp->phy_type != 0) { spin_lock_irq(&lp->lock); - ret = mii_ethtool_gset(&lp->mii, cmd); + ret = mii_ethtool_get_link_ksettings(&lp->mii, cmd); spin_unlock_irq(&lp->lock); } else { - cmd->supported = SUPPORTED_10baseT_Half | + u32 supported = SUPPORTED_10baseT_Half | SUPPORTED_10baseT_Full | SUPPORTED_TP | SUPPORTED_AUI; if (lp->ctl_rspeed == 10) - ethtool_cmd_speed_set(cmd, SPEED_10); + cmd->base.speed = SPEED_10; else if (lp->ctl_rspeed == 100) - ethtool_cmd_speed_set(cmd, SPEED_100); + cmd->base.speed = SPEED_100; + + cmd->base.autoneg = AUTONEG_DISABLE; + cmd->base.port = 0; + cmd->base.duplex = lp->tcr_cur_mode & TCR_SWFDUP ? + DUPLEX_FULL : DUPLEX_HALF; - cmd->autoneg = AUTONEG_DISABLE; - cmd->transceiver = XCVR_INTERNAL; - cmd->port = 0; - cmd->duplex = lp->tcr_cur_mode & TCR_SWFDUP ? DUPLEX_FULL : DUPLEX_HALF; + ethtool_convert_legacy_u32_to_link_mode( + cmd->link_modes.supported, supported); ret = 0; } @@ -1569,24 +1570,26 @@ smc_ethtool_getsettings(struct net_device *dev, struct ethtool_cmd *cmd) } static int -smc_ethtool_setsettings(struct net_device *dev, struct ethtool_cmd *cmd) +smc_ethtool_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) { struct smc_local *lp = netdev_priv(dev); int ret; if (lp->phy_type != 0) { spin_lock_irq(&lp->lock); - ret = mii_ethtool_sset(&lp->mii, cmd); + ret = mii_ethtool_set_link_ksettings(&lp->mii, cmd); spin_unlock_irq(&lp->lock); } else { - if (cmd->autoneg != AUTONEG_DISABLE || - cmd->speed != SPEED_10 || - (cmd->duplex != DUPLEX_HALF && cmd->duplex != DUPLEX_FULL) || - (cmd->port != PORT_TP && cmd->port != PORT_AUI)) + if (cmd->base.autoneg != AUTONEG_DISABLE || + cmd->base.speed != SPEED_10 || + (cmd->base.duplex != DUPLEX_HALF && + cmd->base.duplex != DUPLEX_FULL) || + (cmd->base.port != PORT_TP && cmd->base.port != PORT_AUI)) return -EINVAL; -// lp->port = cmd->port; - lp->ctl_rfduplx = cmd->duplex == DUPLEX_FULL; +// lp->port = cmd->base.port; + lp->ctl_rfduplx = cmd->base.duplex == DUPLEX_FULL; // if (netif_running(dev)) // smc_set_port(dev); @@ -1744,8 +1747,6 @@ static int smc_ethtool_seteeprom(struct net_device *dev, static const struct ethtool_ops smc_ethtool_ops = { - .get_settings = smc_ethtool_getsettings, - .set_settings = smc_ethtool_setsettings, .get_drvinfo = smc_ethtool_getdrvinfo, .get_msglevel = smc_ethtool_getmsglevel, @@ -1755,6 +1756,8 @@ static const struct ethtool_ops smc_ethtool_ops = { .get_eeprom_len = smc_ethtool_geteeprom_len, .get_eeprom = smc_ethtool_geteeprom, .set_eeprom = smc_ethtool_seteeprom, + .get_link_ksettings = smc_ethtool_get_link_ksettings, + .set_link_ksettings = smc_ethtool_set_link_ksettings, }; static const struct net_device_ops smc_netdev_ops = { diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index d3e73ac158ae..f9f3dba7a588 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -700,6 +700,8 @@ struct net_device_context { u32 tx_checksum_mask; + u32 tx_send_table[VRSS_SEND_TAB_SIZE]; + /* Ethtool settings */ u8 duplex; u32 speed; @@ -757,7 +759,6 @@ struct netvsc_device { struct nvsp_message revoke_packet; - u32 send_table[VRSS_SEND_TAB_SIZE]; u32 max_chn; u32 num_chn; spinlock_t sc_lock; /* Protects num_sc_offered variable */ diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index d35ebd993b38..4c1d8cca247b 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1136,15 +1136,11 @@ static void netvsc_receive(struct net_device *ndev, static void netvsc_send_table(struct hv_device *hdev, struct nvsp_message *nvmsg) { - struct netvsc_device *nvscdev; struct net_device *ndev = hv_get_drvdata(hdev); + struct net_device_context *net_device_ctx = netdev_priv(ndev); int i; u32 count, *tab; - nvscdev = get_outbound_net_device(hdev); - if (!nvscdev) - return; - count = nvmsg->msg.v5_msg.send_table.count; if (count != VRSS_SEND_TAB_SIZE) { netdev_err(ndev, "Received wrong send-table size:%u\n", count); @@ -1155,7 +1151,7 @@ static void netvsc_send_table(struct hv_device *hdev, nvmsg->msg.v5_msg.send_table.offset); for (i = 0; i < count; i++) - nvscdev->send_table[i] = tab[i]; + net_device_ctx->tx_send_table[i] = tab[i]; } static void netvsc_send_vf(struct net_device_context *net_device_ctx, diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index bc05c895d958..5ede87f30463 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -206,17 +206,15 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, void *accel_priv, select_queue_fallback_t fallback) { struct net_device_context *net_device_ctx = netdev_priv(ndev); - struct netvsc_device *nvsc_dev = net_device_ctx->nvdev; + unsigned int num_tx_queues = ndev->real_num_tx_queues; struct sock *sk = skb->sk; int q_idx = sk_tx_queue_get(sk); - if (q_idx < 0 || skb->ooo_okay || - q_idx >= ndev->real_num_tx_queues) { + if (q_idx < 0 || skb->ooo_okay || q_idx >= num_tx_queues) { u16 hash = __skb_tx_hash(ndev, skb, VRSS_SEND_TAB_SIZE); int new_idx; - new_idx = nvsc_dev->send_table[hash] - % nvsc_dev->num_chn; + new_idx = net_device_ctx->tx_send_table[hash] % num_tx_queues; if (q_idx != new_idx && sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) @@ -225,9 +223,6 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, q_idx = new_idx; } - if (unlikely(!nvsc_dev->chan_table[q_idx].channel)) - q_idx = 0; - return q_idx; } diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index f9d0fa315a47..272b051a0199 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1883,17 +1883,6 @@ static int m88e1510_probe(struct phy_device *phydev) return m88e1510_hwmon_probe(phydev); } -static void marvell_remove(struct phy_device *phydev) -{ -#ifdef CONFIG_HWMON - - struct marvell_priv *priv = phydev->priv; - - if (priv && priv->hwmon_dev) - hwmon_device_unregister(priv->hwmon_dev); -#endif -} - static struct phy_driver marvell_drivers[] = { { .phy_id = MARVELL_PHY_ID_88E1101, @@ -1974,7 +1963,6 @@ static struct phy_driver marvell_drivers[] = { .features = PHY_GBIT_FEATURES, .flags = PHY_HAS_INTERRUPT, .probe = &m88e1121_probe, - .remove = &marvell_remove, .config_init = &m88e1121_config_init, .config_aneg = &m88e1121_config_aneg, .read_status = &marvell_read_status, @@ -2087,7 +2075,6 @@ static struct phy_driver marvell_drivers[] = { .features = PHY_GBIT_FEATURES | SUPPORTED_FIBRE, .flags = PHY_HAS_INTERRUPT, .probe = &m88e1510_probe, - .remove = &marvell_remove, .config_init = &m88e1510_config_init, .config_aneg = &m88e1510_config_aneg, .read_status = &marvell_read_status, @@ -2109,7 +2096,6 @@ static struct phy_driver marvell_drivers[] = { .features = PHY_GBIT_FEATURES, .flags = PHY_HAS_INTERRUPT, .probe = m88e1510_probe, - .remove = &marvell_remove, .config_init = &marvell_config_init, .config_aneg = &m88e1510_config_aneg, .read_status = &marvell_read_status, @@ -2127,7 +2113,6 @@ static struct phy_driver marvell_drivers[] = { .phy_id_mask = MARVELL_PHY_ID_MASK, .name = "Marvell 88E1545", .probe = m88e1510_probe, - .remove = &marvell_remove, .features = PHY_GBIT_FEATURES, .flags = PHY_HAS_INTERRUPT, .config_init = &marvell_config_init, diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index daec6555f3b1..5198ccfa347f 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1864,7 +1864,7 @@ static struct phy_driver genphy_driver[] = { .phy_id = 0xffffffff, .phy_id_mask = 0xffffffff, .name = "Generic PHY", - .soft_reset = genphy_soft_reset, + .soft_reset = genphy_no_soft_reset, .config_init = genphy_config_init, .features = PHY_GBIT_FEATURES | SUPPORTED_MII | SUPPORTED_AUI | SUPPORTED_FIBRE | diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c index 93ffedfa2994..1e2d4f1179da 100644 --- a/drivers/net/phy/spi_ks8995.c +++ b/drivers/net/phy/spi_ks8995.c @@ -491,13 +491,14 @@ static int ks8995_probe(struct spi_device *spi) if (err) return err; - ks->regs_attr.size = ks->chip->regs_size; memcpy(&ks->regs_attr, &ks8995_registers_attr, sizeof(ks->regs_attr)); + ks->regs_attr.size = ks->chip->regs_size; err = ks8995_reset(ks); if (err) return err; + sysfs_attr_init(&ks->regs_attr.attr); err = sysfs_create_bin_file(&spi->dev.kobj, &ks->regs_attr); if (err) { dev_err(&spi->dev, "unable to create sysfs file, err=%d\n", diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 4a24b5d15f5a..1b52520715ae 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2072,6 +2072,7 @@ static int team_dev_type_check_change(struct net_device *dev, static void team_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = ETH_MAX_MTU; dev->netdev_ops = &team_netdev_ops; dev->ethtool_ops = &team_ethtool_ops; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index dc1b1dd9157c..34cc3c590aa5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -822,7 +822,18 @@ static void tun_net_uninit(struct net_device *dev) /* Net device open. */ static int tun_net_open(struct net_device *dev) { + struct tun_struct *tun = netdev_priv(dev); + int i; + netif_tx_start_all_queues(dev); + + for (i = 0; i < tun->numqueues; i++) { + struct tun_file *tfile; + + tfile = rtnl_dereference(tun->tfiles[i]); + tfile->socket.sk->sk_write_space(tfile->socket.sk); + } + return 0; } @@ -1103,9 +1114,10 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) if (!skb_array_empty(&tfile->tx_array)) mask |= POLLIN | POLLRDNORM; - if (sock_writeable(sk) || - (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && - sock_writeable(sk))) + if (tun->dev->flags & IFF_UP && + (sock_writeable(sk) || + (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && + sock_writeable(sk)))) mask |= POLLOUT | POLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) @@ -2570,7 +2582,6 @@ static int __init tun_init(void) int ret = 0; pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION); - pr_info("%s\n", DRV_COPYRIGHT); ret = rtnl_link_register(&tun_link_ops); if (ret) { diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 22379da63400..fea687f35b5a 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -340,6 +340,7 @@ static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { + int len = skb->len; netdev_tx_t ret = is_ip_tx_frame(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { @@ -347,7 +348,7 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) u64_stats_update_begin(&dstats->syncp); dstats->tx_pkts++; - dstats->tx_bytes += skb->len; + dstats->tx_bytes += len; u64_stats_update_end(&dstats->syncp); } else { this_cpu_inc(dev->dstats->tx_drps); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e375560cc74e..bdb6ae16d4a8 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2976,6 +2976,44 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, return 0; } +static int __vxlan_dev_create(struct net *net, struct net_device *dev, + struct vxlan_config *conf) +{ + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_dev *vxlan = netdev_priv(dev); + int err; + + err = vxlan_dev_configure(net, dev, conf, false); + if (err) + return err; + + dev->ethtool_ops = &vxlan_ethtool_ops; + + /* create an fdb entry for a valid default destination */ + if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { + err = vxlan_fdb_create(vxlan, all_zeros_mac, + &vxlan->default_dst.remote_ip, + NUD_REACHABLE | NUD_PERMANENT, + NLM_F_EXCL | NLM_F_CREATE, + vxlan->cfg.dst_port, + vxlan->default_dst.remote_vni, + vxlan->default_dst.remote_vni, + vxlan->default_dst.remote_ifindex, + NTF_SELF); + if (err) + return err; + } + + err = register_netdevice(dev); + if (err) { + vxlan_fdb_delete_default(vxlan, vxlan->default_dst.remote_vni); + return err; + } + + list_add(&vxlan->next, &vn->vxlan_list); + return 0; +} + static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], struct net_device *dev, struct vxlan_config *conf, bool changelink) @@ -3172,8 +3210,6 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], static int vxlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); - struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_config conf; int err; @@ -3181,36 +3217,7 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, if (err) return err; - err = vxlan_dev_configure(src_net, dev, &conf, false); - if (err) - return err; - - dev->ethtool_ops = &vxlan_ethtool_ops; - - /* create an fdb entry for a valid default destination */ - if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { - err = vxlan_fdb_create(vxlan, all_zeros_mac, - &vxlan->default_dst.remote_ip, - NUD_REACHABLE | NUD_PERMANENT, - NLM_F_EXCL | NLM_F_CREATE, - vxlan->cfg.dst_port, - vxlan->default_dst.remote_vni, - vxlan->default_dst.remote_vni, - vxlan->default_dst.remote_ifindex, - NTF_SELF); - if (err) - return err; - } - - err = register_netdevice(dev); - if (err) { - vxlan_fdb_delete_default(vxlan, vxlan->default_dst.remote_vni); - return err; - } - - list_add(&vxlan->next, &vn->vxlan_list); - - return 0; + return __vxlan_dev_create(src_net, dev, &conf); } static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], @@ -3440,7 +3447,7 @@ struct net_device *vxlan_dev_create(struct net *net, const char *name, if (IS_ERR(dev)) return dev; - err = vxlan_dev_configure(net, dev, conf, false); + err = __vxlan_dev_create(net, dev, conf); if (err < 0) { free_netdev(dev); return ERR_PTR(err); diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index a5045b5279d7..6742ae605660 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -381,8 +381,8 @@ static netdev_tx_t ucc_hdlc_tx(struct sk_buff *skb, struct net_device *dev) /* set bd status and length */ bd_status = (bd_status & T_W_S) | T_R_S | T_I_S | T_L_S | T_TC_S; - iowrite16be(bd_status, &bd->status); iowrite16be(skb->len, &bd->length); + iowrite16be(bd_status, &bd->status); /* Move to next BD in the ring */ if (!(bd_status & T_W_S)) @@ -457,7 +457,7 @@ static int hdlc_rx_done(struct ucc_hdlc_private *priv, int rx_work_limit) struct sk_buff *skb; hdlc_device *hdlc = dev_to_hdlc(dev); struct qe_bd *bd; - u32 bd_status; + u16 bd_status; u16 length, howmany = 0; u8 *bdbuffer; int i; diff --git a/drivers/net/wimax/i2400m/usb.c b/drivers/net/wimax/i2400m/usb.c index e7f5910a6519..f8eb66ef2944 100644 --- a/drivers/net/wimax/i2400m/usb.c +++ b/drivers/net/wimax/i2400m/usb.c @@ -467,6 +467,9 @@ int i2400mu_probe(struct usb_interface *iface, struct i2400mu *i2400mu; struct usb_device *usb_dev = interface_to_usbdev(iface); + if (iface->cur_altsetting->desc.bNumEndpoints < 4) + return -ENODEV; + if (usb_dev->speed != USB_SPEED_HIGH) dev_err(dev, "device not connected as high speed\n"); diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 829b26cd4549..8397f6c92451 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -165,13 +165,17 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct xenvif *vif = netdev_priv(dev); struct xenvif_queue *queue = NULL; - unsigned int num_queues = vif->num_queues; + unsigned int num_queues; u16 index; struct xenvif_rx_cb *cb; BUG_ON(skb->dev != dev); - /* Drop the packet if queues are not set up */ + /* Drop the packet if queues are not set up. + * This handler should be called inside an RCU read section + * so we don't need to enter it here explicitly. + */ + num_queues = READ_ONCE(vif->num_queues); if (num_queues < 1) goto drop; @@ -222,18 +226,18 @@ static struct net_device_stats *xenvif_get_stats(struct net_device *dev) { struct xenvif *vif = netdev_priv(dev); struct xenvif_queue *queue = NULL; + unsigned int num_queues; u64 rx_bytes = 0; u64 rx_packets = 0; u64 tx_bytes = 0; u64 tx_packets = 0; unsigned int index; - spin_lock(&vif->lock); - if (vif->queues == NULL) - goto out; + rcu_read_lock(); + num_queues = READ_ONCE(vif->num_queues); /* Aggregate tx and rx stats from each queue */ - for (index = 0; index < vif->num_queues; ++index) { + for (index = 0; index < num_queues; ++index) { queue = &vif->queues[index]; rx_bytes += queue->stats.rx_bytes; rx_packets += queue->stats.rx_packets; @@ -241,8 +245,7 @@ static struct net_device_stats *xenvif_get_stats(struct net_device *dev) tx_packets += queue->stats.tx_packets; } -out: - spin_unlock(&vif->lock); + rcu_read_unlock(); vif->dev->stats.rx_bytes = rx_bytes; vif->dev->stats.rx_packets = rx_packets; @@ -378,10 +381,13 @@ static void xenvif_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 * data) { struct xenvif *vif = netdev_priv(dev); - unsigned int num_queues = vif->num_queues; + unsigned int num_queues; int i; unsigned int queue_index; + rcu_read_lock(); + num_queues = READ_ONCE(vif->num_queues); + for (i = 0; i < ARRAY_SIZE(xenvif_stats); i++) { unsigned long accum = 0; for (queue_index = 0; queue_index < num_queues; ++queue_index) { @@ -390,6 +396,8 @@ static void xenvif_get_ethtool_stats(struct net_device *dev, } data[i] = accum; } + + rcu_read_unlock(); } static void xenvif_get_strings(struct net_device *dev, u32 stringset, u8 * data) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index f9bcf4a665bc..602d408fa25e 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -214,7 +214,7 @@ static void xenvif_fatal_tx_err(struct xenvif *vif) netdev_err(vif->dev, "fatal error; disabling device\n"); vif->disabled = true; /* Disable the vif from queue 0's kthread */ - if (vif->queues) + if (vif->num_queues) xenvif_kick_thread(&vif->queues[0]); } diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index d2d7cd9145b1..a56d3eab35dd 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -495,26 +495,26 @@ static void backend_disconnect(struct backend_info *be) struct xenvif *vif = be->vif; if (vif) { + unsigned int num_queues = vif->num_queues; unsigned int queue_index; - struct xenvif_queue *queues; xen_unregister_watchers(vif); #ifdef CONFIG_DEBUG_FS xenvif_debugfs_delif(vif); #endif /* CONFIG_DEBUG_FS */ xenvif_disconnect_data(vif); - for (queue_index = 0; - queue_index < vif->num_queues; - ++queue_index) - xenvif_deinit_queue(&vif->queues[queue_index]); - spin_lock(&vif->lock); - queues = vif->queues; + /* At this point some of the handlers may still be active + * so we need to have additional synchronization here. + */ vif->num_queues = 0; - vif->queues = NULL; - spin_unlock(&vif->lock); + synchronize_net(); - vfree(queues); + for (queue_index = 0; queue_index < num_queues; ++queue_index) + xenvif_deinit_queue(&vif->queues[queue_index]); + + vfree(vif->queues); + vif->queues = NULL; xenvif_disconnect_ctrl(vif); } diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index 5be4783e40d4..dea98ffb6f60 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -103,15 +103,6 @@ static struct quirk_entry quirk_asus_x200ca = { .wapf = 2, }; -static struct quirk_entry quirk_no_rfkill = { - .no_rfkill = true, -}; - -static struct quirk_entry quirk_no_rfkill_wapf4 = { - .wapf = 4, - .no_rfkill = true, -}; - static struct quirk_entry quirk_asus_ux303ub = { .wmi_backlight_native = true, }; @@ -194,7 +185,7 @@ static const struct dmi_system_id asus_quirks[] = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), DMI_MATCH(DMI_PRODUCT_NAME, "X456UA"), }, - .driver_data = &quirk_no_rfkill_wapf4, + .driver_data = &quirk_asus_wapf4, }, { .callback = dmi_matched, @@ -203,7 +194,7 @@ static const struct dmi_system_id asus_quirks[] = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), DMI_MATCH(DMI_PRODUCT_NAME, "X456UF"), }, - .driver_data = &quirk_no_rfkill_wapf4, + .driver_data = &quirk_asus_wapf4, }, { .callback = dmi_matched, @@ -369,42 +360,6 @@ static const struct dmi_system_id asus_quirks[] = { }, { .callback = dmi_matched, - .ident = "ASUSTeK COMPUTER INC. X555UB", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_MATCH(DMI_PRODUCT_NAME, "X555UB"), - }, - .driver_data = &quirk_no_rfkill, - }, - { - .callback = dmi_matched, - .ident = "ASUSTeK COMPUTER INC. N552VW", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_MATCH(DMI_PRODUCT_NAME, "N552VW"), - }, - .driver_data = &quirk_no_rfkill, - }, - { - .callback = dmi_matched, - .ident = "ASUSTeK COMPUTER INC. U303LB", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_MATCH(DMI_PRODUCT_NAME, "U303LB"), - }, - .driver_data = &quirk_no_rfkill, - }, - { - .callback = dmi_matched, - .ident = "ASUSTeK COMPUTER INC. Z550MA", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), - DMI_MATCH(DMI_PRODUCT_NAME, "Z550MA"), - }, - .driver_data = &quirk_no_rfkill, - }, - { - .callback = dmi_matched, .ident = "ASUSTeK COMPUTER INC. UX303UB", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 43cb680adbb4..8fe5890bf539 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -159,6 +159,8 @@ MODULE_LICENSE("GPL"); #define USB_INTEL_XUSB2PR 0xD0 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 +static const char * const ashs_ids[] = { "ATK4001", "ATK4002", NULL }; + struct bios_args { u32 arg0; u32 arg1; @@ -2051,6 +2053,16 @@ static int asus_wmi_fan_init(struct asus_wmi *asus) return 0; } +static bool ashs_present(void) +{ + int i = 0; + while (ashs_ids[i]) { + if (acpi_dev_found(ashs_ids[i++])) + return true; + } + return false; +} + /* * WMI Driver */ @@ -2095,7 +2107,11 @@ static int asus_wmi_add(struct platform_device *pdev) if (err) goto fail_leds; - if (!asus->driver->quirks->no_rfkill) { + asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_WLAN, &result); + if (result & (ASUS_WMI_DSTS_PRESENCE_BIT | ASUS_WMI_DSTS_USER_BIT)) + asus->driver->wlan_ctrl_by_user = 1; + + if (!(asus->driver->wlan_ctrl_by_user && ashs_present())) { err = asus_wmi_rfkill_init(asus); if (err) goto fail_rfkill; @@ -2134,10 +2150,6 @@ static int asus_wmi_add(struct platform_device *pdev) if (err) goto fail_debugfs; - asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_WLAN, &result); - if (result & (ASUS_WMI_DSTS_PRESENCE_BIT | ASUS_WMI_DSTS_USER_BIT)) - asus->driver->wlan_ctrl_by_user = 1; - return 0; fail_debugfs: diff --git a/drivers/platform/x86/asus-wmi.h b/drivers/platform/x86/asus-wmi.h index fdff626c3b51..c9589d9342bb 100644 --- a/drivers/platform/x86/asus-wmi.h +++ b/drivers/platform/x86/asus-wmi.h @@ -39,7 +39,6 @@ struct key_entry; struct asus_wmi; struct quirk_entry { - bool no_rfkill; bool hotplug_wireless; bool scalar_panel_brightness; bool store_backlight_power; diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index 2b218b1d13e5..e12cc3504d48 100644 --- a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -78,18 +78,18 @@ #define FUJITSU_LCD_N_LEVELS 8 -#define ACPI_FUJITSU_CLASS "fujitsu" -#define ACPI_FUJITSU_HID "FUJ02B1" -#define ACPI_FUJITSU_DRIVER_NAME "Fujitsu laptop FUJ02B1 ACPI brightness driver" -#define ACPI_FUJITSU_DEVICE_NAME "Fujitsu FUJ02B1" -#define ACPI_FUJITSU_HOTKEY_HID "FUJ02E3" -#define ACPI_FUJITSU_HOTKEY_DRIVER_NAME "Fujitsu laptop FUJ02E3 ACPI hotkeys driver" -#define ACPI_FUJITSU_HOTKEY_DEVICE_NAME "Fujitsu FUJ02E3" +#define ACPI_FUJITSU_CLASS "fujitsu" +#define ACPI_FUJITSU_BL_HID "FUJ02B1" +#define ACPI_FUJITSU_BL_DRIVER_NAME "Fujitsu laptop FUJ02B1 ACPI brightness driver" +#define ACPI_FUJITSU_BL_DEVICE_NAME "Fujitsu FUJ02B1" +#define ACPI_FUJITSU_LAPTOP_HID "FUJ02E3" +#define ACPI_FUJITSU_LAPTOP_DRIVER_NAME "Fujitsu laptop FUJ02E3 ACPI hotkeys driver" +#define ACPI_FUJITSU_LAPTOP_DEVICE_NAME "Fujitsu FUJ02E3" #define ACPI_FUJITSU_NOTIFY_CODE1 0x80 /* FUNC interface - command values */ -#define FUNC_RFKILL 0x1000 +#define FUNC_FLAGS 0x1000 #define FUNC_LEDS 0x1001 #define FUNC_BUTTONS 0x1002 #define FUNC_BACKLIGHT 0x1004 @@ -97,6 +97,11 @@ /* FUNC interface - responses */ #define UNSUPPORTED_CMD 0x80000000 +/* FUNC interface - status flags */ +#define FLAG_RFKILL 0x020 +#define FLAG_LID 0x100 +#define FLAG_DOCK 0x200 + #if IS_ENABLED(CONFIG_LEDS_CLASS) /* FUNC interface - LED control */ #define FUNC_LED_OFF 0x1 @@ -136,7 +141,7 @@ #endif /* Device controlling the backlight and associated keys */ -struct fujitsu_t { +struct fujitsu_bl { acpi_handle acpi_handle; struct acpi_device *dev; struct input_dev *input; @@ -150,12 +155,12 @@ struct fujitsu_t { unsigned int brightness_level; }; -static struct fujitsu_t *fujitsu; +static struct fujitsu_bl *fujitsu_bl; static int use_alt_lcd_levels = -1; static int disable_brightness_adjust = -1; -/* Device used to access other hotkeys on the laptop */ -struct fujitsu_hotkey_t { +/* Device used to access hotkeys and other features on the laptop */ +struct fujitsu_laptop { acpi_handle acpi_handle; struct acpi_device *dev; struct input_dev *input; @@ -163,17 +168,15 @@ struct fujitsu_hotkey_t { struct platform_device *pf_device; struct kfifo fifo; spinlock_t fifo_lock; - int rfkill_supported; - int rfkill_state; + int flags_supported; + int flags_state; int logolamp_registered; int kblamps_registered; int radio_led_registered; int eco_led_registered; }; -static struct fujitsu_hotkey_t *fujitsu_hotkey; - -static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event); +static struct fujitsu_laptop *fujitsu_laptop; #if IS_ENABLED(CONFIG_LEDS_CLASS) static enum led_brightness logolamp_get(struct led_classdev *cdev); @@ -222,8 +225,6 @@ static struct led_classdev eco_led = { static u32 dbg_level = 0x03; #endif -static void acpi_fujitsu_notify(struct acpi_device *device, u32 event); - /* Fujitsu ACPI interface function */ static int call_fext_func(int cmd, int arg0, int arg1, int arg2) @@ -239,7 +240,7 @@ static int call_fext_func(int cmd, int arg0, int arg1, int arg2) unsigned long long value; acpi_handle handle = NULL; - status = acpi_get_handle(fujitsu_hotkey->acpi_handle, "FUNC", &handle); + status = acpi_get_handle(fujitsu_laptop->acpi_handle, "FUNC", &handle); if (ACPI_FAILURE(status)) { vdbg_printk(FUJLAPTOP_DBG_ERROR, "FUNC interface is not present\n"); @@ -300,9 +301,9 @@ static int radio_led_set(struct led_classdev *cdev, enum led_brightness brightness) { if (brightness >= LED_FULL) - return call_fext_func(FUNC_RFKILL, 0x5, RADIO_LED_ON, RADIO_LED_ON); + return call_fext_func(FUNC_FLAGS, 0x5, RADIO_LED_ON, RADIO_LED_ON); else - return call_fext_func(FUNC_RFKILL, 0x5, RADIO_LED_ON, 0x0); + return call_fext_func(FUNC_FLAGS, 0x5, RADIO_LED_ON, 0x0); } static int eco_led_set(struct led_classdev *cdev, @@ -346,7 +347,7 @@ static enum led_brightness radio_led_get(struct led_classdev *cdev) { enum led_brightness brightness = LED_OFF; - if (call_fext_func(FUNC_RFKILL, 0x4, 0x0, 0x0) & RADIO_LED_ON) + if (call_fext_func(FUNC_FLAGS, 0x4, 0x0, 0x0) & RADIO_LED_ON) brightness = LED_FULL; return brightness; @@ -373,10 +374,10 @@ static int set_lcd_level(int level) vdbg_printk(FUJLAPTOP_DBG_TRACE, "set lcd level via SBLL [%d]\n", level); - if (level < 0 || level >= fujitsu->max_brightness) + if (level < 0 || level >= fujitsu_bl->max_brightness) return -EINVAL; - status = acpi_get_handle(fujitsu->acpi_handle, "SBLL", &handle); + status = acpi_get_handle(fujitsu_bl->acpi_handle, "SBLL", &handle); if (ACPI_FAILURE(status)) { vdbg_printk(FUJLAPTOP_DBG_ERROR, "SBLL not present\n"); return -ENODEV; @@ -398,10 +399,10 @@ static int set_lcd_level_alt(int level) vdbg_printk(FUJLAPTOP_DBG_TRACE, "set lcd level via SBL2 [%d]\n", level); - if (level < 0 || level >= fujitsu->max_brightness) + if (level < 0 || level >= fujitsu_bl->max_brightness) return -EINVAL; - status = acpi_get_handle(fujitsu->acpi_handle, "SBL2", &handle); + status = acpi_get_handle(fujitsu_bl->acpi_handle, "SBL2", &handle); if (ACPI_FAILURE(status)) { vdbg_printk(FUJLAPTOP_DBG_ERROR, "SBL2 not present\n"); return -ENODEV; @@ -421,19 +422,19 @@ static int get_lcd_level(void) vdbg_printk(FUJLAPTOP_DBG_TRACE, "get lcd level via GBLL\n"); - status = - acpi_evaluate_integer(fujitsu->acpi_handle, "GBLL", NULL, &state); + status = acpi_evaluate_integer(fujitsu_bl->acpi_handle, "GBLL", NULL, + &state); if (ACPI_FAILURE(status)) return 0; - fujitsu->brightness_level = state & 0x0fffffff; + fujitsu_bl->brightness_level = state & 0x0fffffff; if (state & 0x80000000) - fujitsu->brightness_changed = 1; + fujitsu_bl->brightness_changed = 1; else - fujitsu->brightness_changed = 0; + fujitsu_bl->brightness_changed = 0; - return fujitsu->brightness_level; + return fujitsu_bl->brightness_level; } static int get_max_brightness(void) @@ -443,14 +444,14 @@ static int get_max_brightness(void) vdbg_printk(FUJLAPTOP_DBG_TRACE, "get max lcd level via RBLL\n"); - status = - acpi_evaluate_integer(fujitsu->acpi_handle, "RBLL", NULL, &state); + status = acpi_evaluate_integer(fujitsu_bl->acpi_handle, "RBLL", NULL, + &state); if (ACPI_FAILURE(status)) return -1; - fujitsu->max_brightness = state; + fujitsu_bl->max_brightness = state; - return fujitsu->max_brightness; + return fujitsu_bl->max_brightness; } /* Backlight device stuff */ @@ -483,7 +484,7 @@ static int bl_update_status(struct backlight_device *b) return ret; } -static const struct backlight_ops fujitsubl_ops = { +static const struct backlight_ops fujitsu_bl_ops = { .get_brightness = bl_get_brightness, .update_status = bl_update_status, }; @@ -511,7 +512,7 @@ show_brightness_changed(struct device *dev, int ret; - ret = fujitsu->brightness_changed; + ret = fujitsu_bl->brightness_changed; if (ret < 0) return ret; @@ -539,7 +540,7 @@ static ssize_t store_lcd_level(struct device *dev, int level, ret; if (sscanf(buf, "%i", &level) != 1 - || (level < 0 || level >= fujitsu->max_brightness)) + || (level < 0 || level >= fujitsu_bl->max_brightness)) return -EINVAL; if (use_alt_lcd_levels) @@ -567,9 +568,9 @@ static ssize_t show_lid_state(struct device *dev, struct device_attribute *attr, char *buf) { - if (!(fujitsu_hotkey->rfkill_supported & 0x100)) + if (!(fujitsu_laptop->flags_supported & FLAG_LID)) return sprintf(buf, "unknown\n"); - if (fujitsu_hotkey->rfkill_state & 0x100) + if (fujitsu_laptop->flags_state & FLAG_LID) return sprintf(buf, "open\n"); else return sprintf(buf, "closed\n"); @@ -579,9 +580,9 @@ static ssize_t show_dock_state(struct device *dev, struct device_attribute *attr, char *buf) { - if (!(fujitsu_hotkey->rfkill_supported & 0x200)) + if (!(fujitsu_laptop->flags_supported & FLAG_DOCK)) return sprintf(buf, "unknown\n"); - if (fujitsu_hotkey->rfkill_state & 0x200) + if (fujitsu_laptop->flags_state & FLAG_DOCK) return sprintf(buf, "docked\n"); else return sprintf(buf, "undocked\n"); @@ -591,9 +592,9 @@ static ssize_t show_radios_state(struct device *dev, struct device_attribute *attr, char *buf) { - if (!(fujitsu_hotkey->rfkill_supported & 0x20)) + if (!(fujitsu_laptop->flags_supported & FLAG_RFKILL)) return sprintf(buf, "unknown\n"); - if (fujitsu_hotkey->rfkill_state & 0x20) + if (fujitsu_laptop->flags_state & FLAG_RFKILL) return sprintf(buf, "on\n"); else return sprintf(buf, "killed\n"); @@ -607,7 +608,7 @@ static DEVICE_ATTR(lid, 0444, show_lid_state, ignore_store); static DEVICE_ATTR(dock, 0444, show_dock_state, ignore_store); static DEVICE_ATTR(radios, 0444, show_radios_state, ignore_store); -static struct attribute *fujitsupf_attributes[] = { +static struct attribute *fujitsu_pf_attributes[] = { &dev_attr_brightness_changed.attr, &dev_attr_max_brightness.attr, &dev_attr_lcd_level.attr, @@ -617,11 +618,11 @@ static struct attribute *fujitsupf_attributes[] = { NULL }; -static struct attribute_group fujitsupf_attribute_group = { - .attrs = fujitsupf_attributes +static struct attribute_group fujitsu_pf_attribute_group = { + .attrs = fujitsu_pf_attributes }; -static struct platform_driver fujitsupf_driver = { +static struct platform_driver fujitsu_pf_driver = { .driver = { .name = "fujitsu-laptop", } @@ -630,39 +631,30 @@ static struct platform_driver fujitsupf_driver = { static void __init dmi_check_cb_common(const struct dmi_system_id *id) { pr_info("Identified laptop model '%s'\n", id->ident); - if (use_alt_lcd_levels == -1) { - if (acpi_has_method(NULL, - "\\_SB.PCI0.LPCB.FJEX.SBL2")) - use_alt_lcd_levels = 1; - else - use_alt_lcd_levels = 0; - vdbg_printk(FUJLAPTOP_DBG_TRACE, "auto-detected usealt as " - "%i\n", use_alt_lcd_levels); - } } static int __init dmi_check_cb_s6410(const struct dmi_system_id *id) { dmi_check_cb_common(id); - fujitsu->keycode1 = KEY_SCREENLOCK; /* "Lock" */ - fujitsu->keycode2 = KEY_HELP; /* "Mobility Center" */ + fujitsu_bl->keycode1 = KEY_SCREENLOCK; /* "Lock" */ + fujitsu_bl->keycode2 = KEY_HELP; /* "Mobility Center" */ return 1; } static int __init dmi_check_cb_s6420(const struct dmi_system_id *id) { dmi_check_cb_common(id); - fujitsu->keycode1 = KEY_SCREENLOCK; /* "Lock" */ - fujitsu->keycode2 = KEY_HELP; /* "Mobility Center" */ + fujitsu_bl->keycode1 = KEY_SCREENLOCK; /* "Lock" */ + fujitsu_bl->keycode2 = KEY_HELP; /* "Mobility Center" */ return 1; } static int __init dmi_check_cb_p8010(const struct dmi_system_id *id) { dmi_check_cb_common(id); - fujitsu->keycode1 = KEY_HELP; /* "Support" */ - fujitsu->keycode3 = KEY_SWITCHVIDEOMODE; /* "Presentation" */ - fujitsu->keycode4 = KEY_WWW; /* "Internet" */ + fujitsu_bl->keycode1 = KEY_HELP; /* "Support" */ + fujitsu_bl->keycode3 = KEY_SWITCHVIDEOMODE; /* "Presentation" */ + fujitsu_bl->keycode4 = KEY_WWW; /* "Internet" */ return 1; } @@ -693,7 +685,7 @@ static const struct dmi_system_id fujitsu_dmi_table[] __initconst = { /* ACPI device for LCD brightness control */ -static int acpi_fujitsu_add(struct acpi_device *device) +static int acpi_fujitsu_bl_add(struct acpi_device *device) { int state = 0; struct input_dev *input; @@ -702,22 +694,22 @@ static int acpi_fujitsu_add(struct acpi_device *device) if (!device) return -EINVAL; - fujitsu->acpi_handle = device->handle; - sprintf(acpi_device_name(device), "%s", ACPI_FUJITSU_DEVICE_NAME); + fujitsu_bl->acpi_handle = device->handle; + sprintf(acpi_device_name(device), "%s", ACPI_FUJITSU_BL_DEVICE_NAME); sprintf(acpi_device_class(device), "%s", ACPI_FUJITSU_CLASS); - device->driver_data = fujitsu; + device->driver_data = fujitsu_bl; - fujitsu->input = input = input_allocate_device(); + fujitsu_bl->input = input = input_allocate_device(); if (!input) { error = -ENOMEM; goto err_stop; } - snprintf(fujitsu->phys, sizeof(fujitsu->phys), + snprintf(fujitsu_bl->phys, sizeof(fujitsu_bl->phys), "%s/video/input0", acpi_device_hid(device)); input->name = acpi_device_name(device); - input->phys = fujitsu->phys; + input->phys = fujitsu_bl->phys; input->id.bustype = BUS_HOST; input->id.product = 0x06; input->dev.parent = &device->dev; @@ -730,7 +722,7 @@ static int acpi_fujitsu_add(struct acpi_device *device) if (error) goto err_free_input_dev; - error = acpi_bus_update_power(fujitsu->acpi_handle, &state); + error = acpi_bus_update_power(fujitsu_bl->acpi_handle, &state); if (error) { pr_err("Error reading power state\n"); goto err_unregister_input_dev; @@ -740,7 +732,7 @@ static int acpi_fujitsu_add(struct acpi_device *device) acpi_device_name(device), acpi_device_bid(device), !device->power.state ? "on" : "off"); - fujitsu->dev = device; + fujitsu_bl->dev = device; if (acpi_has_method(device->handle, METHOD_NAME__INI)) { vdbg_printk(FUJLAPTOP_DBG_INFO, "Invoking _INI\n"); @@ -750,6 +742,15 @@ static int acpi_fujitsu_add(struct acpi_device *device) pr_err("_INI Method failed\n"); } + if (use_alt_lcd_levels == -1) { + if (acpi_has_method(NULL, "\\_SB.PCI0.LPCB.FJEX.SBL2")) + use_alt_lcd_levels = 1; + else + use_alt_lcd_levels = 0; + vdbg_printk(FUJLAPTOP_DBG_TRACE, "auto-detected usealt as %i\n", + use_alt_lcd_levels); + } + /* do config (detect defaults) */ use_alt_lcd_levels = use_alt_lcd_levels == 1 ? 1 : 0; disable_brightness_adjust = disable_brightness_adjust == 1 ? 1 : 0; @@ -758,7 +759,7 @@ static int acpi_fujitsu_add(struct acpi_device *device) use_alt_lcd_levels, disable_brightness_adjust); if (get_max_brightness() <= 0) - fujitsu->max_brightness = FUJITSU_LCD_N_LEVELS; + fujitsu_bl->max_brightness = FUJITSU_LCD_N_LEVELS; get_lcd_level(); return 0; @@ -772,38 +773,38 @@ err_stop: return error; } -static int acpi_fujitsu_remove(struct acpi_device *device) +static int acpi_fujitsu_bl_remove(struct acpi_device *device) { - struct fujitsu_t *fujitsu = acpi_driver_data(device); - struct input_dev *input = fujitsu->input; + struct fujitsu_bl *fujitsu_bl = acpi_driver_data(device); + struct input_dev *input = fujitsu_bl->input; input_unregister_device(input); - fujitsu->acpi_handle = NULL; + fujitsu_bl->acpi_handle = NULL; return 0; } /* Brightness notify */ -static void acpi_fujitsu_notify(struct acpi_device *device, u32 event) +static void acpi_fujitsu_bl_notify(struct acpi_device *device, u32 event) { struct input_dev *input; int keycode; int oldb, newb; - input = fujitsu->input; + input = fujitsu_bl->input; switch (event) { case ACPI_FUJITSU_NOTIFY_CODE1: keycode = 0; - oldb = fujitsu->brightness_level; + oldb = fujitsu_bl->brightness_level; get_lcd_level(); - newb = fujitsu->brightness_level; + newb = fujitsu_bl->brightness_level; vdbg_printk(FUJLAPTOP_DBG_TRACE, "brightness button event [%i -> %i (%i)]\n", - oldb, newb, fujitsu->brightness_changed); + oldb, newb, fujitsu_bl->brightness_changed); if (oldb < newb) { if (disable_brightness_adjust != 1) { @@ -840,7 +841,7 @@ static void acpi_fujitsu_notify(struct acpi_device *device, u32 event) /* ACPI device for hotkey handling */ -static int acpi_fujitsu_hotkey_add(struct acpi_device *device) +static int acpi_fujitsu_laptop_add(struct acpi_device *device) { int result = 0; int state = 0; @@ -851,42 +852,42 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) if (!device) return -EINVAL; - fujitsu_hotkey->acpi_handle = device->handle; + fujitsu_laptop->acpi_handle = device->handle; sprintf(acpi_device_name(device), "%s", - ACPI_FUJITSU_HOTKEY_DEVICE_NAME); + ACPI_FUJITSU_LAPTOP_DEVICE_NAME); sprintf(acpi_device_class(device), "%s", ACPI_FUJITSU_CLASS); - device->driver_data = fujitsu_hotkey; + device->driver_data = fujitsu_laptop; /* kfifo */ - spin_lock_init(&fujitsu_hotkey->fifo_lock); - error = kfifo_alloc(&fujitsu_hotkey->fifo, RINGBUFFERSIZE * sizeof(int), + spin_lock_init(&fujitsu_laptop->fifo_lock); + error = kfifo_alloc(&fujitsu_laptop->fifo, RINGBUFFERSIZE * sizeof(int), GFP_KERNEL); if (error) { pr_err("kfifo_alloc failed\n"); goto err_stop; } - fujitsu_hotkey->input = input = input_allocate_device(); + fujitsu_laptop->input = input = input_allocate_device(); if (!input) { error = -ENOMEM; goto err_free_fifo; } - snprintf(fujitsu_hotkey->phys, sizeof(fujitsu_hotkey->phys), + snprintf(fujitsu_laptop->phys, sizeof(fujitsu_laptop->phys), "%s/video/input0", acpi_device_hid(device)); input->name = acpi_device_name(device); - input->phys = fujitsu_hotkey->phys; + input->phys = fujitsu_laptop->phys; input->id.bustype = BUS_HOST; input->id.product = 0x06; input->dev.parent = &device->dev; set_bit(EV_KEY, input->evbit); - set_bit(fujitsu->keycode1, input->keybit); - set_bit(fujitsu->keycode2, input->keybit); - set_bit(fujitsu->keycode3, input->keybit); - set_bit(fujitsu->keycode4, input->keybit); - set_bit(fujitsu->keycode5, input->keybit); + set_bit(fujitsu_bl->keycode1, input->keybit); + set_bit(fujitsu_bl->keycode2, input->keybit); + set_bit(fujitsu_bl->keycode3, input->keybit); + set_bit(fujitsu_bl->keycode4, input->keybit); + set_bit(fujitsu_bl->keycode5, input->keybit); set_bit(KEY_TOUCHPAD_TOGGLE, input->keybit); set_bit(KEY_UNKNOWN, input->keybit); @@ -894,7 +895,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) if (error) goto err_free_input_dev; - error = acpi_bus_update_power(fujitsu_hotkey->acpi_handle, &state); + error = acpi_bus_update_power(fujitsu_laptop->acpi_handle, &state); if (error) { pr_err("Error reading power state\n"); goto err_unregister_input_dev; @@ -904,7 +905,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) acpi_device_name(device), acpi_device_bid(device), !device->power.state ? "on" : "off"); - fujitsu_hotkey->dev = device; + fujitsu_laptop->dev = device; if (acpi_has_method(device->handle, METHOD_NAME__INI)) { vdbg_printk(FUJLAPTOP_DBG_INFO, "Invoking _INI\n"); @@ -920,27 +921,27 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) ; /* No action, result is discarded */ vdbg_printk(FUJLAPTOP_DBG_INFO, "Discarded %i ringbuffer entries\n", i); - fujitsu_hotkey->rfkill_supported = - call_fext_func(FUNC_RFKILL, 0x0, 0x0, 0x0); + fujitsu_laptop->flags_supported = + call_fext_func(FUNC_FLAGS, 0x0, 0x0, 0x0); /* Make sure our bitmask of supported functions is cleared if the RFKILL function block is not implemented, like on the S7020. */ - if (fujitsu_hotkey->rfkill_supported == UNSUPPORTED_CMD) - fujitsu_hotkey->rfkill_supported = 0; + if (fujitsu_laptop->flags_supported == UNSUPPORTED_CMD) + fujitsu_laptop->flags_supported = 0; - if (fujitsu_hotkey->rfkill_supported) - fujitsu_hotkey->rfkill_state = - call_fext_func(FUNC_RFKILL, 0x4, 0x0, 0x0); + if (fujitsu_laptop->flags_supported) + fujitsu_laptop->flags_state = + call_fext_func(FUNC_FLAGS, 0x4, 0x0, 0x0); /* Suspect this is a keymap of the application panel, print it */ pr_info("BTNI: [0x%x]\n", call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0)); #if IS_ENABLED(CONFIG_LEDS_CLASS) if (call_fext_func(FUNC_LEDS, 0x0, 0x0, 0x0) & LOGOLAMP_POWERON) { - result = led_classdev_register(&fujitsu->pf_device->dev, + result = led_classdev_register(&fujitsu_bl->pf_device->dev, &logolamp_led); if (result == 0) { - fujitsu_hotkey->logolamp_registered = 1; + fujitsu_laptop->logolamp_registered = 1; } else { pr_err("Could not register LED handler for logo lamp, error %i\n", result); @@ -949,10 +950,10 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) if ((call_fext_func(FUNC_LEDS, 0x0, 0x0, 0x0) & KEYBOARD_LAMPS) && (call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0) == 0x0)) { - result = led_classdev_register(&fujitsu->pf_device->dev, + result = led_classdev_register(&fujitsu_bl->pf_device->dev, &kblamps_led); if (result == 0) { - fujitsu_hotkey->kblamps_registered = 1; + fujitsu_laptop->kblamps_registered = 1; } else { pr_err("Could not register LED handler for keyboard lamps, error %i\n", result); @@ -966,10 +967,10 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) * that an RF LED is present. */ if (call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0) & BIT(24)) { - result = led_classdev_register(&fujitsu->pf_device->dev, + result = led_classdev_register(&fujitsu_bl->pf_device->dev, &radio_led); if (result == 0) { - fujitsu_hotkey->radio_led_registered = 1; + fujitsu_laptop->radio_led_registered = 1; } else { pr_err("Could not register LED handler for radio LED, error %i\n", result); @@ -983,10 +984,10 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) */ if ((call_fext_func(FUNC_LEDS, 0x0, 0x0, 0x0) & BIT(14)) && (call_fext_func(FUNC_LEDS, 0x2, ECO_LED, 0x0) != UNSUPPORTED_CMD)) { - result = led_classdev_register(&fujitsu->pf_device->dev, + result = led_classdev_register(&fujitsu_bl->pf_device->dev, &eco_led); if (result == 0) { - fujitsu_hotkey->eco_led_registered = 1; + fujitsu_laptop->eco_led_registered = 1; } else { pr_err("Could not register LED handler for eco LED, error %i\n", result); @@ -1002,47 +1003,47 @@ err_unregister_input_dev: err_free_input_dev: input_free_device(input); err_free_fifo: - kfifo_free(&fujitsu_hotkey->fifo); + kfifo_free(&fujitsu_laptop->fifo); err_stop: return error; } -static int acpi_fujitsu_hotkey_remove(struct acpi_device *device) +static int acpi_fujitsu_laptop_remove(struct acpi_device *device) { - struct fujitsu_hotkey_t *fujitsu_hotkey = acpi_driver_data(device); - struct input_dev *input = fujitsu_hotkey->input; + struct fujitsu_laptop *fujitsu_laptop = acpi_driver_data(device); + struct input_dev *input = fujitsu_laptop->input; #if IS_ENABLED(CONFIG_LEDS_CLASS) - if (fujitsu_hotkey->logolamp_registered) + if (fujitsu_laptop->logolamp_registered) led_classdev_unregister(&logolamp_led); - if (fujitsu_hotkey->kblamps_registered) + if (fujitsu_laptop->kblamps_registered) led_classdev_unregister(&kblamps_led); - if (fujitsu_hotkey->radio_led_registered) + if (fujitsu_laptop->radio_led_registered) led_classdev_unregister(&radio_led); - if (fujitsu_hotkey->eco_led_registered) + if (fujitsu_laptop->eco_led_registered) led_classdev_unregister(&eco_led); #endif input_unregister_device(input); - kfifo_free(&fujitsu_hotkey->fifo); + kfifo_free(&fujitsu_laptop->fifo); - fujitsu_hotkey->acpi_handle = NULL; + fujitsu_laptop->acpi_handle = NULL; return 0; } -static void acpi_fujitsu_hotkey_press(int keycode) +static void acpi_fujitsu_laptop_press(int keycode) { - struct input_dev *input = fujitsu_hotkey->input; + struct input_dev *input = fujitsu_laptop->input; int status; - status = kfifo_in_locked(&fujitsu_hotkey->fifo, + status = kfifo_in_locked(&fujitsu_laptop->fifo, (unsigned char *)&keycode, sizeof(keycode), - &fujitsu_hotkey->fifo_lock); + &fujitsu_laptop->fifo_lock); if (status != sizeof(keycode)) { vdbg_printk(FUJLAPTOP_DBG_WARN, "Could not push keycode [0x%x]\n", keycode); @@ -1054,16 +1055,16 @@ static void acpi_fujitsu_hotkey_press(int keycode) "Push keycode into ringbuffer [%d]\n", keycode); } -static void acpi_fujitsu_hotkey_release(void) +static void acpi_fujitsu_laptop_release(void) { - struct input_dev *input = fujitsu_hotkey->input; + struct input_dev *input = fujitsu_laptop->input; int keycode, status; while (true) { - status = kfifo_out_locked(&fujitsu_hotkey->fifo, + status = kfifo_out_locked(&fujitsu_laptop->fifo, (unsigned char *)&keycode, sizeof(keycode), - &fujitsu_hotkey->fifo_lock); + &fujitsu_laptop->fifo_lock); if (status != sizeof(keycode)) return; input_report_key(input, keycode, 0); @@ -1073,14 +1074,14 @@ static void acpi_fujitsu_hotkey_release(void) } } -static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) +static void acpi_fujitsu_laptop_notify(struct acpi_device *device, u32 event) { struct input_dev *input; int keycode; unsigned int irb = 1; int i; - input = fujitsu_hotkey->input; + input = fujitsu_laptop->input; if (event != ACPI_FUJITSU_NOTIFY_CODE1) { keycode = KEY_UNKNOWN; @@ -1093,9 +1094,9 @@ static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) return; } - if (fujitsu_hotkey->rfkill_supported) - fujitsu_hotkey->rfkill_state = - call_fext_func(FUNC_RFKILL, 0x4, 0x0, 0x0); + if (fujitsu_laptop->flags_supported) + fujitsu_laptop->flags_state = + call_fext_func(FUNC_FLAGS, 0x4, 0x0, 0x0); i = 0; while ((irb = @@ -1103,19 +1104,19 @@ static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) && (i++) < MAX_HOTKEY_RINGBUFFER_SIZE) { switch (irb & 0x4ff) { case KEY1_CODE: - keycode = fujitsu->keycode1; + keycode = fujitsu_bl->keycode1; break; case KEY2_CODE: - keycode = fujitsu->keycode2; + keycode = fujitsu_bl->keycode2; break; case KEY3_CODE: - keycode = fujitsu->keycode3; + keycode = fujitsu_bl->keycode3; break; case KEY4_CODE: - keycode = fujitsu->keycode4; + keycode = fujitsu_bl->keycode4; break; case KEY5_CODE: - keycode = fujitsu->keycode5; + keycode = fujitsu_bl->keycode5; break; case 0: keycode = 0; @@ -1128,17 +1129,17 @@ static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) } if (keycode > 0) - acpi_fujitsu_hotkey_press(keycode); + acpi_fujitsu_laptop_press(keycode); else if (keycode == 0) - acpi_fujitsu_hotkey_release(); + acpi_fujitsu_laptop_release(); } /* On some models (first seen on the Skylake-based Lifebook * E736/E746/E756), the touchpad toggle hotkey (Fn+F4) is - * handled in software; its state is queried using FUNC_RFKILL + * handled in software; its state is queried using FUNC_FLAGS */ - if ((fujitsu_hotkey->rfkill_supported & BIT(26)) && - (call_fext_func(FUNC_RFKILL, 0x1, 0x0, 0x0) & BIT(26))) { + if ((fujitsu_laptop->flags_supported & BIT(26)) && + (call_fext_func(FUNC_FLAGS, 0x1, 0x0, 0x0) & BIT(26))) { keycode = KEY_TOUCHPAD_TOGGLE; input_report_key(input, keycode, 1); input_sync(input); @@ -1150,83 +1151,81 @@ static void acpi_fujitsu_hotkey_notify(struct acpi_device *device, u32 event) /* Initialization */ -static const struct acpi_device_id fujitsu_device_ids[] = { - {ACPI_FUJITSU_HID, 0}, +static const struct acpi_device_id fujitsu_bl_device_ids[] = { + {ACPI_FUJITSU_BL_HID, 0}, {"", 0}, }; -static struct acpi_driver acpi_fujitsu_driver = { - .name = ACPI_FUJITSU_DRIVER_NAME, +static struct acpi_driver acpi_fujitsu_bl_driver = { + .name = ACPI_FUJITSU_BL_DRIVER_NAME, .class = ACPI_FUJITSU_CLASS, - .ids = fujitsu_device_ids, + .ids = fujitsu_bl_device_ids, .ops = { - .add = acpi_fujitsu_add, - .remove = acpi_fujitsu_remove, - .notify = acpi_fujitsu_notify, + .add = acpi_fujitsu_bl_add, + .remove = acpi_fujitsu_bl_remove, + .notify = acpi_fujitsu_bl_notify, }, }; -static const struct acpi_device_id fujitsu_hotkey_device_ids[] = { - {ACPI_FUJITSU_HOTKEY_HID, 0}, +static const struct acpi_device_id fujitsu_laptop_device_ids[] = { + {ACPI_FUJITSU_LAPTOP_HID, 0}, {"", 0}, }; -static struct acpi_driver acpi_fujitsu_hotkey_driver = { - .name = ACPI_FUJITSU_HOTKEY_DRIVER_NAME, +static struct acpi_driver acpi_fujitsu_laptop_driver = { + .name = ACPI_FUJITSU_LAPTOP_DRIVER_NAME, .class = ACPI_FUJITSU_CLASS, - .ids = fujitsu_hotkey_device_ids, + .ids = fujitsu_laptop_device_ids, .ops = { - .add = acpi_fujitsu_hotkey_add, - .remove = acpi_fujitsu_hotkey_remove, - .notify = acpi_fujitsu_hotkey_notify, + .add = acpi_fujitsu_laptop_add, + .remove = acpi_fujitsu_laptop_remove, + .notify = acpi_fujitsu_laptop_notify, }, }; static const struct acpi_device_id fujitsu_ids[] __used = { - {ACPI_FUJITSU_HID, 0}, - {ACPI_FUJITSU_HOTKEY_HID, 0}, + {ACPI_FUJITSU_BL_HID, 0}, + {ACPI_FUJITSU_LAPTOP_HID, 0}, {"", 0} }; MODULE_DEVICE_TABLE(acpi, fujitsu_ids); static int __init fujitsu_init(void) { - int ret, result, max_brightness; + int ret, max_brightness; if (acpi_disabled) return -ENODEV; - fujitsu = kzalloc(sizeof(struct fujitsu_t), GFP_KERNEL); - if (!fujitsu) + fujitsu_bl = kzalloc(sizeof(struct fujitsu_bl), GFP_KERNEL); + if (!fujitsu_bl) return -ENOMEM; - fujitsu->keycode1 = KEY_PROG1; - fujitsu->keycode2 = KEY_PROG2; - fujitsu->keycode3 = KEY_PROG3; - fujitsu->keycode4 = KEY_PROG4; - fujitsu->keycode5 = KEY_RFKILL; + fujitsu_bl->keycode1 = KEY_PROG1; + fujitsu_bl->keycode2 = KEY_PROG2; + fujitsu_bl->keycode3 = KEY_PROG3; + fujitsu_bl->keycode4 = KEY_PROG4; + fujitsu_bl->keycode5 = KEY_RFKILL; dmi_check_system(fujitsu_dmi_table); - result = acpi_bus_register_driver(&acpi_fujitsu_driver); - if (result < 0) { - ret = -ENODEV; + ret = acpi_bus_register_driver(&acpi_fujitsu_bl_driver); + if (ret) goto fail_acpi; - } /* Register platform stuff */ - fujitsu->pf_device = platform_device_alloc("fujitsu-laptop", -1); - if (!fujitsu->pf_device) { + fujitsu_bl->pf_device = platform_device_alloc("fujitsu-laptop", -1); + if (!fujitsu_bl->pf_device) { ret = -ENOMEM; goto fail_platform_driver; } - ret = platform_device_add(fujitsu->pf_device); + ret = platform_device_add(fujitsu_bl->pf_device); if (ret) goto fail_platform_device1; ret = - sysfs_create_group(&fujitsu->pf_device->dev.kobj, - &fujitsupf_attribute_group); + sysfs_create_group(&fujitsu_bl->pf_device->dev.kobj, + &fujitsu_pf_attribute_group); if (ret) goto fail_platform_device2; @@ -1236,90 +1235,88 @@ static int __init fujitsu_init(void) struct backlight_properties props; memset(&props, 0, sizeof(struct backlight_properties)); - max_brightness = fujitsu->max_brightness; + max_brightness = fujitsu_bl->max_brightness; props.type = BACKLIGHT_PLATFORM; props.max_brightness = max_brightness - 1; - fujitsu->bl_device = backlight_device_register("fujitsu-laptop", - NULL, NULL, - &fujitsubl_ops, - &props); - if (IS_ERR(fujitsu->bl_device)) { - ret = PTR_ERR(fujitsu->bl_device); - fujitsu->bl_device = NULL; + fujitsu_bl->bl_device = backlight_device_register("fujitsu-laptop", + NULL, NULL, + &fujitsu_bl_ops, + &props); + if (IS_ERR(fujitsu_bl->bl_device)) { + ret = PTR_ERR(fujitsu_bl->bl_device); + fujitsu_bl->bl_device = NULL; goto fail_sysfs_group; } - fujitsu->bl_device->props.brightness = fujitsu->brightness_level; + fujitsu_bl->bl_device->props.brightness = fujitsu_bl->brightness_level; } - ret = platform_driver_register(&fujitsupf_driver); + ret = platform_driver_register(&fujitsu_pf_driver); if (ret) goto fail_backlight; - /* Register hotkey driver */ + /* Register laptop driver */ - fujitsu_hotkey = kzalloc(sizeof(struct fujitsu_hotkey_t), GFP_KERNEL); - if (!fujitsu_hotkey) { + fujitsu_laptop = kzalloc(sizeof(struct fujitsu_laptop), GFP_KERNEL); + if (!fujitsu_laptop) { ret = -ENOMEM; - goto fail_hotkey; + goto fail_laptop; } - result = acpi_bus_register_driver(&acpi_fujitsu_hotkey_driver); - if (result < 0) { - ret = -ENODEV; - goto fail_hotkey1; - } + ret = acpi_bus_register_driver(&acpi_fujitsu_laptop_driver); + if (ret) + goto fail_laptop1; /* Sync backlight power status (needs FUJ02E3 device, hence deferred) */ if (acpi_video_get_backlight_type() == acpi_backlight_vendor) { if (call_fext_func(FUNC_BACKLIGHT, 0x2, 0x4, 0x0) == 3) - fujitsu->bl_device->props.power = FB_BLANK_POWERDOWN; + fujitsu_bl->bl_device->props.power = FB_BLANK_POWERDOWN; else - fujitsu->bl_device->props.power = FB_BLANK_UNBLANK; + fujitsu_bl->bl_device->props.power = FB_BLANK_UNBLANK; } pr_info("driver " FUJITSU_DRIVER_VERSION " successfully loaded\n"); return 0; -fail_hotkey1: - kfree(fujitsu_hotkey); -fail_hotkey: - platform_driver_unregister(&fujitsupf_driver); +fail_laptop1: + kfree(fujitsu_laptop); +fail_laptop: + platform_driver_unregister(&fujitsu_pf_driver); fail_backlight: - backlight_device_unregister(fujitsu->bl_device); + backlight_device_unregister(fujitsu_bl->bl_device); fail_sysfs_group: - sysfs_remove_group(&fujitsu->pf_device->dev.kobj, - &fujitsupf_attribute_group); + sysfs_remove_group(&fujitsu_bl->pf_device->dev.kobj, + &fujitsu_pf_attribute_group); fail_platform_device2: - platform_device_del(fujitsu->pf_device); + platform_device_del(fujitsu_bl->pf_device); fail_platform_device1: - platform_device_put(fujitsu->pf_device); + platform_device_put(fujitsu_bl->pf_device); fail_platform_driver: - acpi_bus_unregister_driver(&acpi_fujitsu_driver); + acpi_bus_unregister_driver(&acpi_fujitsu_bl_driver); fail_acpi: - kfree(fujitsu); + kfree(fujitsu_bl); return ret; } static void __exit fujitsu_cleanup(void) { - acpi_bus_unregister_driver(&acpi_fujitsu_hotkey_driver); + acpi_bus_unregister_driver(&acpi_fujitsu_laptop_driver); - kfree(fujitsu_hotkey); + kfree(fujitsu_laptop); - platform_driver_unregister(&fujitsupf_driver); + platform_driver_unregister(&fujitsu_pf_driver); - backlight_device_unregister(fujitsu->bl_device); + backlight_device_unregister(fujitsu_bl->bl_device); - sysfs_remove_group(&fujitsu->pf_device->dev.kobj, - &fujitsupf_attribute_group); + sysfs_remove_group(&fujitsu_bl->pf_device->dev.kobj, + &fujitsu_pf_attribute_group); - platform_device_unregister(fujitsu->pf_device); + platform_device_unregister(fujitsu_bl->pf_device); - acpi_bus_unregister_driver(&acpi_fujitsu_driver); + acpi_bus_unregister_driver(&acpi_fujitsu_bl_driver); - kfree(fujitsu); + kfree(fujitsu_bl); pr_info("driver unloaded\n"); } @@ -1341,7 +1338,3 @@ MODULE_AUTHOR("Jonathan Woithe, Peter Gruber, Tony Vroon"); MODULE_DESCRIPTION("Fujitsu laptop extras support"); MODULE_VERSION(FUJITSU_DRIVER_VERSION); MODULE_LICENSE("GPL"); - -MODULE_ALIAS("dmi:*:svnFUJITSUSIEMENS:*:pvr:rvnFUJITSU:rnFJNB1D3:*:cvrS6410:*"); -MODULE_ALIAS("dmi:*:svnFUJITSUSIEMENS:*:pvr:rvnFUJITSU:rnFJNB1E6:*:cvrS6420:*"); -MODULE_ALIAS("dmi:*:svnFUJITSU:*:pvr:rvnFUJITSU:rnFJNB19C:*:cvrS7020:*"); diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index 438d4c72c7b3..ff563db025b3 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -54,7 +54,7 @@ __asm__(".text \n" #define Q2_SET_SEL(cpu, selname, address, size) \ do { \ - struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \ + struct desc_struct *gdt = get_cpu_gdt_rw((cpu)); \ set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \ set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ } while(0) @@ -95,8 +95,8 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3, return PNP_FUNCTION_NOT_SUPPORTED; cpu = get_cpu(); - save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8]; - get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc; + save_desc_40 = get_cpu_gdt_rw(cpu)[0x40 / 8]; + get_cpu_gdt_rw(cpu)[0x40 / 8] = bad_bios_desc; /* On some boxes IRQ's during PnP BIOS calls are deadly. */ spin_lock_irqsave(&pnp_bios_lock, flags); @@ -134,7 +134,7 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3, :"memory"); spin_unlock_irqrestore(&pnp_bios_lock, flags); - get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40; + get_cpu_gdt_rw(cpu)[0x40 / 8] = save_desc_40; put_cpu(); /* If we get here and this is set then the PnP BIOS faulted on us. */ @@ -477,7 +477,7 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header) pnp_bios_callpoint.segment = PNP_CS16; for_each_possible_cpu(i) { - struct desc_struct *gdt = get_cpu_gdt_table(i); + struct desc_struct *gdt = get_cpu_gdt_rw(i); if (!gdt) continue; set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32], diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 230043c1c90f..4bf55b5d78be 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -1241,19 +1241,32 @@ config SCSI_LPFC tristate "Emulex LightPulse Fibre Channel Support" depends on PCI && SCSI depends on SCSI_FC_ATTRS - depends on NVME_FC && NVME_TARGET_FC select CRC_T10DIF - help + ---help--- This lpfc driver supports the Emulex LightPulse Family of Fibre Channel PCI host adapters. config SCSI_LPFC_DEBUG_FS bool "Emulex LightPulse Fibre Channel debugfs Support" depends on SCSI_LPFC && DEBUG_FS - help + ---help--- This makes debugging information from the lpfc driver available via the debugfs filesystem. +config LPFC_NVME_INITIATOR + bool "Emulex LightPulse Fibre Channel NVME Initiator Support" + depends on SCSI_LPFC && NVME_FC + ---help--- + This enables NVME Initiator support in the Emulex lpfc driver. + +config LPFC_NVME_TARGET + bool "Emulex LightPulse Fibre Channel NVME Initiator Support" + depends on SCSI_LPFC && NVME_TARGET_FC + ---help--- + This enables NVME Target support in the Emulex lpfc driver. + Target enablement must still be enabled on a per adapter + basis by module parameters. + config SCSI_SIM710 tristate "Simple 53c710 SCSI support (Compaq, NCR machines)" depends on (EISA || MCA) && SCSI diff --git a/drivers/scsi/aacraid/src.c b/drivers/scsi/aacraid/src.c index 2e5338dec621..7b0410e0f569 100644 --- a/drivers/scsi/aacraid/src.c +++ b/drivers/scsi/aacraid/src.c @@ -468,7 +468,7 @@ err_out: return -1; err_blink: - return (status > 16) & 0xFF; + return (status >> 16) & 0xFF; } static inline u32 aac_get_vector(struct aac_dev *dev) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 07c08ce68d70..894b1e3ebd56 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -561,8 +561,12 @@ static void iscsi_complete_task(struct iscsi_task *task, int state) WARN_ON_ONCE(task->state == ISCSI_TASK_FREE); task->state = state; - if (!list_empty(&task->running)) + spin_lock_bh(&conn->taskqueuelock); + if (!list_empty(&task->running)) { + pr_debug_once("%s while task on list", __func__); list_del_init(&task->running); + } + spin_unlock_bh(&conn->taskqueuelock); if (conn->task == task) conn->task = NULL; @@ -784,7 +788,9 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr, if (session->tt->xmit_task(task)) goto free_task; } else { + spin_lock_bh(&conn->taskqueuelock); list_add_tail(&task->running, &conn->mgmtqueue); + spin_unlock_bh(&conn->taskqueuelock); iscsi_conn_queue_work(conn); } @@ -1475,8 +1481,10 @@ void iscsi_requeue_task(struct iscsi_task *task) * this may be on the requeue list already if the xmit_task callout * is handling the r2ts while we are adding new ones */ + spin_lock_bh(&conn->taskqueuelock); if (list_empty(&task->running)) list_add_tail(&task->running, &conn->requeue); + spin_unlock_bh(&conn->taskqueuelock); iscsi_conn_queue_work(conn); } EXPORT_SYMBOL_GPL(iscsi_requeue_task); @@ -1513,22 +1521,26 @@ static int iscsi_data_xmit(struct iscsi_conn *conn) * only have one nop-out as a ping from us and targets should not * overflow us with nop-ins */ + spin_lock_bh(&conn->taskqueuelock); check_mgmt: while (!list_empty(&conn->mgmtqueue)) { conn->task = list_entry(conn->mgmtqueue.next, struct iscsi_task, running); list_del_init(&conn->task->running); + spin_unlock_bh(&conn->taskqueuelock); if (iscsi_prep_mgmt_task(conn, conn->task)) { /* regular RX path uses back_lock */ spin_lock_bh(&conn->session->back_lock); __iscsi_put_task(conn->task); spin_unlock_bh(&conn->session->back_lock); conn->task = NULL; + spin_lock_bh(&conn->taskqueuelock); continue; } rc = iscsi_xmit_task(conn); if (rc) goto done; + spin_lock_bh(&conn->taskqueuelock); } /* process pending command queue */ @@ -1536,19 +1548,24 @@ check_mgmt: conn->task = list_entry(conn->cmdqueue.next, struct iscsi_task, running); list_del_init(&conn->task->running); + spin_unlock_bh(&conn->taskqueuelock); if (conn->session->state == ISCSI_STATE_LOGGING_OUT) { fail_scsi_task(conn->task, DID_IMM_RETRY); + spin_lock_bh(&conn->taskqueuelock); continue; } rc = iscsi_prep_scsi_cmd_pdu(conn->task); if (rc) { if (rc == -ENOMEM || rc == -EACCES) { + spin_lock_bh(&conn->taskqueuelock); list_add_tail(&conn->task->running, &conn->cmdqueue); conn->task = NULL; + spin_unlock_bh(&conn->taskqueuelock); goto done; } else fail_scsi_task(conn->task, DID_ABORT); + spin_lock_bh(&conn->taskqueuelock); continue; } rc = iscsi_xmit_task(conn); @@ -1559,6 +1576,7 @@ check_mgmt: * we need to check the mgmt queue for nops that need to * be sent to aviod starvation */ + spin_lock_bh(&conn->taskqueuelock); if (!list_empty(&conn->mgmtqueue)) goto check_mgmt; } @@ -1578,12 +1596,15 @@ check_mgmt: conn->task = task; list_del_init(&conn->task->running); conn->task->state = ISCSI_TASK_RUNNING; + spin_unlock_bh(&conn->taskqueuelock); rc = iscsi_xmit_task(conn); if (rc) goto done; + spin_lock_bh(&conn->taskqueuelock); if (!list_empty(&conn->mgmtqueue)) goto check_mgmt; } + spin_unlock_bh(&conn->taskqueuelock); spin_unlock_bh(&conn->session->frwd_lock); return -ENODATA; @@ -1739,7 +1760,9 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc) goto prepd_reject; } } else { + spin_lock_bh(&conn->taskqueuelock); list_add_tail(&task->running, &conn->cmdqueue); + spin_unlock_bh(&conn->taskqueuelock); iscsi_conn_queue_work(conn); } @@ -2897,6 +2920,7 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, INIT_LIST_HEAD(&conn->mgmtqueue); INIT_LIST_HEAD(&conn->cmdqueue); INIT_LIST_HEAD(&conn->requeue); + spin_lock_init(&conn->taskqueuelock); INIT_WORK(&conn->xmitwork, iscsi_xmitworker); /* allocate login_task used for the login/text sequences */ diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index 0bba2e30b4f0..257bbdd0f0b8 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -99,12 +99,13 @@ struct lpfc_sli2_slim; #define FC_MAX_ADPTMSG 64 #define MAX_HBAEVT 32 +#define MAX_HBAS_NO_RESET 16 /* Number of MSI-X vectors the driver uses */ #define LPFC_MSIX_VECTORS 2 /* lpfc wait event data ready flag */ -#define LPFC_DATA_READY (1<<0) +#define LPFC_DATA_READY 0 /* bit 0 */ /* queue dump line buffer size */ #define LPFC_LBUF_SZ 128 @@ -692,6 +693,7 @@ struct lpfc_hba { * capability */ #define HBA_NVME_IOQ_FLUSH 0x80000 /* NVME IO queues flushed. */ +#define NVME_XRI_ABORT_EVENT 0x100000 uint32_t fcp_ring_in_use; /* When polling test if intr-hndlr active*/ struct lpfc_dmabuf slim2p; diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 5c783ef7f260..5c3be3e6f5e2 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -3010,6 +3010,12 @@ MODULE_PARM_DESC(lpfc_poll, "FCP ring polling mode control:" static DEVICE_ATTR(lpfc_poll, S_IRUGO | S_IWUSR, lpfc_poll_show, lpfc_poll_store); +int lpfc_no_hba_reset_cnt; +unsigned long lpfc_no_hba_reset[MAX_HBAS_NO_RESET] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +module_param_array(lpfc_no_hba_reset, ulong, &lpfc_no_hba_reset_cnt, 0444); +MODULE_PARM_DESC(lpfc_no_hba_reset, "WWPN of HBAs that should not be reset"); + LPFC_ATTR(sli_mode, 0, 0, 3, "SLI mode selector:" " 0 - auto (SLI-3 if supported)," @@ -4451,7 +4457,8 @@ lpfc_fcp_imax_store(struct device *dev, struct device_attribute *attr, return -EINVAL; phba->cfg_fcp_imax = (uint32_t)val; - for (i = 0; i < phba->io_channel_irqs; i++) + + for (i = 0; i < phba->io_channel_irqs; i += LPFC_MAX_EQ_DELAY_EQID_CNT) lpfc_modify_hba_eq_delay(phba, i); return strlen(buf); diff --git a/drivers/scsi/lpfc/lpfc_crtn.h b/drivers/scsi/lpfc/lpfc_crtn.h index 843dd73004da..54e6ac42fbcd 100644 --- a/drivers/scsi/lpfc/lpfc_crtn.h +++ b/drivers/scsi/lpfc/lpfc_crtn.h @@ -384,7 +384,7 @@ void lpfc_free_sysfs_attr(struct lpfc_vport *); extern struct device_attribute *lpfc_hba_attrs[]; extern struct device_attribute *lpfc_vport_attrs[]; extern struct scsi_host_template lpfc_template; -extern struct scsi_host_template lpfc_template_s3; +extern struct scsi_host_template lpfc_template_no_hr; extern struct scsi_host_template lpfc_template_nvme; extern struct scsi_host_template lpfc_vport_template; extern struct fc_function_template lpfc_transport_functions; @@ -554,3 +554,5 @@ void lpfc_nvme_abort_fcreq_cmpl(struct lpfc_hba *phba, struct lpfc_wcqe_complete *abts_cmpl); extern int lpfc_enable_nvmet_cnt; extern unsigned long long lpfc_enable_nvmet[]; +extern int lpfc_no_hba_reset_cnt; +extern unsigned long lpfc_no_hba_reset[]; diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index c22bb3f887e1..d3e9af983015 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -939,8 +939,8 @@ lpfc_cmpl_ct_cmd_gft_id(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, "FC4 x%08x, Data: x%08x x%08x\n", ndlp, did, ndlp->nlp_fc4_type, FC_TYPE_FCP, FC_TYPE_NVME); + ndlp->nlp_prev_state = NLP_STE_REG_LOGIN_ISSUE; } - ndlp->nlp_prev_state = NLP_STE_REG_LOGIN_ISSUE; lpfc_nlp_set_state(vport, ndlp, NLP_STE_PRLI_ISSUE); lpfc_issue_els_prli(vport, ndlp, 0); } else diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 9f4798e9d938..913eed822cb8 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -3653,17 +3653,6 @@ lpfc_idiag_queacc_write(struct file *file, const char __user *buf, idiag.ptr_private = phba->sli4_hba.nvmels_cq; goto pass_check; } - /* NVME LS complete queue */ - if (phba->sli4_hba.nvmels_cq && - phba->sli4_hba.nvmels_cq->queue_id == queid) { - /* Sanity check */ - rc = lpfc_idiag_que_param_check( - phba->sli4_hba.nvmels_cq, index, count); - if (rc) - goto error_out; - idiag.ptr_private = phba->sli4_hba.nvmels_cq; - goto pass_check; - } /* FCP complete queue */ if (phba->sli4_hba.fcp_cq) { for (qidx = 0; qidx < phba->cfg_fcp_io_channel; @@ -3738,17 +3727,6 @@ lpfc_idiag_queacc_write(struct file *file, const char __user *buf, idiag.ptr_private = phba->sli4_hba.nvmels_wq; goto pass_check; } - /* NVME LS work queue */ - if (phba->sli4_hba.nvmels_wq && - phba->sli4_hba.nvmels_wq->queue_id == queid) { - /* Sanity check */ - rc = lpfc_idiag_que_param_check( - phba->sli4_hba.nvmels_wq, index, count); - if (rc) - goto error_out; - idiag.ptr_private = phba->sli4_hba.nvmels_wq; - goto pass_check; - } /* FCP work queue */ if (phba->sli4_hba.fcp_wq) { for (qidx = 0; qidx < phba->cfg_fcp_io_channel; diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 2d26440e6f2f..d9c61d030034 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -5177,15 +5177,15 @@ lpfc_rdp_res_speed(struct fc_rdp_port_speed_desc *desc, struct lpfc_hba *phba) static uint32_t lpfc_rdp_res_diag_port_names(struct fc_rdp_port_name_desc *desc, - struct lpfc_hba *phba) + struct lpfc_vport *vport) { desc->tag = cpu_to_be32(RDP_PORT_NAMES_DESC_TAG); - memcpy(desc->port_names.wwnn, phba->wwnn, + memcpy(desc->port_names.wwnn, &vport->fc_nodename, sizeof(desc->port_names.wwnn)); - memcpy(desc->port_names.wwpn, phba->wwpn, + memcpy(desc->port_names.wwpn, &vport->fc_portname, sizeof(desc->port_names.wwpn)); desc->length = cpu_to_be32(sizeof(desc->port_names)); @@ -5279,7 +5279,7 @@ lpfc_els_rdp_cmpl(struct lpfc_hba *phba, struct lpfc_rdp_context *rdp_context, len += lpfc_rdp_res_link_error((struct fc_rdp_link_error_status_desc *) (len + pcmd), &rdp_context->link_stat); len += lpfc_rdp_res_diag_port_names((struct fc_rdp_port_name_desc *) - (len + pcmd), phba); + (len + pcmd), vport); len += lpfc_rdp_res_attach_port_names((struct fc_rdp_port_name_desc *) (len + pcmd), vport, ndlp); len += lpfc_rdp_res_fec_desc((struct fc_fec_rdp_desc *)(len + pcmd), @@ -8371,11 +8371,17 @@ lpfc_cmpl_reg_new_vport(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb) spin_lock_irq(shost->host_lock); vport->fc_flag |= FC_VPORT_NEEDS_REG_VPI; spin_unlock_irq(shost->host_lock); - if (vport->port_type == LPFC_PHYSICAL_PORT - && !(vport->fc_flag & FC_LOGO_RCVD_DID_CHNG)) - lpfc_issue_init_vfi(vport); - else + if (mb->mbxStatus == MBX_NOT_FINISHED) + break; + if ((vport->port_type == LPFC_PHYSICAL_PORT) && + !(vport->fc_flag & FC_LOGO_RCVD_DID_CHNG)) { + if (phba->sli_rev == LPFC_SLI_REV4) + lpfc_issue_init_vfi(vport); + else + lpfc_initial_flogi(vport); + } else { lpfc_initial_fdisc(vport); + } break; } } else { diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 194a14d5f8a9..180b072beef6 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -313,8 +313,7 @@ lpfc_dev_loss_tmo_handler(struct lpfc_nodelist *ndlp) ndlp->nlp_state, ndlp->nlp_rpi); } - if (!(vport->load_flag & FC_UNLOADING) && - !(ndlp->nlp_flag & NLP_DELAY_TMO) && + if (!(ndlp->nlp_flag & NLP_DELAY_TMO) && !(ndlp->nlp_flag & NLP_NPR_2B_DISC) && (ndlp->nlp_state != NLP_STE_UNMAPPED_NODE) && (ndlp->nlp_state != NLP_STE_REG_LOGIN_ISSUE) && @@ -641,6 +640,8 @@ lpfc_work_done(struct lpfc_hba *phba) lpfc_handle_rrq_active(phba); if (phba->hba_flag & FCP_XRI_ABORT_EVENT) lpfc_sli4_fcp_xri_abort_event_proc(phba); + if (phba->hba_flag & NVME_XRI_ABORT_EVENT) + lpfc_sli4_nvme_xri_abort_event_proc(phba); if (phba->hba_flag & ELS_XRI_ABORT_EVENT) lpfc_sli4_els_xri_abort_event_proc(phba); if (phba->hba_flag & ASYNC_EVENT) @@ -2173,7 +2174,7 @@ lpfc_mbx_cmpl_fcf_scan_read_fcf_rec(struct lpfc_hba *phba, LPFC_MBOXQ_t *mboxq) uint32_t boot_flag, addr_mode; uint16_t fcf_index, next_fcf_index; struct lpfc_fcf_rec *fcf_rec = NULL; - uint16_t vlan_id; + uint16_t vlan_id = LPFC_FCOE_NULL_VID; bool select_new_fcf; int rc; @@ -4020,9 +4021,11 @@ lpfc_register_remote_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) rdata = rport->dd_data; /* break the link before dropping the ref */ ndlp->rport = NULL; - if (rdata && rdata->pnode == ndlp) - lpfc_nlp_put(ndlp); - rdata->pnode = NULL; + if (rdata) { + if (rdata->pnode == ndlp) + lpfc_nlp_put(ndlp); + rdata->pnode = NULL; + } /* drop reference for earlier registeration */ put_device(&rport->dev); } @@ -4344,9 +4347,8 @@ lpfc_initialize_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp, { INIT_LIST_HEAD(&ndlp->els_retry_evt.evt_listp); INIT_LIST_HEAD(&ndlp->dev_loss_evt.evt_listp); - init_timer(&ndlp->nlp_delayfunc); - ndlp->nlp_delayfunc.function = lpfc_els_retry_delay; - ndlp->nlp_delayfunc.data = (unsigned long)ndlp; + setup_timer(&ndlp->nlp_delayfunc, lpfc_els_retry_delay, + (unsigned long)ndlp); ndlp->nlp_DID = did; ndlp->vport = vport; ndlp->phba = vport->phba; @@ -4606,9 +4608,9 @@ lpfc_sli4_dequeue_nport_iocbs(struct lpfc_hba *phba, pring = qp->pring; if (!pring) continue; - spin_lock_irq(&pring->ring_lock); + spin_lock(&pring->ring_lock); __lpfc_dequeue_nport_iocbs(phba, ndlp, pring, dequeue_list); - spin_unlock_irq(&pring->ring_lock); + spin_unlock(&pring->ring_lock); } spin_unlock_irq(&phba->hbalock); } diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index cfdb068a3bfc..15277705cb6b 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -1001,7 +1001,7 @@ struct eq_delay_info { uint32_t phase; uint32_t delay_multi; }; -#define LPFC_MAX_EQ_DELAY 8 +#define LPFC_MAX_EQ_DELAY_EQID_CNT 8 struct sgl_page_pairs { uint32_t sgl_pg0_addr_lo; @@ -1070,7 +1070,7 @@ struct lpfc_mbx_modify_eq_delay { union { struct { uint32_t num_eq; - struct eq_delay_info eq[LPFC_MAX_EQ_DELAY]; + struct eq_delay_info eq[LPFC_MAX_EQ_DELAY_EQID_CNT]; } request; struct { uint32_t word0; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 0ee429d773f3..2697d49da4d7 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -3555,6 +3555,44 @@ out_free_mem: return rc; } +static uint64_t +lpfc_get_wwpn(struct lpfc_hba *phba) +{ + uint64_t wwn; + int rc; + LPFC_MBOXQ_t *mboxq; + MAILBOX_t *mb; + + + mboxq = (LPFC_MBOXQ_t *) mempool_alloc(phba->mbox_mem_pool, + GFP_KERNEL); + if (!mboxq) + return (uint64_t)-1; + + /* First get WWN of HBA instance */ + lpfc_read_nv(phba, mboxq); + rc = lpfc_sli_issue_mbox(phba, mboxq, MBX_POLL); + if (rc != MBX_SUCCESS) { + lpfc_printf_log(phba, KERN_ERR, LOG_SLI, + "6019 Mailbox failed , mbxCmd x%x " + "READ_NV, mbxStatus x%x\n", + bf_get(lpfc_mqe_command, &mboxq->u.mqe), + bf_get(lpfc_mqe_status, &mboxq->u.mqe)); + mempool_free(mboxq, phba->mbox_mem_pool); + return (uint64_t) -1; + } + mb = &mboxq->u.mb; + memcpy(&wwn, (char *)mb->un.varRDnvp.portname, sizeof(uint64_t)); + /* wwn is WWPN of HBA instance */ + mempool_free(mboxq, phba->mbox_mem_pool); + if (phba->sli_rev == LPFC_SLI_REV4) + return be64_to_cpu(wwn); + else + return (((wwn & 0xffffffff00000000) >> 32) | + ((wwn & 0x00000000ffffffff) << 32)); + +} + /** * lpfc_sli4_nvme_sgl_update - update xri-sgl sizing and mapping * @phba: pointer to lpfc hba data structure. @@ -3676,17 +3714,32 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) struct lpfc_vport *vport; struct Scsi_Host *shost = NULL; int error = 0; + int i; + uint64_t wwn; + bool use_no_reset_hba = false; + + wwn = lpfc_get_wwpn(phba); + + for (i = 0; i < lpfc_no_hba_reset_cnt; i++) { + if (wwn == lpfc_no_hba_reset[i]) { + lpfc_printf_log(phba, KERN_ERR, LOG_SLI, + "6020 Setting use_no_reset port=%llx\n", + wwn); + use_no_reset_hba = true; + break; + } + } if (phba->cfg_enable_fc4_type & LPFC_ENABLE_FCP) { if (dev != &phba->pcidev->dev) { shost = scsi_host_alloc(&lpfc_vport_template, sizeof(struct lpfc_vport)); } else { - if (phba->sli_rev == LPFC_SLI_REV4) + if (!use_no_reset_hba) shost = scsi_host_alloc(&lpfc_template, sizeof(struct lpfc_vport)); else - shost = scsi_host_alloc(&lpfc_template_s3, + shost = scsi_host_alloc(&lpfc_template_no_hr, sizeof(struct lpfc_vport)); } } else if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) { @@ -3734,17 +3787,14 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) INIT_LIST_HEAD(&vport->rcv_buffer_list); spin_lock_init(&vport->work_port_lock); - init_timer(&vport->fc_disctmo); - vport->fc_disctmo.function = lpfc_disc_timeout; - vport->fc_disctmo.data = (unsigned long)vport; + setup_timer(&vport->fc_disctmo, lpfc_disc_timeout, + (unsigned long)vport); - init_timer(&vport->els_tmofunc); - vport->els_tmofunc.function = lpfc_els_timeout; - vport->els_tmofunc.data = (unsigned long)vport; + setup_timer(&vport->els_tmofunc, lpfc_els_timeout, + (unsigned long)vport); - init_timer(&vport->delayed_disc_tmo); - vport->delayed_disc_tmo.function = lpfc_delayed_disc_tmo; - vport->delayed_disc_tmo.data = (unsigned long)vport; + setup_timer(&vport->delayed_disc_tmo, lpfc_delayed_disc_tmo, + (unsigned long)vport); error = scsi_add_host_with_dma(shost, dev, &phba->pcidev->dev); if (error) @@ -5406,21 +5456,15 @@ lpfc_setup_driver_resource_phase1(struct lpfc_hba *phba) INIT_LIST_HEAD(&phba->luns); /* MBOX heartbeat timer */ - init_timer(&psli->mbox_tmo); - psli->mbox_tmo.function = lpfc_mbox_timeout; - psli->mbox_tmo.data = (unsigned long) phba; + setup_timer(&psli->mbox_tmo, lpfc_mbox_timeout, (unsigned long)phba); /* Fabric block timer */ - init_timer(&phba->fabric_block_timer); - phba->fabric_block_timer.function = lpfc_fabric_block_timeout; - phba->fabric_block_timer.data = (unsigned long) phba; + setup_timer(&phba->fabric_block_timer, lpfc_fabric_block_timeout, + (unsigned long)phba); /* EA polling mode timer */ - init_timer(&phba->eratt_poll); - phba->eratt_poll.function = lpfc_poll_eratt; - phba->eratt_poll.data = (unsigned long) phba; + setup_timer(&phba->eratt_poll, lpfc_poll_eratt, + (unsigned long)phba); /* Heartbeat timer */ - init_timer(&phba->hb_tmofunc); - phba->hb_tmofunc.function = lpfc_hb_timeout; - phba->hb_tmofunc.data = (unsigned long)phba; + setup_timer(&phba->hb_tmofunc, lpfc_hb_timeout, (unsigned long)phba); return 0; } @@ -5446,9 +5490,8 @@ lpfc_sli_driver_resource_setup(struct lpfc_hba *phba) */ /* FCP polling mode timer */ - init_timer(&phba->fcp_poll_timer); - phba->fcp_poll_timer.function = lpfc_poll_timeout; - phba->fcp_poll_timer.data = (unsigned long) phba; + setup_timer(&phba->fcp_poll_timer, lpfc_poll_timeout, + (unsigned long)phba); /* Host attention work mask setup */ phba->work_ha_mask = (HA_ERATT | HA_MBATT | HA_LATT); @@ -5482,7 +5525,8 @@ lpfc_sli_driver_resource_setup(struct lpfc_hba *phba) /* Initialize the host templates the configured values. */ lpfc_vport_template.sg_tablesize = phba->cfg_sg_seg_cnt; - lpfc_template_s3.sg_tablesize = phba->cfg_sg_seg_cnt; + lpfc_template_no_hr.sg_tablesize = phba->cfg_sg_seg_cnt; + lpfc_template.sg_tablesize = phba->cfg_sg_seg_cnt; /* There are going to be 2 reserved BDEs: 1 FCP cmnd + 1 FCP rsp */ if (phba->cfg_enable_bg) { @@ -5617,14 +5661,11 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) * Initialize timers used by driver */ - init_timer(&phba->rrq_tmr); - phba->rrq_tmr.function = lpfc_rrq_timeout; - phba->rrq_tmr.data = (unsigned long)phba; + setup_timer(&phba->rrq_tmr, lpfc_rrq_timeout, (unsigned long)phba); /* FCF rediscover timer */ - init_timer(&phba->fcf.redisc_wait); - phba->fcf.redisc_wait.function = lpfc_sli4_fcf_redisc_wait_tmo; - phba->fcf.redisc_wait.data = (unsigned long)phba; + setup_timer(&phba->fcf.redisc_wait, lpfc_sli4_fcf_redisc_wait_tmo, + (unsigned long)phba); /* * Control structure for handling external multi-buffer mailbox @@ -5706,6 +5747,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) /* Initialize the host templates with the updated values. */ lpfc_vport_template.sg_tablesize = phba->cfg_sg_seg_cnt; lpfc_template.sg_tablesize = phba->cfg_sg_seg_cnt; + lpfc_template_no_hr.sg_tablesize = phba->cfg_sg_seg_cnt; if (phba->cfg_sg_dma_buf_size <= LPFC_MIN_SG_SLI4_BUF_SZ) phba->cfg_sg_dma_buf_size = LPFC_MIN_SG_SLI4_BUF_SZ; @@ -5736,6 +5778,8 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) /* Initialize the Abort nvme buffer list used by driver */ spin_lock_init(&phba->sli4_hba.abts_nvme_buf_list_lock); INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvme_buf_list); + /* Fast-path XRI aborted CQ Event work queue list */ + INIT_LIST_HEAD(&phba->sli4_hba.sp_nvme_xri_aborted_work_queue); } /* This abort list used by worker thread */ @@ -8712,12 +8756,9 @@ lpfc_sli4_queue_setup(struct lpfc_hba *phba) } } - /* - * Configure EQ delay multipier for interrupt coalescing using - * MODIFY_EQ_DELAY for all EQs created, LPFC_MAX_EQ_DELAY at a time. - */ - for (qidx = 0; qidx < io_channel; qidx += LPFC_MAX_EQ_DELAY) + for (qidx = 0; qidx < io_channel; qidx += LPFC_MAX_EQ_DELAY_EQID_CNT) lpfc_modify_hba_eq_delay(phba, qidx); + return 0; out_destroy: @@ -8973,6 +9014,11 @@ lpfc_sli4_cq_event_release_all(struct lpfc_hba *phba) /* Pending ELS XRI abort events */ list_splice_init(&phba->sli4_hba.sp_els_xri_aborted_work_queue, &cqelist); + if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) { + /* Pending NVME XRI abort events */ + list_splice_init(&phba->sli4_hba.sp_nvme_xri_aborted_work_queue, + &cqelist); + } /* Pending asynnc events */ list_splice_init(&phba->sli4_hba.sp_asynce_work_queue, &cqelist); @@ -10400,12 +10446,7 @@ lpfc_pci_remove_one_s3(struct pci_dev *pdev) fc_remove_host(shost); scsi_remove_host(shost); - /* Perform ndlp cleanup on the physical port. The nvme and nvmet - * localports are destroyed after to cleanup all transport memory. - */ lpfc_cleanup(vport); - lpfc_nvmet_destroy_targetport(phba); - lpfc_nvme_destroy_localport(vport); /* * Bring down the SLI Layer. This step disable all interrupts, @@ -12018,6 +12059,7 @@ static struct pci_driver lpfc_driver = { .id_table = lpfc_id_table, .probe = lpfc_pci_probe_one, .remove = lpfc_pci_remove_one, + .shutdown = lpfc_pci_remove_one, .suspend = lpfc_pci_suspend_one, .resume = lpfc_pci_resume_one, .err_handler = &lpfc_err_handler, diff --git a/drivers/scsi/lpfc/lpfc_mem.c b/drivers/scsi/lpfc/lpfc_mem.c index c61d8d692ede..5986c7957199 100644 --- a/drivers/scsi/lpfc/lpfc_mem.c +++ b/drivers/scsi/lpfc/lpfc_mem.c @@ -646,7 +646,6 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) } dma_buf->iocbq = lpfc_sli_get_iocbq(phba); - dma_buf->iocbq->iocb_flag = LPFC_IO_NVMET; if (!dma_buf->iocbq) { kfree(dma_buf->context); pci_pool_free(phba->lpfc_drb_pool, dma_buf->dbuf.virt, @@ -658,6 +657,7 @@ lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba) "2621 Ran out of nvmet iocb/WQEs\n"); return NULL; } + dma_buf->iocbq->iocb_flag = LPFC_IO_NVMET; nvmewqe = dma_buf->iocbq; wqe = (union lpfc_wqe128 *)&nvmewqe->wqe; /* Initialize WQE */ diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index 609a908ea9db..0a4c19081409 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -316,7 +316,7 @@ lpfc_nvme_gen_req(struct lpfc_vport *vport, struct lpfc_dmabuf *bmp, bf_set(wqe_dfctl, &wqe->gen_req.wge_ctl, 0); bf_set(wqe_si, &wqe->gen_req.wge_ctl, 1); bf_set(wqe_la, &wqe->gen_req.wge_ctl, 1); - bf_set(wqe_rctl, &wqe->gen_req.wge_ctl, FC_RCTL_DD_UNSOL_CTL); + bf_set(wqe_rctl, &wqe->gen_req.wge_ctl, FC_RCTL_ELS4_REQ); bf_set(wqe_type, &wqe->gen_req.wge_ctl, FC_TYPE_NVME); /* Word 6 */ @@ -620,15 +620,15 @@ lpfc_nvme_adj_fcp_sgls(struct lpfc_vport *vport, * Embed the payload in the last half of the WQE * WQE words 16-30 get the NVME CMD IU payload * - * WQE Word 16 is already setup with flags - * WQE words 17-19 get payload Words 2-4 + * WQE words 16-19 get payload Words 1-4 * WQE words 20-21 get payload Words 6-7 * WQE words 22-29 get payload Words 16-23 */ - wptr = &wqe->words[17]; /* WQE ptr */ + wptr = &wqe->words[16]; /* WQE ptr */ dptr = (uint32_t *)nCmd->cmdaddr; /* payload ptr */ - dptr += 2; /* Skip Words 0-1 in payload */ + dptr++; /* Skip Word 0 in payload */ + *wptr++ = *dptr++; /* Word 1 */ *wptr++ = *dptr++; /* Word 2 */ *wptr++ = *dptr++; /* Word 3 */ *wptr++ = *dptr++; /* Word 4 */ @@ -978,9 +978,6 @@ lpfc_nvme_prep_io_cmd(struct lpfc_vport *vport, bf_set(wqe_cmd_type, &wqe->generic.wqe_com, NVME_WRITE_CMD); - /* Word 16 */ - wqe->words[16] = LPFC_NVME_EMBED_WRITE; - phba->fc4NvmeOutputRequests++; } else { /* Word 7 */ @@ -1002,9 +999,6 @@ lpfc_nvme_prep_io_cmd(struct lpfc_vport *vport, bf_set(wqe_cmd_type, &wqe->generic.wqe_com, NVME_READ_CMD); - /* Word 16 */ - wqe->words[16] = LPFC_NVME_EMBED_READ; - phba->fc4NvmeInputRequests++; } } else { @@ -1026,9 +1020,6 @@ lpfc_nvme_prep_io_cmd(struct lpfc_vport *vport, /* Word 11 */ bf_set(wqe_cmd_type, &wqe->generic.wqe_com, NVME_READ_CMD); - /* Word 16 */ - wqe->words[16] = LPFC_NVME_EMBED_CMD; - phba->fc4NvmeControlRequests++; } /* @@ -1286,6 +1277,7 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport, pnvme_fcreq->private = (void *)lpfc_ncmd; lpfc_ncmd->nvmeCmd = pnvme_fcreq; lpfc_ncmd->nrport = rport; + lpfc_ncmd->ndlp = ndlp; lpfc_ncmd->start_time = jiffies; lpfc_nvme_prep_io_cmd(vport, lpfc_ncmd, ndlp); @@ -1319,7 +1311,7 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport, "sid: x%x did: x%x oxid: x%x\n", ret, vport->fc_myDID, ndlp->nlp_DID, lpfc_ncmd->cur_iocbq.sli4_xritag); - ret = -EINVAL; + ret = -EBUSY; goto out_free_nvme_buf; } @@ -1821,10 +1813,10 @@ lpfc_post_nvme_sgl_list(struct lpfc_hba *phba, pdma_phys_sgl1, cur_xritag); if (status) { /* failure, put on abort nvme list */ - lpfc_ncmd->exch_busy = 1; + lpfc_ncmd->flags |= LPFC_SBUF_XBUSY; } else { /* success, put on NVME buffer list */ - lpfc_ncmd->exch_busy = 0; + lpfc_ncmd->flags &= ~LPFC_SBUF_XBUSY; lpfc_ncmd->status = IOSTAT_SUCCESS; num_posted++; } @@ -1854,10 +1846,10 @@ lpfc_post_nvme_sgl_list(struct lpfc_hba *phba, struct lpfc_nvme_buf, list); if (status) { /* failure, put on abort nvme list */ - lpfc_ncmd->exch_busy = 1; + lpfc_ncmd->flags |= LPFC_SBUF_XBUSY; } else { /* success, put on NVME buffer list */ - lpfc_ncmd->exch_busy = 0; + lpfc_ncmd->flags &= ~LPFC_SBUF_XBUSY; lpfc_ncmd->status = IOSTAT_SUCCESS; num_posted++; } @@ -2099,7 +2091,7 @@ lpfc_release_nvme_buf(struct lpfc_hba *phba, struct lpfc_nvme_buf *lpfc_ncmd) unsigned long iflag = 0; lpfc_ncmd->nonsg_phys = 0; - if (lpfc_ncmd->exch_busy) { + if (lpfc_ncmd->flags & LPFC_SBUF_XBUSY) { spin_lock_irqsave(&phba->sli4_hba.abts_nvme_buf_list_lock, iflag); lpfc_ncmd->nvmeCmd = NULL; @@ -2135,11 +2127,12 @@ lpfc_release_nvme_buf(struct lpfc_hba *phba, struct lpfc_nvme_buf *lpfc_ncmd) int lpfc_nvme_create_localport(struct lpfc_vport *vport) { + int ret = 0; struct lpfc_hba *phba = vport->phba; struct nvme_fc_port_info nfcp_info; struct nvme_fc_local_port *localport; struct lpfc_nvme_lport *lport; - int len, ret = 0; + int len; /* Initialize this localport instance. The vport wwn usage ensures * that NPIV is accounted for. @@ -2156,8 +2149,12 @@ lpfc_nvme_create_localport(struct lpfc_vport *vport) /* localport is allocated from the stack, but the registration * call allocates heap memory as well as the private area. */ +#ifdef CONFIG_LPFC_NVME_INITIATOR ret = nvme_fc_register_localport(&nfcp_info, &lpfc_nvme_template, &vport->phba->pcidev->dev, &localport); +#else + ret = -ENOMEM; +#endif if (!ret) { lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME | LOG_NVME_DISC, "6005 Successfully registered local " @@ -2173,10 +2170,10 @@ lpfc_nvme_create_localport(struct lpfc_vport *vport) lport->vport = vport; INIT_LIST_HEAD(&lport->rport_list); vport->nvmei_support = 1; + len = lpfc_new_nvme_buf(vport, phba->sli4_hba.nvme_xri_max); + vport->phba->total_nvme_bufs += len; } - len = lpfc_new_nvme_buf(vport, phba->sli4_hba.nvme_xri_max); - vport->phba->total_nvme_bufs += len; return ret; } @@ -2193,6 +2190,7 @@ lpfc_nvme_create_localport(struct lpfc_vport *vport) void lpfc_nvme_destroy_localport(struct lpfc_vport *vport) { +#ifdef CONFIG_LPFC_NVME_INITIATOR struct nvme_fc_local_port *localport; struct lpfc_nvme_lport *lport; struct lpfc_nvme_rport *rport = NULL, *rport_next = NULL; @@ -2208,7 +2206,6 @@ lpfc_nvme_destroy_localport(struct lpfc_vport *vport) lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME, "6011 Destroying NVME localport %p\n", localport); - list_for_each_entry_safe(rport, rport_next, &lport->rport_list, list) { /* The last node ref has to get released now before the rport * private memory area is released by the transport. @@ -2222,6 +2219,7 @@ lpfc_nvme_destroy_localport(struct lpfc_vport *vport) "6008 rport fail destroy %x\n", ret); wait_for_completion_timeout(&rport->rport_unreg_done, 5); } + /* lport's rport list is clear. Unregister * lport and release resources. */ @@ -2245,6 +2243,7 @@ lpfc_nvme_destroy_localport(struct lpfc_vport *vport) "Failed, status x%x\n", ret); } +#endif } void @@ -2275,6 +2274,7 @@ lpfc_nvme_update_localport(struct lpfc_vport *vport) int lpfc_nvme_register_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) { +#ifdef CONFIG_LPFC_NVME_INITIATOR int ret = 0; struct nvme_fc_local_port *localport; struct lpfc_nvme_lport *lport; @@ -2348,7 +2348,6 @@ lpfc_nvme_register_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) rpinfo.port_role |= FC_PORT_ROLE_NVME_INITIATOR; rpinfo.port_name = wwn_to_u64(ndlp->nlp_portname.u.wwn); rpinfo.node_name = wwn_to_u64(ndlp->nlp_nodename.u.wwn); - ret = nvme_fc_register_remoteport(localport, &rpinfo, &remote_port); if (!ret) { @@ -2384,6 +2383,9 @@ lpfc_nvme_register_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) ndlp->nlp_type, ndlp->nlp_DID, ndlp); } return ret; +#else + return 0; +#endif } /* lpfc_nvme_unregister_port - unbind the DID and port_role from this rport. @@ -2401,6 +2403,7 @@ lpfc_nvme_register_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) void lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) { +#ifdef CONFIG_LPFC_NVME_INITIATOR int ret; struct nvme_fc_local_port *localport; struct lpfc_nvme_lport *lport; @@ -2458,7 +2461,61 @@ lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) return; input_err: +#endif lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_DISC, "6168: State error: lport %p, rport%p FCID x%06x\n", vport->localport, ndlp->rport, ndlp->nlp_DID); } + +/** + * lpfc_sli4_nvme_xri_aborted - Fast-path process of NVME xri abort + * @phba: pointer to lpfc hba data structure. + * @axri: pointer to the fcp xri abort wcqe structure. + * + * This routine is invoked by the worker thread to process a SLI4 fast-path + * FCP aborted xri. + **/ +void +lpfc_sli4_nvme_xri_aborted(struct lpfc_hba *phba, + struct sli4_wcqe_xri_aborted *axri) +{ + uint16_t xri = bf_get(lpfc_wcqe_xa_xri, axri); + uint16_t rxid = bf_get(lpfc_wcqe_xa_remote_xid, axri); + struct lpfc_nvme_buf *lpfc_ncmd, *next_lpfc_ncmd; + struct lpfc_nodelist *ndlp; + unsigned long iflag = 0; + int rrq_empty = 0; + + if (!(phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)) + return; + spin_lock_irqsave(&phba->hbalock, iflag); + spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock); + list_for_each_entry_safe(lpfc_ncmd, next_lpfc_ncmd, + &phba->sli4_hba.lpfc_abts_nvme_buf_list, + list) { + if (lpfc_ncmd->cur_iocbq.sli4_xritag == xri) { + list_del(&lpfc_ncmd->list); + lpfc_ncmd->flags &= ~LPFC_SBUF_XBUSY; + lpfc_ncmd->status = IOSTAT_SUCCESS; + spin_unlock( + &phba->sli4_hba.abts_nvme_buf_list_lock); + + rrq_empty = list_empty(&phba->active_rrq_list); + spin_unlock_irqrestore(&phba->hbalock, iflag); + ndlp = lpfc_ncmd->ndlp; + if (ndlp) { + lpfc_set_rrq_active( + phba, ndlp, + lpfc_ncmd->cur_iocbq.sli4_lxritag, + rxid, 1); + lpfc_sli4_abts_err_handler(phba, ndlp, axri); + } + lpfc_release_nvme_buf(phba, lpfc_ncmd); + if (rrq_empty) + lpfc_worker_wake_up(phba); + return; + } + } + spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock); + spin_unlock_irqrestore(&phba->hbalock, iflag); +} diff --git a/drivers/scsi/lpfc/lpfc_nvme.h b/drivers/scsi/lpfc/lpfc_nvme.h index b2fae5e813f8..1347deb8dd6c 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.h +++ b/drivers/scsi/lpfc/lpfc_nvme.h @@ -57,6 +57,7 @@ struct lpfc_nvme_buf { struct list_head list; struct nvmefc_fcp_req *nvmeCmd; struct lpfc_nvme_rport *nrport; + struct lpfc_nodelist *ndlp; uint32_t timeout; diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index c421e1738ee9..b7739a554fe0 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -571,6 +571,7 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport, lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, "6102 Bad state IO x%x aborted\n", ctxp->oxid); + rc = -ENXIO; goto aerr; } @@ -580,6 +581,7 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport, lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, "6152 FCP Drop IO x%x: Prep\n", ctxp->oxid); + rc = -ENXIO; goto aerr; } @@ -618,8 +620,9 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport, ctxp->wqeq->hba_wqidx = 0; nvmewqeq->context2 = NULL; nvmewqeq->context3 = NULL; + rc = -EBUSY; aerr: - return -ENXIO; + return rc; } static void @@ -668,9 +671,13 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba) lpfc_tgttemplate.target_features = NVMET_FCTGTFEAT_READDATA_RSP | NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED; +#ifdef CONFIG_LPFC_NVME_TARGET error = nvmet_fc_register_targetport(&pinfo, &lpfc_tgttemplate, &phba->pcidev->dev, &phba->targetport); +#else + error = -ENOMEM; +#endif if (error) { lpfc_printf_log(phba, KERN_ERR, LOG_NVME_DISC, "6025 Cannot register NVME targetport " @@ -731,9 +738,25 @@ lpfc_nvmet_update_targetport(struct lpfc_hba *phba) return 0; } +/** + * lpfc_sli4_nvmet_xri_aborted - Fast-path process of nvmet xri abort + * @phba: pointer to lpfc hba data structure. + * @axri: pointer to the nvmet xri abort wcqe structure. + * + * This routine is invoked by the worker thread to process a SLI4 fast-path + * NVMET aborted xri. + **/ +void +lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba, + struct sli4_wcqe_xri_aborted *axri) +{ + /* TODO: work in progress */ +} + void lpfc_nvmet_destroy_targetport(struct lpfc_hba *phba) { +#ifdef CONFIG_LPFC_NVME_TARGET struct lpfc_nvmet_tgtport *tgtp; if (phba->nvmet_support == 0) @@ -745,6 +768,7 @@ lpfc_nvmet_destroy_targetport(struct lpfc_hba *phba) wait_for_completion_timeout(&tgtp->tport_unreg_done, 5); } phba->targetport = NULL; +#endif } /** @@ -764,6 +788,7 @@ static void lpfc_nvmet_unsol_ls_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, struct hbq_dmabuf *nvmebuf) { +#ifdef CONFIG_LPFC_NVME_TARGET struct lpfc_nvmet_tgtport *tgtp; struct fc_frame_header *fc_hdr; struct lpfc_nvmet_rcv_ctx *ctxp; @@ -844,6 +869,7 @@ dropit: atomic_inc(&tgtp->xmt_ls_abort); lpfc_nvmet_unsol_ls_issue_abort(phba, ctxp, sid, oxid); +#endif } /** @@ -865,6 +891,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, struct rqb_dmabuf *nvmebuf, uint64_t isr_timestamp) { +#ifdef CONFIG_LPFC_NVME_TARGET struct lpfc_nvmet_rcv_ctx *ctxp; struct lpfc_nvmet_tgtport *tgtp; struct fc_frame_header *fc_hdr; @@ -955,7 +982,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba, atomic_inc(&tgtp->rcv_fcp_cmd_drop); lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, - "6159 FCP Drop IO x%x: nvmet_fc_rcv_fcp_req x%x\n", + "6159 FCP Drop IO x%x: err x%x\n", ctxp->oxid, rc); dropit: lpfc_nvmeio_data(phba, "NVMET FCP DROP: xri x%x sz %d from %06x\n", @@ -970,6 +997,7 @@ dropit: /* We assume a rcv'ed cmd ALWAYs fits into 1 buffer */ lpfc_nvmet_rq_post(phba, NULL, &nvmebuf->hbuf); } +#endif } /** @@ -1114,7 +1142,7 @@ lpfc_nvmet_prep_ls_wqe(struct lpfc_hba *phba, bf_set(wqe_dfctl, &wqe->xmit_sequence.wge_ctl, 0); bf_set(wqe_ls, &wqe->xmit_sequence.wge_ctl, 1); bf_set(wqe_la, &wqe->xmit_sequence.wge_ctl, 0); - bf_set(wqe_rctl, &wqe->xmit_sequence.wge_ctl, FC_RCTL_DD_SOL_CTL); + bf_set(wqe_rctl, &wqe->xmit_sequence.wge_ctl, FC_RCTL_ELS4_REP); bf_set(wqe_type, &wqe->xmit_sequence.wge_ctl, FC_TYPE_NVME); /* Word 6 */ @@ -1445,7 +1473,6 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba, case NVMET_FCOP_RSP: /* Words 0 - 2 */ - sgel = &rsp->sg[0]; physaddr = rsp->rspdma; wqe->fcp_trsp.bde.tus.f.bdeFlags = BUFF_TYPE_BDE_64; wqe->fcp_trsp.bde.tus.f.bdeSize = rsp->rsplen; @@ -1681,8 +1708,8 @@ lpfc_nvmet_unsol_issue_abort(struct lpfc_hba *phba, struct lpfc_nodelist *ndlp; lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS, - "6067 %s: Entrypoint: sid %x xri %x\n", __func__, - sid, xri); + "6067 Abort: sid %x xri x%x/x%x\n", + sid, xri, ctxp->wqeq->sli4_xritag); tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private; @@ -1693,7 +1720,7 @@ lpfc_nvmet_unsol_issue_abort(struct lpfc_hba *phba, atomic_inc(&tgtp->xmt_abort_rsp_error); lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS, "6134 Drop ABTS - wrong NDLP state x%x.\n", - ndlp->nlp_state); + (ndlp) ? ndlp->nlp_state : NLP_STE_MAX_STATE); /* No failure to an ABTS request. */ return 0; @@ -1791,7 +1818,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba, atomic_inc(&tgtp->xmt_abort_rsp_error); lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS, "6160 Drop ABTS - wrong NDLP state x%x.\n", - ndlp->nlp_state); + (ndlp) ? ndlp->nlp_state : NLP_STE_MAX_STATE); /* No failure to an ABTS request. */ return 0; diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 9d6384af9fce..54fd0c81ceaf 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -5953,12 +5953,13 @@ struct scsi_host_template lpfc_template_nvme = { .track_queue_depth = 0, }; -struct scsi_host_template lpfc_template_s3 = { +struct scsi_host_template lpfc_template_no_hr = { .module = THIS_MODULE, .name = LPFC_DRIVER_NAME, .proc_name = LPFC_DRIVER_NAME, .info = lpfc_info, .queuecommand = lpfc_queuecommand, + .eh_timed_out = fc_eh_timed_out, .eh_abort_handler = lpfc_abort_handler, .eh_device_reset_handler = lpfc_device_reset_handler, .eh_target_reset_handler = lpfc_target_reset_handler, @@ -6015,7 +6016,6 @@ struct scsi_host_template lpfc_vport_template = { .eh_abort_handler = lpfc_abort_handler, .eh_device_reset_handler = lpfc_device_reset_handler, .eh_target_reset_handler = lpfc_target_reset_handler, - .eh_bus_reset_handler = lpfc_bus_reset_handler, .slave_alloc = lpfc_slave_alloc, .slave_configure = lpfc_slave_configure, .slave_destroy = lpfc_slave_destroy, diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index e43e5e23c24b..1c9fa45df7eb 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -1,3 +1,4 @@ + /******************************************************************* * This file is part of the Emulex Linux Device Driver for * * Fibre Channel Host Bus Adapters. * @@ -952,7 +953,7 @@ __lpfc_sli_get_els_sglq(struct lpfc_hba *phba, struct lpfc_iocbq *piocbq) start_sglq = sglq; while (!found) { if (!sglq) - return NULL; + break; if (ndlp && ndlp->active_rrqs_xri_bitmap && test_bit(sglq->sli4_lxritag, ndlp->active_rrqs_xri_bitmap)) { @@ -12213,6 +12214,41 @@ void lpfc_sli4_fcp_xri_abort_event_proc(struct lpfc_hba *phba) } /** + * lpfc_sli4_nvme_xri_abort_event_proc - Process nvme xri abort event + * @phba: pointer to lpfc hba data structure. + * + * This routine is invoked by the worker thread to process all the pending + * SLI4 NVME abort XRI events. + **/ +void lpfc_sli4_nvme_xri_abort_event_proc(struct lpfc_hba *phba) +{ + struct lpfc_cq_event *cq_event; + + /* First, declare the fcp xri abort event has been handled */ + spin_lock_irq(&phba->hbalock); + phba->hba_flag &= ~NVME_XRI_ABORT_EVENT; + spin_unlock_irq(&phba->hbalock); + /* Now, handle all the fcp xri abort events */ + while (!list_empty(&phba->sli4_hba.sp_nvme_xri_aborted_work_queue)) { + /* Get the first event from the head of the event queue */ + spin_lock_irq(&phba->hbalock); + list_remove_head(&phba->sli4_hba.sp_nvme_xri_aborted_work_queue, + cq_event, struct lpfc_cq_event, list); + spin_unlock_irq(&phba->hbalock); + /* Notify aborted XRI for NVME work queue */ + if (phba->nvmet_support) { + lpfc_sli4_nvmet_xri_aborted(phba, + &cq_event->cqe.wcqe_axri); + } else { + lpfc_sli4_nvme_xri_aborted(phba, + &cq_event->cqe.wcqe_axri); + } + /* Free the event processed back to the free pool */ + lpfc_sli4_cq_event_release(phba, cq_event); + } +} + +/** * lpfc_sli4_els_xri_abort_event_proc - Process els xri abort event * @phba: pointer to lpfc hba data structure. * @@ -12709,10 +12745,22 @@ lpfc_sli4_sp_handle_abort_xri_wcqe(struct lpfc_hba *phba, spin_unlock_irqrestore(&phba->hbalock, iflags); workposted = true; break; + case LPFC_NVME: + spin_lock_irqsave(&phba->hbalock, iflags); + list_add_tail(&cq_event->list, + &phba->sli4_hba.sp_nvme_xri_aborted_work_queue); + /* Set the nvme xri abort event flag */ + phba->hba_flag |= NVME_XRI_ABORT_EVENT; + spin_unlock_irqrestore(&phba->hbalock, iflags); + workposted = true; + break; default: lpfc_printf_log(phba, KERN_ERR, LOG_SLI, - "0603 Invalid work queue CQE subtype (x%x)\n", - cq->subtype); + "0603 Invalid CQ subtype %d: " + "%08x %08x %08x %08x\n", + cq->subtype, wcqe->word0, wcqe->parameter, + wcqe->word2, wcqe->word3); + lpfc_sli4_cq_event_release(phba, cq_event); workposted = false; break; } @@ -13827,6 +13875,8 @@ lpfc_dual_chute_pci_bar_map(struct lpfc_hba *phba, uint16_t pci_barset) * @startq: The starting FCP EQ to modify * * This function sends an MODIFY_EQ_DELAY mailbox command to the HBA. + * The command allows up to LPFC_MAX_EQ_DELAY_EQID_CNT EQ ID's to be + * updated in one mailbox command. * * The @phba struct is used to send mailbox command to HBA. The @startq * is used to get the starting FCP EQ to change. @@ -13879,7 +13929,7 @@ lpfc_modify_hba_eq_delay(struct lpfc_hba *phba, uint32_t startq) eq_delay->u.request.eq[cnt].phase = 0; eq_delay->u.request.eq[cnt].delay_multi = dmult; cnt++; - if (cnt >= LPFC_MAX_EQ_DELAY) + if (cnt >= LPFC_MAX_EQ_DELAY_EQID_CNT) break; } eq_delay->u.request.num_eq = cnt; @@ -15185,17 +15235,17 @@ lpfc_mrq_create(struct lpfc_hba *phba, struct lpfc_queue **hrqp, drq = drqp[idx]; cq = cqp[idx]; - if (hrq->entry_count != drq->entry_count) { - status = -EINVAL; - goto out; - } - /* sanity check on queue memory */ if (!hrq || !drq || !cq) { status = -ENODEV; goto out; } + if (hrq->entry_count != drq->entry_count) { + status = -EINVAL; + goto out; + } + if (idx == 0) { bf_set(lpfc_mbx_rq_create_num_pages, &rq_create->u.request, diff --git a/drivers/scsi/lpfc/lpfc_sli4.h b/drivers/scsi/lpfc/lpfc_sli4.h index 91153c9f6d18..710458cf11d6 100644 --- a/drivers/scsi/lpfc/lpfc_sli4.h +++ b/drivers/scsi/lpfc/lpfc_sli4.h @@ -642,6 +642,7 @@ struct lpfc_sli4_hba { struct list_head sp_asynce_work_queue; struct list_head sp_fcp_xri_aborted_work_queue; struct list_head sp_els_xri_aborted_work_queue; + struct list_head sp_nvme_xri_aborted_work_queue; struct list_head sp_unsol_work_queue; struct lpfc_sli4_link link_state; struct lpfc_sli4_lnk_info lnk_info; @@ -794,9 +795,14 @@ void lpfc_sli4_fcf_redisc_event_proc(struct lpfc_hba *); int lpfc_sli4_resume_rpi(struct lpfc_nodelist *, void (*)(struct lpfc_hba *, LPFC_MBOXQ_t *), void *); void lpfc_sli4_fcp_xri_abort_event_proc(struct lpfc_hba *); +void lpfc_sli4_nvme_xri_abort_event_proc(struct lpfc_hba *phba); void lpfc_sli4_els_xri_abort_event_proc(struct lpfc_hba *); void lpfc_sli4_fcp_xri_aborted(struct lpfc_hba *, struct sli4_wcqe_xri_aborted *); +void lpfc_sli4_nvme_xri_aborted(struct lpfc_hba *phba, + struct sli4_wcqe_xri_aborted *axri); +void lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba, + struct sli4_wcqe_xri_aborted *axri); void lpfc_sli4_els_xri_aborted(struct lpfc_hba *, struct sli4_wcqe_xri_aborted *); void lpfc_sli4_vport_delete_els_xri_aborted(struct lpfc_vport *); diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h index 86c6c9b26b82..d4e95e28f4e3 100644 --- a/drivers/scsi/lpfc/lpfc_version.h +++ b/drivers/scsi/lpfc/lpfc_version.h @@ -20,7 +20,7 @@ * included with this package. * *******************************************************************/ -#define LPFC_DRIVER_VERSION "11.2.0.7" +#define LPFC_DRIVER_VERSION "11.2.0.10" #define LPFC_DRIVER_NAME "lpfc" /* Used for SLI 2/3 */ diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h index 7fe7e6ed595b..8981806fb13f 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.h +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h @@ -1442,9 +1442,6 @@ void mpt3sas_transport_update_links(struct MPT3SAS_ADAPTER *ioc, u64 sas_address, u16 handle, u8 phy_number, u8 link_rate); extern struct sas_function_template mpt3sas_transport_functions; extern struct scsi_transport_template *mpt3sas_transport_template; -extern int scsi_internal_device_block(struct scsi_device *sdev); -extern int scsi_internal_device_unblock(struct scsi_device *sdev, - enum scsi_device_state new_state); /* trigger data externs */ void mpt3sas_send_trigger_data_event(struct MPT3SAS_ADAPTER *ioc, struct SL_WH_TRIGGERS_EVENT_DATA_T *event_data); diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 46e866c36c8a..919ba2bb15f1 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -2859,7 +2859,7 @@ _scsih_internal_device_block(struct scsi_device *sdev, sas_device_priv_data->sas_target->handle); sas_device_priv_data->block = 1; - r = scsi_internal_device_block(sdev); + r = scsi_internal_device_block(sdev, false); if (r == -EINVAL) sdev_printk(KERN_WARNING, sdev, "device_block failed with return(%d) for handle(0x%04x)\n", @@ -2895,7 +2895,7 @@ _scsih_internal_device_unblock(struct scsi_device *sdev, "performing a block followed by an unblock\n", r, sas_device_priv_data->sas_target->handle); sas_device_priv_data->block = 1; - r = scsi_internal_device_block(sdev); + r = scsi_internal_device_block(sdev, false); if (r) sdev_printk(KERN_WARNING, sdev, "retried device_block " "failed with return(%d) for handle(0x%04x)\n", @@ -4677,7 +4677,6 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply) struct MPT3SAS_DEVICE *sas_device_priv_data; u32 response_code = 0; unsigned long flags; - unsigned int sector_sz; mpi_reply = mpt3sas_base_get_reply_virt_addr(ioc, reply); @@ -4742,20 +4741,6 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply) } xfer_cnt = le32_to_cpu(mpi_reply->TransferCount); - - /* In case of bogus fw or device, we could end up having - * unaligned partial completion. We can force alignment here, - * then scsi-ml does not need to handle this misbehavior. - */ - sector_sz = scmd->device->sector_size; - if (unlikely(!blk_rq_is_passthrough(scmd->request) && sector_sz && - xfer_cnt % sector_sz)) { - sdev_printk(KERN_INFO, scmd->device, - "unaligned partial completion avoided (xfer_cnt=%u, sector_sz=%u)\n", - xfer_cnt, sector_sz); - xfer_cnt = round_down(xfer_cnt, sector_sz); - } - scsi_set_resid(scmd, scsi_bufflen(scmd) - xfer_cnt); if (ioc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE) log_info = le32_to_cpu(mpi_reply->IOCLogInfo); diff --git a/drivers/scsi/qedf/qedf_dbg.h b/drivers/scsi/qedf/qedf_dbg.h index 23bd70628a2f..7d173f48a81e 100644 --- a/drivers/scsi/qedf/qedf_dbg.h +++ b/drivers/scsi/qedf/qedf_dbg.h @@ -81,14 +81,17 @@ struct qedf_dbg_ctx { #define QEDF_INFO(pdev, level, fmt, ...) \ qedf_dbg_info(pdev, __func__, __LINE__, level, fmt, \ ## __VA_ARGS__) - -extern void qedf_dbg_err(struct qedf_dbg_ctx *qedf, const char *func, u32 line, +__printf(4, 5) +void qedf_dbg_err(struct qedf_dbg_ctx *qedf, const char *func, u32 line, const char *fmt, ...); -extern void qedf_dbg_warn(struct qedf_dbg_ctx *qedf, const char *func, u32 line, +__printf(4, 5) +void qedf_dbg_warn(struct qedf_dbg_ctx *qedf, const char *func, u32 line, const char *, ...); -extern void qedf_dbg_notice(struct qedf_dbg_ctx *qedf, const char *func, +__printf(4, 5) +void qedf_dbg_notice(struct qedf_dbg_ctx *qedf, const char *func, u32 line, const char *, ...); -extern void qedf_dbg_info(struct qedf_dbg_ctx *qedf, const char *func, u32 line, +__printf(5, 6) +void qedf_dbg_info(struct qedf_dbg_ctx *qedf, const char *func, u32 line, u32 info, const char *fmt, ...); /* GRC Dump related defines */ diff --git a/drivers/scsi/qedf/qedf_fip.c b/drivers/scsi/qedf/qedf_fip.c index 868d423380d1..ed58b9104f58 100644 --- a/drivers/scsi/qedf/qedf_fip.c +++ b/drivers/scsi/qedf/qedf_fip.c @@ -203,7 +203,7 @@ void qedf_fip_recv(struct qedf_ctx *qedf, struct sk_buff *skb) case FIP_DT_MAC: mp = (struct fip_mac_desc *)desc; QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_LL2, - "fd_mac=%pM.\n", __func__, mp->fd_mac); + "fd_mac=%pM\n", mp->fd_mac); ether_addr_copy(cvl_mac, mp->fd_mac); break; case FIP_DT_NAME: diff --git a/drivers/scsi/qedf/qedf_io.c b/drivers/scsi/qedf/qedf_io.c index ee0dcf9d3aba..46debe5034af 100644 --- a/drivers/scsi/qedf/qedf_io.c +++ b/drivers/scsi/qedf/qedf_io.c @@ -1342,7 +1342,7 @@ void qedf_scsi_completion(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, } else { refcount = kref_read(&io_req->refcount); QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, - "%d:0:%d:%d xid=0x%0x op=0x%02x " + "%d:0:%d:%lld xid=0x%0x op=0x%02x " "lba=%02x%02x%02x%02x cdb_status=%d " "fcp_resid=0x%x refcount=%d.\n", qedf->lport->host->host_no, sc_cmd->device->id, @@ -1426,7 +1426,7 @@ void qedf_scsi_done(struct qedf_ctx *qedf, struct qedf_ioreq *io_req, sc_cmd->result = result << 16; refcount = kref_read(&io_req->refcount); - QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, "%d:0:%d:%d: Completing " + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO, "%d:0:%d:%lld: Completing " "sc_cmd=%p result=0x%08x op=0x%02x lba=0x%02x%02x%02x%02x, " "allowed=%d retries=%d refcount=%d.\n", qedf->lport->host->host_no, sc_cmd->device->id, diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index d9d7a86b5f8b..8e2a160490e6 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -2456,8 +2456,8 @@ static int qedf_alloc_bdq(struct qedf_ctx *qedf) } QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, - "BDQ PBL addr=0x%p dma=0x%llx.\n", qedf->bdq_pbl, - qedf->bdq_pbl_dma); + "BDQ PBL addr=0x%p dma=%pad\n", + qedf->bdq_pbl, &qedf->bdq_pbl_dma); /* * Populate BDQ PBL with physical and virtual address of individual diff --git a/drivers/scsi/qedi/qedi_debugfs.c b/drivers/scsi/qedi/qedi_debugfs.c index 955936274241..59417199bf36 100644 --- a/drivers/scsi/qedi/qedi_debugfs.c +++ b/drivers/scsi/qedi/qedi_debugfs.c @@ -14,7 +14,7 @@ #include <linux/debugfs.h> #include <linux/module.h> -int do_not_recover; +int qedi_do_not_recover; static struct dentry *qedi_dbg_root; void @@ -74,22 +74,22 @@ qedi_dbg_exit(void) static ssize_t qedi_dbg_do_not_recover_enable(struct qedi_dbg_ctx *qedi_dbg) { - if (!do_not_recover) - do_not_recover = 1; + if (!qedi_do_not_recover) + qedi_do_not_recover = 1; QEDI_INFO(qedi_dbg, QEDI_LOG_DEBUGFS, "do_not_recover=%d\n", - do_not_recover); + qedi_do_not_recover); return 0; } static ssize_t qedi_dbg_do_not_recover_disable(struct qedi_dbg_ctx *qedi_dbg) { - if (do_not_recover) - do_not_recover = 0; + if (qedi_do_not_recover) + qedi_do_not_recover = 0; QEDI_INFO(qedi_dbg, QEDI_LOG_DEBUGFS, "do_not_recover=%d\n", - do_not_recover); + qedi_do_not_recover); return 0; } @@ -141,7 +141,7 @@ qedi_dbg_do_not_recover_cmd_read(struct file *filp, char __user *buffer, if (*ppos) return 0; - cnt = sprintf(buffer, "do_not_recover=%d\n", do_not_recover); + cnt = sprintf(buffer, "do_not_recover=%d\n", qedi_do_not_recover); cnt = min_t(int, count, cnt - *ppos); *ppos += cnt; return cnt; diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c index c9f0ef4e11b3..2bce3efc66a4 100644 --- a/drivers/scsi/qedi/qedi_fw.c +++ b/drivers/scsi/qedi/qedi_fw.c @@ -1461,9 +1461,9 @@ static void qedi_tmf_work(struct work_struct *work) get_itt(tmf_hdr->rtt), get_itt(ctask->itt), cmd->task_id, qedi_conn->iscsi_conn_id); - if (do_not_recover) { + if (qedi_do_not_recover) { QEDI_ERR(&qedi->dbg_ctx, "DONT SEND CLEANUP/ABORT %d\n", - do_not_recover); + qedi_do_not_recover); goto abort_ret; } diff --git a/drivers/scsi/qedi/qedi_gbl.h b/drivers/scsi/qedi/qedi_gbl.h index 8e488de88ece..63d793f46064 100644 --- a/drivers/scsi/qedi/qedi_gbl.h +++ b/drivers/scsi/qedi/qedi_gbl.h @@ -12,8 +12,14 @@ #include "qedi_iscsi.h" +#ifdef CONFIG_DEBUG_FS +extern int qedi_do_not_recover; +#else +#define qedi_do_not_recover (0) +#endif + extern uint qedi_io_tracing; -extern int do_not_recover; + extern struct scsi_host_template qedi_host_template; extern struct iscsi_transport qedi_iscsi_transport; extern const struct qed_iscsi_ops *qedi_ops; diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index b9f79d36142d..4cc474364c50 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -833,7 +833,7 @@ qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, return ERR_PTR(ret); } - if (do_not_recover) { + if (qedi_do_not_recover) { ret = -ENOMEM; return ERR_PTR(ret); } @@ -957,7 +957,7 @@ static int qedi_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) struct qedi_endpoint *qedi_ep; int ret = 0; - if (do_not_recover) + if (qedi_do_not_recover) return 1; qedi_ep = ep->dd_data; @@ -1025,7 +1025,7 @@ static void qedi_ep_disconnect(struct iscsi_endpoint *ep) } if (test_bit(QEDI_IN_RECOVERY, &qedi->flags)) { - if (do_not_recover) { + if (qedi_do_not_recover) { QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO, "Do not recover cid=0x%x\n", qedi_ep->iscsi_cid); @@ -1039,7 +1039,7 @@ static void qedi_ep_disconnect(struct iscsi_endpoint *ep) } } - if (do_not_recover) + if (qedi_do_not_recover) goto ep_exit_recover; switch (qedi_ep->state) { diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index 5eda21d903e9..8e3d92807cb8 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -1805,7 +1805,7 @@ static int __qedi_probe(struct pci_dev *pdev, int mode) */ qedi_ops->common->update_pf_params(qedi->cdev, &qedi->pf_params); - qedi_setup_int(qedi); + rc = qedi_setup_int(qedi); if (rc) goto stop_iscsi_func; diff --git a/drivers/scsi/qla2xxx/qla_dbg.c b/drivers/scsi/qla2xxx/qla_dbg.c index 21d9fb7fc887..51b4179469d1 100644 --- a/drivers/scsi/qla2xxx/qla_dbg.c +++ b/drivers/scsi/qla2xxx/qla_dbg.c @@ -2707,13 +2707,9 @@ ql_dump_buffer(uint32_t level, scsi_qla_host_t *vha, int32_t id, "%-+5d 0 1 2 3 4 5 6 7 8 9 A B C D E F\n", size); ql_dbg(level, vha, id, "----- -----------------------------------------------\n"); - for (cnt = 0; cnt < size; cnt++, buf++) { - if (cnt % 16 == 0) - ql_dbg(level, vha, id, "%04x:", cnt & ~0xFU); - printk(" %02x", *buf); - if (cnt % 16 == 15) - printk("\n"); + for (cnt = 0; cnt < size; cnt += 16) { + ql_dbg(level, vha, id, "%04x: ", cnt); + print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, + buf + cnt, min(16U, size - cnt), false); } - if (cnt % 16 != 0) - printk("\n"); } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ba2286652ff6..19125d72f322 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2932,6 +2932,8 @@ EXPORT_SYMBOL(scsi_target_resume); /** * scsi_internal_device_block - internal function to put a device temporarily into the SDEV_BLOCK state * @sdev: device to block + * @wait: Whether or not to wait until ongoing .queuecommand() / + * .queue_rq() calls have finished. * * Block request made by scsi lld's to temporarily stop all * scsi commands on the specified device. May sleep. @@ -2949,7 +2951,7 @@ EXPORT_SYMBOL(scsi_target_resume); * remove the rport mutex lock and unlock calls from srp_queuecommand(). */ int -scsi_internal_device_block(struct scsi_device *sdev) +scsi_internal_device_block(struct scsi_device *sdev, bool wait) { struct request_queue *q = sdev->request_queue; unsigned long flags; @@ -2969,12 +2971,16 @@ scsi_internal_device_block(struct scsi_device *sdev) * request queue. */ if (q->mq_ops) { - blk_mq_quiesce_queue(q); + if (wait) + blk_mq_quiesce_queue(q); + else + blk_mq_stop_hw_queues(q); } else { spin_lock_irqsave(q->queue_lock, flags); blk_stop_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); - scsi_wait_for_queuecommand(sdev); + if (wait) + scsi_wait_for_queuecommand(sdev); } return 0; @@ -3036,7 +3042,7 @@ EXPORT_SYMBOL_GPL(scsi_internal_device_unblock); static void device_block(struct scsi_device *sdev, void *data) { - scsi_internal_device_block(sdev); + scsi_internal_device_block(sdev, true); } static int diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h index 99bfc985e190..f11bd102d6d5 100644 --- a/drivers/scsi/scsi_priv.h +++ b/drivers/scsi/scsi_priv.h @@ -188,8 +188,5 @@ static inline void scsi_dh_remove_device(struct scsi_device *sdev) { } */ #define SCSI_DEVICE_BLOCK_MAX_TIMEOUT 600 /* units in seconds */ -extern int scsi_internal_device_block(struct scsi_device *sdev); -extern int scsi_internal_device_unblock(struct scsi_device *sdev, - enum scsi_device_state new_state); #endif /* _SCSI_PRIV_H */ diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index d277e8620e3e..fcfeddc79331 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1783,6 +1783,8 @@ static int sd_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt); + unsigned int sector_size = SCpnt->device->sector_size; + unsigned int resid; struct scsi_sense_hdr sshdr; struct scsi_disk *sdkp = scsi_disk(SCpnt->request->rq_disk); struct request *req = SCpnt->request; @@ -1813,6 +1815,21 @@ static int sd_done(struct scsi_cmnd *SCpnt) scsi_set_resid(SCpnt, blk_rq_bytes(req)); } break; + default: + /* + * In case of bogus fw or device, we could end up having + * an unaligned partial completion. Check this here and force + * alignment. + */ + resid = scsi_get_resid(SCpnt); + if (resid & (sector_size - 1)) { + sd_printk(KERN_INFO, sdkp, + "Unaligned partial completion (resid=%u, sector_sz=%u)\n", + resid, sector_size); + resid = min(scsi_bufflen(SCpnt), + round_up(resid, sector_size)); + scsi_set_resid(SCpnt, resid); + } } if (result) { diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 638e5f427c90..016639d7fef1 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -400,8 +400,6 @@ MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels") */ static int storvsc_timeout = 180; -static int msft_blist_flags = BLIST_TRY_VPD_PAGES; - #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) static struct scsi_transport_template *fc_transport_template; #endif @@ -1383,6 +1381,22 @@ static int storvsc_do_io(struct hv_device *device, return ret; } +static int storvsc_device_alloc(struct scsi_device *sdevice) +{ + /* + * Set blist flag to permit the reading of the VPD pages even when + * the target may claim SPC-2 compliance. MSFT targets currently + * claim SPC-2 compliance while they implement post SPC-2 features. + * With this flag we can correctly handle WRITE_SAME_16 issues. + * + * Hypervisor reports SCSI_UNKNOWN type for DVD ROM device but + * still supports REPORT LUN. + */ + sdevice->sdev_bflags = BLIST_REPORTLUN2 | BLIST_TRY_VPD_PAGES; + + return 0; +} + static int storvsc_device_configure(struct scsi_device *sdevice) { @@ -1396,14 +1410,6 @@ static int storvsc_device_configure(struct scsi_device *sdevice) sdevice->no_write_same = 1; /* - * Add blist flags to permit the reading of the VPD pages even when - * the target may claim SPC-2 compliance. MSFT targets currently - * claim SPC-2 compliance while they implement post SPC-2 features. - * With this patch we can correctly handle WRITE_SAME_16 issues. - */ - sdevice->sdev_bflags |= msft_blist_flags; - - /* * If the host is WIN8 or WIN8 R2, claim conformance to SPC-3 * if the device is a MSFT virtual device. If the host is * WIN10 or newer, allow write_same. @@ -1661,6 +1667,7 @@ static struct scsi_host_template scsi_driver = { .eh_host_reset_handler = storvsc_host_reset_handler, .proc_name = "storvsc_host", .eh_timed_out = storvsc_eh_timed_out, + .slave_alloc = storvsc_device_alloc, .slave_configure = storvsc_device_configure, .cmd_per_lun = 255, .this_id = -1, diff --git a/drivers/scsi/ufs/ufs.h b/drivers/scsi/ufs/ufs.h index 318e4a1f76c9..54deeb754db5 100644 --- a/drivers/scsi/ufs/ufs.h +++ b/drivers/scsi/ufs/ufs.h @@ -146,7 +146,7 @@ enum attr_idn { /* Descriptor idn for Query requests */ enum desc_idn { QUERY_DESC_IDN_DEVICE = 0x0, - QUERY_DESC_IDN_CONFIGURAION = 0x1, + QUERY_DESC_IDN_CONFIGURATION = 0x1, QUERY_DESC_IDN_UNIT = 0x2, QUERY_DESC_IDN_RFU_0 = 0x3, QUERY_DESC_IDN_INTERCONNECT = 0x4, @@ -162,19 +162,13 @@ enum desc_header_offset { QUERY_DESC_DESC_TYPE_OFFSET = 0x01, }; -enum ufs_desc_max_size { - QUERY_DESC_DEVICE_MAX_SIZE = 0x40, - QUERY_DESC_CONFIGURAION_MAX_SIZE = 0x90, - QUERY_DESC_UNIT_MAX_SIZE = 0x23, - QUERY_DESC_INTERCONNECT_MAX_SIZE = 0x06, - /* - * Max. 126 UNICODE characters (2 bytes per character) plus 2 bytes - * of descriptor header. - */ - QUERY_DESC_STRING_MAX_SIZE = 0xFE, - QUERY_DESC_GEOMETRY_MAX_SIZE = 0x44, - QUERY_DESC_POWER_MAX_SIZE = 0x62, - QUERY_DESC_RFU_MAX_SIZE = 0x00, +enum ufs_desc_def_size { + QUERY_DESC_DEVICE_DEF_SIZE = 0x40, + QUERY_DESC_CONFIGURATION_DEF_SIZE = 0x90, + QUERY_DESC_UNIT_DEF_SIZE = 0x23, + QUERY_DESC_INTERCONNECT_DEF_SIZE = 0x06, + QUERY_DESC_GEOMETRY_DEF_SIZE = 0x44, + QUERY_DESC_POWER_DEF_SIZE = 0x62, }; /* Unit descriptor parameters offsets in bytes*/ diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index dc6efbd1be8e..1359913bf840 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -100,19 +100,6 @@ #define ufshcd_hex_dump(prefix_str, buf, len) \ print_hex_dump(KERN_ERR, prefix_str, DUMP_PREFIX_OFFSET, 16, 4, buf, len, false) -static u32 ufs_query_desc_max_size[] = { - QUERY_DESC_DEVICE_MAX_SIZE, - QUERY_DESC_CONFIGURAION_MAX_SIZE, - QUERY_DESC_UNIT_MAX_SIZE, - QUERY_DESC_RFU_MAX_SIZE, - QUERY_DESC_INTERCONNECT_MAX_SIZE, - QUERY_DESC_STRING_MAX_SIZE, - QUERY_DESC_RFU_MAX_SIZE, - QUERY_DESC_GEOMETRY_MAX_SIZE, - QUERY_DESC_POWER_MAX_SIZE, - QUERY_DESC_RFU_MAX_SIZE, -}; - enum { UFSHCD_MAX_CHANNEL = 0, UFSHCD_MAX_ID = 1, @@ -2857,7 +2844,7 @@ static int __ufshcd_query_descriptor(struct ufs_hba *hba, goto out; } - if (*buf_len <= QUERY_DESC_MIN_SIZE || *buf_len > QUERY_DESC_MAX_SIZE) { + if (*buf_len < QUERY_DESC_MIN_SIZE || *buf_len > QUERY_DESC_MAX_SIZE) { dev_err(hba->dev, "%s: descriptor buffer size (%d) is out of range\n", __func__, *buf_len); err = -EINVAL; @@ -2938,6 +2925,92 @@ static int ufshcd_query_descriptor_retry(struct ufs_hba *hba, } /** + * ufshcd_read_desc_length - read the specified descriptor length from header + * @hba: Pointer to adapter instance + * @desc_id: descriptor idn value + * @desc_index: descriptor index + * @desc_length: pointer to variable to read the length of descriptor + * + * Return 0 in case of success, non-zero otherwise + */ +static int ufshcd_read_desc_length(struct ufs_hba *hba, + enum desc_idn desc_id, + int desc_index, + int *desc_length) +{ + int ret; + u8 header[QUERY_DESC_HDR_SIZE]; + int header_len = QUERY_DESC_HDR_SIZE; + + if (desc_id >= QUERY_DESC_IDN_MAX) + return -EINVAL; + + ret = ufshcd_query_descriptor_retry(hba, UPIU_QUERY_OPCODE_READ_DESC, + desc_id, desc_index, 0, header, + &header_len); + + if (ret) { + dev_err(hba->dev, "%s: Failed to get descriptor header id %d", + __func__, desc_id); + return ret; + } else if (desc_id != header[QUERY_DESC_DESC_TYPE_OFFSET]) { + dev_warn(hba->dev, "%s: descriptor header id %d and desc_id %d mismatch", + __func__, header[QUERY_DESC_DESC_TYPE_OFFSET], + desc_id); + ret = -EINVAL; + } + + *desc_length = header[QUERY_DESC_LENGTH_OFFSET]; + return ret; + +} + +/** + * ufshcd_map_desc_id_to_length - map descriptor IDN to its length + * @hba: Pointer to adapter instance + * @desc_id: descriptor idn value + * @desc_len: mapped desc length (out) + * + * Return 0 in case of success, non-zero otherwise + */ +int ufshcd_map_desc_id_to_length(struct ufs_hba *hba, + enum desc_idn desc_id, int *desc_len) +{ + switch (desc_id) { + case QUERY_DESC_IDN_DEVICE: + *desc_len = hba->desc_size.dev_desc; + break; + case QUERY_DESC_IDN_POWER: + *desc_len = hba->desc_size.pwr_desc; + break; + case QUERY_DESC_IDN_GEOMETRY: + *desc_len = hba->desc_size.geom_desc; + break; + case QUERY_DESC_IDN_CONFIGURATION: + *desc_len = hba->desc_size.conf_desc; + break; + case QUERY_DESC_IDN_UNIT: + *desc_len = hba->desc_size.unit_desc; + break; + case QUERY_DESC_IDN_INTERCONNECT: + *desc_len = hba->desc_size.interc_desc; + break; + case QUERY_DESC_IDN_STRING: + *desc_len = QUERY_DESC_MAX_SIZE; + break; + case QUERY_DESC_IDN_RFU_0: + case QUERY_DESC_IDN_RFU_1: + *desc_len = 0; + break; + default: + *desc_len = 0; + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(ufshcd_map_desc_id_to_length); + +/** * ufshcd_read_desc_param - read the specified descriptor parameter * @hba: Pointer to adapter instance * @desc_id: descriptor idn value @@ -2951,42 +3024,49 @@ static int ufshcd_query_descriptor_retry(struct ufs_hba *hba, static int ufshcd_read_desc_param(struct ufs_hba *hba, enum desc_idn desc_id, int desc_index, - u32 param_offset, + u8 param_offset, u8 *param_read_buf, - u32 param_size) + u8 param_size) { int ret; u8 *desc_buf; - u32 buff_len; + int buff_len; bool is_kmalloc = true; - /* safety checks */ - if (desc_id >= QUERY_DESC_IDN_MAX) + /* Safety check */ + if (desc_id >= QUERY_DESC_IDN_MAX || !param_size) return -EINVAL; - buff_len = ufs_query_desc_max_size[desc_id]; - if ((param_offset + param_size) > buff_len) - return -EINVAL; + /* Get the max length of descriptor from structure filled up at probe + * time. + */ + ret = ufshcd_map_desc_id_to_length(hba, desc_id, &buff_len); - if (!param_offset && (param_size == buff_len)) { - /* memory space already available to hold full descriptor */ - desc_buf = param_read_buf; - is_kmalloc = false; - } else { - /* allocate memory to hold full descriptor */ + /* Sanity checks */ + if (ret || !buff_len) { + dev_err(hba->dev, "%s: Failed to get full descriptor length", + __func__); + return ret; + } + + /* Check whether we need temp memory */ + if (param_offset != 0 || param_size < buff_len) { desc_buf = kmalloc(buff_len, GFP_KERNEL); if (!desc_buf) return -ENOMEM; + } else { + desc_buf = param_read_buf; + is_kmalloc = false; } + /* Request for full descriptor */ ret = ufshcd_query_descriptor_retry(hba, UPIU_QUERY_OPCODE_READ_DESC, - desc_id, desc_index, 0, desc_buf, - &buff_len); + desc_id, desc_index, 0, + desc_buf, &buff_len); if (ret) { dev_err(hba->dev, "%s: Failed reading descriptor. desc_id %d, desc_index %d, param_offset %d, ret %d", __func__, desc_id, desc_index, param_offset, ret); - goto out; } @@ -2998,25 +3078,9 @@ static int ufshcd_read_desc_param(struct ufs_hba *hba, goto out; } - /* - * While reading variable size descriptors (like string descriptor), - * some UFS devices may report the "LENGTH" (field in "Transaction - * Specific fields" of Query Response UPIU) same as what was requested - * in Query Request UPIU instead of reporting the actual size of the - * variable size descriptor. - * Although it's safe to ignore the "LENGTH" field for variable size - * descriptors as we can always derive the length of the descriptor from - * the descriptor header fields. Hence this change impose the length - * match check only for fixed size descriptors (for which we always - * request the correct size as part of Query Request UPIU). - */ - if ((desc_id != QUERY_DESC_IDN_STRING) && - (buff_len != desc_buf[QUERY_DESC_LENGTH_OFFSET])) { - dev_err(hba->dev, "%s: desc_buf length mismatch: buff_len %d, buff_len(desc_header) %d", - __func__, buff_len, desc_buf[QUERY_DESC_LENGTH_OFFSET]); - ret = -EINVAL; - goto out; - } + /* Check wherher we will not copy more data, than available */ + if (is_kmalloc && param_size > buff_len) + param_size = buff_len; if (is_kmalloc) memcpy(param_read_buf, &desc_buf[param_offset], param_size); @@ -5919,8 +5983,8 @@ static int ufshcd_set_icc_levels_attr(struct ufs_hba *hba, u32 icc_level) static void ufshcd_init_icc_levels(struct ufs_hba *hba) { int ret; - int buff_len = QUERY_DESC_POWER_MAX_SIZE; - u8 desc_buf[QUERY_DESC_POWER_MAX_SIZE]; + int buff_len = hba->desc_size.pwr_desc; + u8 desc_buf[hba->desc_size.pwr_desc]; ret = ufshcd_read_power_desc(hba, desc_buf, buff_len); if (ret) { @@ -6017,11 +6081,10 @@ static int ufs_get_device_desc(struct ufs_hba *hba, { int err; u8 model_index; - u8 str_desc_buf[QUERY_DESC_STRING_MAX_SIZE + 1] = {0}; - u8 desc_buf[QUERY_DESC_DEVICE_MAX_SIZE]; + u8 str_desc_buf[QUERY_DESC_MAX_SIZE + 1] = {0}; + u8 desc_buf[hba->desc_size.dev_desc]; - err = ufshcd_read_device_desc(hba, desc_buf, - QUERY_DESC_DEVICE_MAX_SIZE); + err = ufshcd_read_device_desc(hba, desc_buf, hba->desc_size.dev_desc); if (err) { dev_err(hba->dev, "%s: Failed reading Device Desc. err = %d\n", __func__, err); @@ -6038,14 +6101,14 @@ static int ufs_get_device_desc(struct ufs_hba *hba, model_index = desc_buf[DEVICE_DESC_PARAM_PRDCT_NAME]; err = ufshcd_read_string_desc(hba, model_index, str_desc_buf, - QUERY_DESC_STRING_MAX_SIZE, ASCII_STD); + QUERY_DESC_MAX_SIZE, ASCII_STD); if (err) { dev_err(hba->dev, "%s: Failed reading Product Name. err = %d\n", __func__, err); goto out; } - str_desc_buf[QUERY_DESC_STRING_MAX_SIZE] = '\0'; + str_desc_buf[QUERY_DESC_MAX_SIZE] = '\0'; strlcpy(dev_desc->model, (str_desc_buf + QUERY_DESC_HDR_SIZE), min_t(u8, str_desc_buf[QUERY_DESC_LENGTH_OFFSET], MAX_MODEL_LEN)); @@ -6251,6 +6314,51 @@ static void ufshcd_clear_dbg_ufs_stats(struct ufs_hba *hba) hba->req_abort_count = 0; } +static void ufshcd_init_desc_sizes(struct ufs_hba *hba) +{ + int err; + + err = ufshcd_read_desc_length(hba, QUERY_DESC_IDN_DEVICE, 0, + &hba->desc_size.dev_desc); + if (err) + hba->desc_size.dev_desc = QUERY_DESC_DEVICE_DEF_SIZE; + + err = ufshcd_read_desc_length(hba, QUERY_DESC_IDN_POWER, 0, + &hba->desc_size.pwr_desc); + if (err) + hba->desc_size.pwr_desc = QUERY_DESC_POWER_DEF_SIZE; + + err = ufshcd_read_desc_length(hba, QUERY_DESC_IDN_INTERCONNECT, 0, + &hba->desc_size.interc_desc); + if (err) + hba->desc_size.interc_desc = QUERY_DESC_INTERCONNECT_DEF_SIZE; + + err = ufshcd_read_desc_length(hba, QUERY_DESC_IDN_CONFIGURATION, 0, + &hba->desc_size.conf_desc); + if (err) + hba->desc_size.conf_desc = QUERY_DESC_CONFIGURATION_DEF_SIZE; + + err = ufshcd_read_desc_length(hba, QUERY_DESC_IDN_UNIT, 0, + &hba->desc_size.unit_desc); + if (err) + hba->desc_size.unit_desc = QUERY_DESC_UNIT_DEF_SIZE; + + err = ufshcd_read_desc_length(hba, QUERY_DESC_IDN_GEOMETRY, 0, + &hba->desc_size.geom_desc); + if (err) + hba->desc_size.geom_desc = QUERY_DESC_GEOMETRY_DEF_SIZE; +} + +static void ufshcd_def_desc_sizes(struct ufs_hba *hba) +{ + hba->desc_size.dev_desc = QUERY_DESC_DEVICE_DEF_SIZE; + hba->desc_size.pwr_desc = QUERY_DESC_POWER_DEF_SIZE; + hba->desc_size.interc_desc = QUERY_DESC_INTERCONNECT_DEF_SIZE; + hba->desc_size.conf_desc = QUERY_DESC_CONFIGURATION_DEF_SIZE; + hba->desc_size.unit_desc = QUERY_DESC_UNIT_DEF_SIZE; + hba->desc_size.geom_desc = QUERY_DESC_GEOMETRY_DEF_SIZE; +} + /** * ufshcd_probe_hba - probe hba to detect device and initialize * @hba: per-adapter instance @@ -6285,6 +6393,9 @@ static int ufshcd_probe_hba(struct ufs_hba *hba) if (ret) goto out; + /* Init check for device descriptor sizes */ + ufshcd_init_desc_sizes(hba); + ret = ufs_get_device_desc(hba, &card); if (ret) { dev_err(hba->dev, "%s: Failed getting device info. err = %d\n", @@ -6320,6 +6431,7 @@ static int ufshcd_probe_hba(struct ufs_hba *hba) /* set the state as operational after switching to desired gear */ hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL; + /* * If we are in error handling context or in power management callbacks * context, no need to scan the host @@ -7774,6 +7886,9 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) hba->mmio_base = mmio_base; hba->irq = irq; + /* Set descriptor lengths to specification defaults */ + ufshcd_def_desc_sizes(hba); + err = ufshcd_hba_init(hba); if (err) goto out_error; diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 7630600217a2..cdc8bd05f7df 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -220,6 +220,15 @@ struct ufs_dev_cmd { struct ufs_query query; }; +struct ufs_desc_size { + int dev_desc; + int pwr_desc; + int geom_desc; + int interc_desc; + int unit_desc; + int conf_desc; +}; + /** * struct ufs_clk_info - UFS clock related info * @list: list headed by hba->clk_list_head @@ -483,6 +492,7 @@ struct ufs_stats { * @clk_list_head: UFS host controller clocks list node head * @pwr_info: holds current power mode * @max_pwr_info: keeps the device max valid pwm + * @desc_size: descriptor sizes reported by device * @urgent_bkops_lvl: keeps track of urgent bkops level for device * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for * device is known or not. @@ -666,6 +676,7 @@ struct ufs_hba { bool is_urgent_bkops_lvl_checked; struct rw_semaphore clk_scaling_lock; + struct ufs_desc_size desc_size; }; /* Returns true if clocks can be gated. Otherwise false */ @@ -832,6 +843,10 @@ int ufshcd_query_flag(struct ufs_hba *hba, enum query_opcode opcode, enum flag_idn idn, bool *flag_res); int ufshcd_hold(struct ufs_hba *hba, bool async); void ufshcd_release(struct ufs_hba *hba); + +int ufshcd_map_desc_id_to_length(struct ufs_hba *hba, enum desc_idn desc_id, + int *desc_length); + u32 ufshcd_get_local_unipro_ver(struct ufs_hba *hba); /* Wrapper functions for safely calling variant operations */ diff --git a/drivers/scsi/vmw_pvscsi.c b/drivers/scsi/vmw_pvscsi.c index ef474a748744..c374e3b5c678 100644 --- a/drivers/scsi/vmw_pvscsi.c +++ b/drivers/scsi/vmw_pvscsi.c @@ -1487,7 +1487,7 @@ static int pvscsi_probe(struct pci_dev *pdev, const struct pci_device_id *id) irq_flag &= ~PCI_IRQ_MSI; error = pci_alloc_irq_vectors(adapter->dev, 1, 1, irq_flag); - if (error) + if (error < 0) goto out_reset_adapter; adapter->use_req_threshold = pvscsi_setup_req_threshold(adapter, true); diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c index b7b87ecefcdf..9fca8d225ee0 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c @@ -532,7 +532,7 @@ lnet_sock_accept(struct socket **newsockp, struct socket *sock) newsock->ops = sock->ops; - rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + rc = sock->ops->accept(sock, newsock, O_NONBLOCK, false); if (rc == -EAGAIN) { /* Nothing ready, so wait for activity */ init_waitqueue_entry(&wait, current); @@ -540,7 +540,7 @@ lnet_sock_accept(struct socket **newsockp, struct socket *sock) set_current_state(TASK_INTERRUPTIBLE); schedule(); remove_wait_queue(sk_sleep(sock->sk), &wait); - rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + rc = sock->ops->accept(sock, newsock, O_NONBLOCK, false); } if (rc) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 7d398d300e97..9382db998ec9 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -743,7 +743,7 @@ static int tcp_accept_from_sock(struct connection *con) newsock->type = con->sock->type; newsock->ops = con->sock->ops; - result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK); + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK, true); if (result < 0) goto accept_err; diff --git a/fs/exec.c b/fs/exec.c index 65145a3df065..72934df68471 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); + arch_setup_new_exec(); perf_event_exec(); __set_task_comm(current, kbasename(bprm->filename), true); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ef600591d96f..63ee2940775c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -173,19 +173,33 @@ static void wb_wakeup(struct bdi_writeback *wb) spin_unlock_bh(&wb->work_lock); } +static void finish_writeback_work(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + struct wb_completion *done = work->done; + + if (work->auto_free) + kfree(work); + if (done && atomic_dec_and_test(&done->cnt)) + wake_up_all(&wb->bdi->wb_waitq); +} + static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { trace_writeback_queue(wb, work); - spin_lock_bh(&wb->work_lock); - if (!test_bit(WB_registered, &wb->state)) - goto out_unlock; if (work->done) atomic_inc(&work->done->cnt); - list_add_tail(&work->list, &wb->work_list); - mod_delayed_work(bdi_wq, &wb->dwork, 0); -out_unlock: + + spin_lock_bh(&wb->work_lock); + + if (test_bit(WB_registered, &wb->state)) { + list_add_tail(&work->list, &wb->work_list); + mod_delayed_work(bdi_wq, &wb->dwork, 0); + } else + finish_writeback_work(wb, work); + spin_unlock_bh(&wb->work_lock); } @@ -1873,16 +1887,9 @@ static long wb_do_writeback(struct bdi_writeback *wb) set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { - struct wb_completion *done = work->done; - trace_writeback_exec(wb, work); - wrote += wb_writeback(wb, work); - - if (work->auto_free) - kfree(work); - if (done && atomic_dec_and_test(&done->cnt)) - wake_up_all(&wb->bdi->wb_waitq); + finish_writeback_work(wb, work); } /* diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c45084ac642d..511e1ed7e2de 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -207,7 +207,7 @@ struct lm_lockname { struct gfs2_sbd *ln_sbd; u64 ln_number; unsigned int ln_type; -}; +} __packed __aligned(sizeof(int)); #define lm_name_equal(name1, name2) \ (((name1)->ln_number == (name2)->ln_number) && \ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 4348027384f5..d0ab7e56d0b4 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1863,7 +1863,7 @@ static int o2net_accept_one(struct socket *sock, int *more) new_sock->type = sock->type; new_sock->ops = sock->ops; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, false); if (ret < 0) goto out; diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index d04547fcf274..eb00bc133bca 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -125,6 +125,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); +extern int xfs_dir2_sf_verify(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *sfp, + int size); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index c6809ff41197..96b45cd6c63f 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -629,6 +629,93 @@ xfs_dir2_sf_check( } #endif /* DEBUG */ +/* Verify the consistency of an inline directory. */ +int +xfs_dir2_sf_verify( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int size) +{ + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next_sfep; + char *endp; + const struct xfs_dir_ops *dops; + xfs_ino_t ino; + int i; + int i8count; + int offset; + __uint8_t filetype; + + dops = xfs_dir_get_ops(mp, NULL); + + /* + * Give up if the directory is way too short. + */ + XFS_WANT_CORRUPTED_RETURN(mp, size > + offsetof(struct xfs_dir2_sf_hdr, parent)); + XFS_WANT_CORRUPTED_RETURN(mp, size >= + xfs_dir2_sf_hdr_size(sfp->i8count)); + + endp = (char *)sfp + size; + + /* Check .. entry */ + ino = dops->sf_get_parent_ino(sfp); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + offset = dops->data_first_offset; + + /* Check all reported entries */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + /* + * struct xfs_dir2_sf_entry has a variable length. + * Check the fixed-offset parts of the structure are + * within the data buffer. + */ + XFS_WANT_CORRUPTED_RETURN(mp, + ((char *)sfep + sizeof(*sfep)) < endp); + + /* Don't allow names with known bad length. */ + XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen > 0); + XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen < MAXNAMELEN); + + /* + * Check that the variable-length part of the structure is + * within the data buffer. The next entry starts after the + * name component, so nextentry is an acceptable test. + */ + next_sfep = dops->sf_nextentry(sfp, sfep); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= (char *)next_sfep); + + /* Check that the offsets always increase. */ + XFS_WANT_CORRUPTED_RETURN(mp, + xfs_dir2_sf_get_offset(sfep) >= offset); + + /* Check the inode number. */ + ino = dops->sf_get_ino(sfp, sfep); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + + /* Check the file type. */ + filetype = dops->sf_get_ftype(sfep); + XFS_WANT_CORRUPTED_RETURN(mp, filetype < XFS_DIR3_FT_MAX); + + offset = xfs_dir2_sf_get_offset(sfep) + + dops->data_entsize(sfep->namelen); + + sfep = next_sfep; + } + XFS_WANT_CORRUPTED_RETURN(mp, i8count == sfp->i8count); + XFS_WANT_CORRUPTED_RETURN(mp, (void *)sfep == (void *)endp); + + /* Make sure this whole thing ought to be in local format. */ + XFS_WANT_CORRUPTED_RETURN(mp, offset + + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dir_geo->blksize); + + return 0; +} + /* * Create a new (shortform) directory. */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 25c1e078aef6..9653e964eda4 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -33,6 +33,8 @@ #include "xfs_trace.h" #include "xfs_attr_sf.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" kmem_zone_t *xfs_ifork_zone; @@ -320,6 +322,7 @@ xfs_iformat_local( int whichfork, int size) { + int error; /* * If the size is unreasonable, then something @@ -336,6 +339,14 @@ xfs_iformat_local( return -EFSCORRUPTED; } + if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { + error = xfs_dir2_sf_verify(ip->i_mount, + (struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dip), + size); + if (error) + return error; + } + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); return 0; } @@ -856,7 +867,7 @@ xfs_iextents_copy( * In these cases, the format always takes precedence, because the * format indicates the current state of the fork. */ -void +int xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, @@ -866,6 +877,7 @@ xfs_iflush_fork( char *cp; xfs_ifork_t *ifp; xfs_mount_t *mp; + int error; static const short brootflag[2] = { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; static const short dataflag[2] = @@ -874,7 +886,7 @@ xfs_iflush_fork( { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; if (!iip) - return; + return 0; ifp = XFS_IFORK_PTR(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, @@ -882,12 +894,19 @@ xfs_iflush_fork( */ if (!ifp) { ASSERT(whichfork == XFS_ATTR_FORK); - return; + return 0; } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_LOCAL: + if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { + error = xfs_dir2_sf_verify(mp, + (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data, + ifp->if_bytes); + if (error) + return error; + } if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); @@ -940,6 +959,7 @@ xfs_iflush_fork( ASSERT(0); break; } + return 0; } /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7fb8365326d1..132dc59fdde6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -140,7 +140,7 @@ typedef struct xfs_ifork { struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); -void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, +int xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_inode *, int); void xfs_idata_realloc(struct xfs_inode *, int, int); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 003a99b83bd8..ad9396e516f6 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -71,22 +71,11 @@ xfs_dir2_sf_getdents( struct xfs_da_geometry *geo = args->geo; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Give up if the directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) - return -EFSCORRUPTED; - /* * If the block number in the offset is out of range, we're done. */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7eaf1ef74e3c..c7fe2c2123ab 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3475,6 +3475,7 @@ xfs_iflush_int( struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); @@ -3557,9 +3558,14 @@ xfs_iflush_int( if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); - if (XFS_IFORK_Q(ip)) - xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + error = xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); + if (error) + return error; + if (XFS_IFORK_Q(ip)) { + error = xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + if (error) + return error; + } xfs_inobp_check(mp, bp); /* diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index cc5d9a1405df..41e5b6784b97 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -32,10 +32,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* by default, allow everything */ return true; } - -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} #endif /* _ASM_GENERIC_MM_HOOKS_H */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 1fad160f35de..7dfa767dc680 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -341,6 +341,31 @@ static inline int pte_unused(pte_t pte) } #endif +#ifndef pte_access_permitted +#define pte_access_permitted(pte, write) \ + (pte_present(pte) && (!(write) || pte_write(pte))) +#endif + +#ifndef pmd_access_permitted +#define pmd_access_permitted(pmd, write) \ + (pmd_present(pmd) && (!(write) || pmd_write(pmd))) +#endif + +#ifndef pud_access_permitted +#define pud_access_permitted(pud, write) \ + (pud_present(pud) && (!(write) || pud_write(pud))) +#endif + +#ifndef p4d_access_permitted +#define p4d_access_permitted(p4d, write) \ + (p4d_present(p4d) && (!(write) || p4d_write(p4d))) +#endif + +#ifndef pgd_access_permitted +#define pgd_access_permitted(pgd, write) \ + (pgd_present(pgd) && (!(write) || pgd_write(pgd))) +#endif + #ifndef __HAVE_ARCH_PMD_SAME #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index a2bfd7843f18..e2b9c6fe2714 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -73,7 +73,7 @@ int af_alg_unregister_type(const struct af_alg_type *type); int af_alg_release(struct socket *sock); void af_alg_release_parent(struct sock *sk); -int af_alg_accept(struct sock *sk, struct socket *newsock); +int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern); int af_alg_make_sg(struct af_alg_sgl *sgl, struct iov_iter *iter, int len); void af_alg_free_sg(struct af_alg_sgl *sgl); diff --git a/include/linux/compat.h b/include/linux/compat.h index aef47be2a5c1..af9dbc44fd92 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -723,6 +723,8 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32, int, const char __user *); +asmlinkage long compat_sys_arch_prctl(int option, unsigned long arg2); + /* * For most but not all architectures, "am I in a compat syscall?" and * "am I a compat task?" are the same question. For architectures on which diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 61d042bbbf60..68449293c4b6 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -163,6 +163,7 @@ struct dccp_request_sock { __u64 dreq_isr; __u64 dreq_gsr; __be32 dreq_service; + spinlock_t dreq_lock; struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; diff --git a/include/linux/filter.h b/include/linux/filter.h index 0c167fdee5f7..fbf7b39e8103 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -409,6 +409,7 @@ struct bpf_prog { u16 pages; /* Number of allocated pages */ kmemcheck_bitfield_begin(meta); u16 jited:1, /* Is our filter JIT'ed? */ + locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ @@ -554,22 +555,29 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) #ifdef CONFIG_ARCH_HAS_SET_MEMORY static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { - set_memory_ro((unsigned long)fp, fp->pages); + fp->locked = 1; + WARN_ON_ONCE(set_memory_ro((unsigned long)fp, fp->pages)); } static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { - set_memory_rw((unsigned long)fp, fp->pages); + if (fp->locked) { + WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages)); + /* In case set_memory_rw() fails, we want to be the first + * to crash here instead of some random place later on. + */ + fp->locked = 0; + } } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { - set_memory_ro((unsigned long)hdr, hdr->pages); + WARN_ON_ONCE(set_memory_ro((unsigned long)hdr, hdr->pages)); } static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) { - set_memory_rw((unsigned long)hdr, hdr->pages); + WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); } #else static inline void bpf_prog_lock_ro(struct bpf_prog *fp) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 672cfef72fc8..97cbca19430d 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -373,6 +373,8 @@ #define ICC_IGRPEN0_EL1_MASK (1 << ICC_IGRPEN0_EL1_SHIFT) #define ICC_IGRPEN1_EL1_SHIFT 0 #define ICC_IGRPEN1_EL1_MASK (1 << ICC_IGRPEN1_EL1_SHIFT) +#define ICC_SRE_EL1_DIB (1U << 2) +#define ICC_SRE_EL1_DFB (1U << 1) #define ICC_SRE_EL1_SRE (1U << 0) /* diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 188eced6813e..9f3616085423 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -524,6 +524,10 @@ static inline struct irq_domain *irq_find_matching_fwnode( { return NULL; } +static inline bool irq_domain_check_msi_remap(void) +{ + return false; +} #endif /* !CONFIG_IRQ_DOMAIN */ #endif /* _LINUX_IRQDOMAIN_H */ diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index b01fe1009084..87ff4f58a2f0 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -29,6 +29,11 @@ struct hlist_nulls_node { ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_nulls_entry_safe(ptr, type, member) \ + ({ typeof(ptr) ____ptr = (ptr); \ + !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \ + }) /** * ptr_is_a_nulls - Test if a ptr is a nulls * @ptr: ptr to be tested diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f01c88f0800..e197d3ca3e8a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -430,6 +430,10 @@ static inline int pud_devmap(pud_t pud) { return 0; } +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f60f45fe226f..45cdb27791a3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -367,6 +367,11 @@ struct mm_struct { #endif unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* Base adresses for compatible mmap() */ + unsigned long mmap_compat_base; + unsigned long mmap_compat_legacy_base; +#endif unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; diff --git a/include/linux/net.h b/include/linux/net.h index cd0c8bd0a1de..0620f5e18c96 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -146,7 +146,7 @@ struct proto_ops { int (*socketpair)(struct socket *sock1, struct socket *sock2); int (*accept) (struct socket *sock, - struct socket *newsock, int flags); + struct socket *newsock, int flags, bool kern); int (*getname) (struct socket *sock, struct sockaddr *addr, int *sockaddr_len, int peer); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 84943e8057ef..316a19f6b635 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -148,7 +148,7 @@ static inline int page_cache_get_speculative(struct page *page) #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic()); + VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif /* * Preempt must be disabled here - we rely on rcu_read_lock doing @@ -186,7 +186,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) # ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic()); + VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif VM_BUG_ON_PAGE(page_count(page) == 0, page); page_ref_add(page, count); diff --git a/include/linux/phy.h b/include/linux/phy.h index 772476028a65..43a774873aa9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -837,6 +837,10 @@ int genphy_read_status(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); int genphy_soft_reset(struct phy_device *phydev); +static inline int genphy_no_soft_reset(struct phy_device *phydev) +{ + return 0; +} void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); int phy_driver_register(struct phy_driver *new_driver, struct module *owner); diff --git a/include/linux/purgatory.h b/include/linux/purgatory.h new file mode 100644 index 000000000000..d60d4e278609 --- /dev/null +++ b/include/linux/purgatory.h @@ -0,0 +1,23 @@ +#ifndef _LINUX_PURGATORY_H +#define _LINUX_PURGATORY_H + +#include <linux/types.h> +#include <crypto/sha.h> +#include <uapi/linux/kexec.h> + +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +/* + * These forward declarations serve two purposes: + * + * 1) Make sparse happy when checking arch/purgatory + * 2) Document that these are required to be global so the symbol + * lookup in kexec works + */ +extern struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX]; +extern u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE]; + +#endif diff --git a/include/linux/random.h b/include/linux/random.h index 7bd2403e4fef..ed5c3838780d 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -37,14 +37,26 @@ extern void get_random_bytes(void *buf, int nbytes); extern int add_random_ready_callback(struct random_ready_callback *rdy); extern void del_random_ready_callback(struct random_ready_callback *rdy); extern void get_random_bytes_arch(void *buf, int nbytes); -extern int random_int_secret_init(void); #ifndef MODULE extern const struct file_operations random_fops, urandom_fops; #endif -unsigned int get_random_int(void); -unsigned long get_random_long(void); +u32 get_random_u32(void); +u64 get_random_u64(void); +static inline unsigned int get_random_int(void) +{ + return get_random_u32(); +} +static inline unsigned long get_random_long(void) +{ +#if BITS_PER_LONG == 64 + return get_random_u64(); +#else + return get_random_u32(); +#endif +} + unsigned long randomize_page(unsigned long start, unsigned long range); u32 prandom_u32(void); diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 4ae95f7e8597..a23a33153180 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -156,5 +156,19 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) +/** + * hlist_nulls_for_each_entry_safe - + * iterate over list of given type safe against removal of list entry + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_nulls_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_nulls_node within the struct. + */ +#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ + for (({barrier();}), \ + pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ + (!is_a_nulls(pos)) && \ + ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \ + pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });) #endif #endif diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 58373875e8ee..55125d674338 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -101,6 +101,10 @@ static inline void check_object_size(const void *ptr, unsigned long n, { } #endif /* CONFIG_HARDENED_USERCOPY */ +#ifndef arch_setup_new_exec +static inline void arch_setup_new_exec(void) { } +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index b7952d55b9c0..f39ae697347f 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -20,7 +20,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); -int inet_accept(struct socket *sock, struct socket *newsock, int flags); +int inet_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 826f198374f8..c7a577976bec 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -258,7 +258,7 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk, return (unsigned long)min_t(u64, when, max_when); } -struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); +struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern); int inet_csk_get_port(struct sock *sk, unsigned short snum); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index a244db5e5ff7..07a0b128625a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -476,7 +476,8 @@ struct sctp_pf { int (*send_verify) (struct sctp_sock *, union sctp_addr *); int (*supported_addrs)(const struct sctp_sock *, __be16 *); struct sock *(*create_accept_sk) (struct sock *sk, - struct sctp_association *asoc); + struct sctp_association *asoc, + bool kern); int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr); void (*to_sk_saddr)(union sctp_addr *, struct sock *sk); void (*to_sk_daddr)(union sctp_addr *, struct sock *sk); diff --git a/include/net/sock.h b/include/net/sock.h index 5e5997654db6..03252d53975d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -236,6 +236,7 @@ struct sock_common { * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer + * @sk_kern_sock: True if sock is using kernel lock classes * @sk_rcvbuf: size of receive buffer in bytes * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux @@ -430,7 +431,8 @@ struct sock { #endif kmemcheck_bitfield_begin(flags); - unsigned int sk_padding : 2, + unsigned int sk_padding : 1, + sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1, sk_userlocks : 4, @@ -1015,7 +1017,8 @@ struct proto { int addr_len); int (*disconnect)(struct sock *sk, int flags); - struct sock * (*accept)(struct sock *sk, int flags, int *err); + struct sock * (*accept)(struct sock *sk, int flags, int *err, + bool kern); int (*ioctl)(struct sock *sk, int cmd, unsigned long arg); @@ -1573,7 +1576,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); -int sock_no_accept(struct socket *, struct socket *, int); +int sock_no_accept(struct socket *, struct socket *, int, bool); int sock_no_getname(struct socket *, struct sockaddr *, int *, int); unsigned int sock_no_poll(struct file *, struct socket *, struct poll_table_struct *); diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index b0e275de6dec..583875ea136a 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -196,6 +196,7 @@ struct iscsi_conn { struct iscsi_task *task; /* xmit task in progress */ /* xmit */ + spinlock_t taskqueuelock; /* protects the next three lists */ struct list_head mgmtqueue; /* mgmt (control) xmit queue */ struct list_head cmdqueue; /* data-path cmd queue */ struct list_head requeue; /* tasks needing another run */ diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 6f22b39f1b0c..080c7ce9bae8 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -472,6 +472,10 @@ static inline int scsi_device_created(struct scsi_device *sdev) sdev->sdev_state == SDEV_CREATED_BLOCK; } +int scsi_internal_device_block(struct scsi_device *sdev, bool wait); +int scsi_internal_device_unblock(struct scsi_device *sdev, + enum scsi_device_state new_state); + /* accessor functions for the SCSI parameters */ static inline int scsi_device_sync(struct scsi_device *sdev) { diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index bce990f5a35d..31acce9019a6 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -241,21 +241,21 @@ TRACE_EVENT(xen_mmu_set_pud, (int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval) ); -TRACE_EVENT(xen_mmu_set_pgd, - TP_PROTO(pgd_t *pgdp, pgd_t *user_pgdp, pgd_t pgdval), - TP_ARGS(pgdp, user_pgdp, pgdval), +TRACE_EVENT(xen_mmu_set_p4d, + TP_PROTO(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t p4dval), + TP_ARGS(p4dp, user_p4dp, p4dval), TP_STRUCT__entry( - __field(pgd_t *, pgdp) - __field(pgd_t *, user_pgdp) - __field(pgdval_t, pgdval) - ), - TP_fast_assign(__entry->pgdp = pgdp; - __entry->user_pgdp = user_pgdp; - __entry->pgdval = pgdval.pgd), - TP_printk("pgdp %p user_pgdp %p pgdval %0*llx (raw %0*llx)", - __entry->pgdp, __entry->user_pgdp, - (int)sizeof(pgdval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pgdval)), - (int)sizeof(pgdval_t) * 2, (unsigned long long)__entry->pgdval) + __field(p4d_t *, p4dp) + __field(p4d_t *, user_p4dp) + __field(p4dval_t, p4dval) + ), + TP_fast_assign(__entry->p4dp = p4dp; + __entry->user_p4dp = user_p4dp; + __entry->p4dval = p4d_val(p4dval)), + TP_printk("p4dp %p user_p4dp %p p4dval %0*llx (raw %0*llx)", + __entry->p4dp, __entry->user_p4dp, + (int)sizeof(p4dval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->p4dval)), + (int)sizeof(p4dval_t) * 2, (unsigned long long)__entry->p4dval) ); TRACE_EVENT(xen_mmu_pud_clear, diff --git a/include/uapi/linux/packet_diag.h b/include/uapi/linux/packet_diag.h index d08c63f3dd6f..0c5d5dd61b6a 100644 --- a/include/uapi/linux/packet_diag.h +++ b/include/uapi/linux/packet_diag.h @@ -64,7 +64,7 @@ struct packet_diag_mclist { __u32 pdmc_count; __u16 pdmc_type; __u16 pdmc_alen; - __u8 pdmc_addr[MAX_ADDR_LEN]; + __u8 pdmc_addr[32]; /* MAX_ADDR_LEN */ }; struct packet_diag_ring { diff --git a/init/main.c b/init/main.c index eae2f15657c6..f9c9d9948203 100644 --- a/init/main.c +++ b/init/main.c @@ -882,7 +882,6 @@ static void __init do_basic_setup(void) do_ctors(); usermodehelper_enable(); do_initcalls(); - random_int_secret_init(); } static void __init do_pre_smp_initcalls(void) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3ea87fb19a94..afe5bab376c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -13,11 +13,12 @@ #include <linux/bpf.h> #include <linux/jhash.h> #include <linux/filter.h> +#include <linux/rculist_nulls.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" struct bucket { - struct hlist_head head; + struct hlist_nulls_head head; raw_spinlock_t lock; }; @@ -44,9 +45,14 @@ enum extra_elem_state { /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { - struct hlist_node hash_node; - struct bpf_htab *htab; - struct pcpu_freelist_node fnode; + struct hlist_nulls_node hash_node; + struct { + void *padding; + union { + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; + }; + }; }; union { struct rcu_head rcu; @@ -162,7 +168,8 @@ skip_percpu_elems: offsetof(struct htab_elem, lru_node), htab->elem_size, htab->map.max_entries); else - pcpu_freelist_populate(&htab->freelist, htab->elems, + pcpu_freelist_populate(&htab->freelist, + htab->elems + offsetof(struct htab_elem, fnode), htab->elem_size, htab->map.max_entries); return 0; @@ -217,6 +224,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; + BUILD_BUG_ON(offsetof(struct htab_elem, htab) != + offsetof(struct htab_elem, hash_node.pprev)); + BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != + offsetof(struct htab_elem, hash_node.pprev)); + if (lru && !capable(CAP_SYS_ADMIN)) /* LRU implementation is much complicated than other * maps. Hence, limit to CAP_SYS_ADMIN for now. @@ -326,7 +338,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) goto free_htab; for (i = 0; i < htab->n_buckets; i++) { - INIT_HLIST_HEAD(&htab->buckets[i].head); + INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_spin_lock_init(&htab->buckets[i].lock); } @@ -366,20 +378,44 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) return &htab->buckets[hash & (htab->n_buckets - 1)]; } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) { return &__select_bucket(htab, hash)->head; } -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, +/* this lookup function can only be called with bucket lock taken */ +static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, void *key, u32 key_size) { + struct hlist_nulls_node *n; + struct htab_elem *l; + + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + + return NULL; +} + +/* can be called without bucket lock. it will repeat the loop in + * the unlikely event when elements moved from one bucket into another + * while link list is being walked + */ +static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, + u32 hash, void *key, + u32 key_size, u32 n_buckets) +{ + struct hlist_nulls_node *n; struct htab_elem *l; - hlist_for_each_entry_rcu(l, head, hash_node) +again: + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l->hash == hash && !memcmp(&l->key, key, key_size)) return l; + if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) + goto again; + return NULL; } @@ -387,7 +423,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct htab_elem *l; u32 hash, key_size; @@ -400,7 +436,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) head = select_bucket(htab, hash); - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); return l; } @@ -433,8 +469,9 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) { struct bpf_htab *htab = (struct bpf_htab *)arg; - struct htab_elem *l, *tgt_l; - struct hlist_head *head; + struct htab_elem *l = NULL, *tgt_l; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; @@ -444,9 +481,9 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) raw_spin_lock_irqsave(&b->lock, flags); - hlist_for_each_entry_rcu(l, head, hash_node) + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); break; } @@ -459,7 +496,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct htab_elem *l, *next_l; u32 hash, key_size; int i; @@ -473,7 +510,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) head = select_bucket(htab, hash); /* lookup the key */ - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); if (!l) { i = 0; @@ -481,7 +518,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) } /* key was found, get next key in the same bucket */ - next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), struct htab_elem, hash_node); if (next_l) { @@ -500,7 +537,7 @@ find_first_elem: head = select_bucket(htab, i); /* pick first element in the bucket */ - next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), struct htab_elem, hash_node); if (next_l) { /* if it's not empty, just return it */ @@ -582,9 +619,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, int err = 0; if (prealloc) { - l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); - if (!l_new) + struct pcpu_freelist_node *l; + + l = pcpu_freelist_pop(&htab->freelist); + if (!l) err = -E2BIG; + else + l_new = container_of(l, struct htab_elem, fnode); } else { if (atomic_inc_return(&htab->count) > htab->map.max_entries) { atomic_dec(&htab->count); @@ -661,7 +702,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -700,9 +741,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* add new element to the head of the list, so that * concurrent search will find it before old elem */ - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { - hlist_del_rcu(&l_old->hash_node); + hlist_nulls_del_rcu(&l_old->hash_node); free_htab_elem(htab, l_old); } ret = 0; @@ -716,7 +757,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old = NULL; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -757,10 +798,10 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* add new element to the head of the list, so that * concurrent search will find it before old elem */ - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { bpf_lru_node_set_ref(&l_new->lru_node); - hlist_del_rcu(&l_old->hash_node); + hlist_nulls_del_rcu(&l_old->hash_node); } ret = 0; @@ -781,7 +822,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -820,7 +861,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, ret = PTR_ERR(l_new); goto err; } - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); } ret = 0; err: @@ -834,7 +875,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -882,7 +923,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } else { pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } ret = 0; @@ -910,7 +951,7 @@ static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; @@ -930,7 +971,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); if (l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); free_htab_elem(htab, l); ret = 0; } @@ -942,7 +983,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; @@ -962,7 +1003,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); if (l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); ret = 0; } @@ -977,12 +1018,12 @@ static void delete_all_elements(struct bpf_htab *htab) int i; for (i = 0; i < htab->n_buckets; i++) { - struct hlist_head *head = select_bucket(htab, i); - struct hlist_node *n; + struct hlist_nulls_head *head = select_bucket(htab, i); + struct hlist_nulls_node *n; struct htab_elem *l; - hlist_for_each_entry_safe(l, n, head, hash_node) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + hlist_nulls_del_rcu(&l->hash_node); if (l->state != HTAB_EXTRA_ELEM_USED) htab_elem_free(htab, l); } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 8bfe0afaee10..b37bd9ab7f57 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -500,9 +500,15 @@ unlock: raw_spin_unlock(&trie->lock); } +static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + static const struct bpf_map_ops trie_ops = { .map_alloc = trie_alloc, .map_free = trie_free, + .map_get_next_key = trie_get_next_key, .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 56eba9caa632..1dc22f6b49f5 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1329,7 +1329,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct task_struct *task; int count = 0; - seq_printf(seq, "css_set %p\n", cset); + seq_printf(seq, "css_set %pK\n", cset); list_for_each_entry(task, &cset->tasks, cg_list) { if (count++ > MAX_TASKS_SHOWN_PER_CSS) diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index e756dae49300..2237201d66d5 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -229,7 +229,7 @@ static int pids_can_fork(struct task_struct *task) /* Only log the first time events_limit is incremented. */ if (atomic64_inc_return(&pids->events_limit) == 1) { pr_info("cgroup: fork rejected by pids controller in "); - pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); + pr_cont_cgroup_path(css->cgroup); pr_cont("\n"); } cgroup_file_notify(&pids->events_file); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b56a558e406d..b118735fea9d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -614,13 +614,13 @@ static int kexec_calculate_store_digests(struct kimage *image) ret = crypto_shash_final(desc, digest); if (ret) goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "sha_regions", - sha_regions, sha_region_sz, 0); + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", + sha_regions, sha_region_sz, 0); if (ret) goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", - digest, SHA256_DIGEST_SIZE, 0); + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); if (ret) goto out_free_digest; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 4cef7e4706b0..799a8a452187 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,11 +15,7 @@ int kimage_is_destination_range(struct kimage *image, extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE -struct kexec_sha_region { - unsigned long start; - unsigned long len; -}; - +#include <linux/purgatory.h> void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 072cbc9b175d..c0168b7da1ea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1507,6 +1507,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + WARN_ON_ONCE(!wq); WARN_ON_ONCE(timer->function != delayed_work_timer_fn || timer->data != (unsigned long)dwork); WARN_ON_ONCE(timer_pending(timer)); diff --git a/mm/Kconfig b/mm/Kconfig index 9b8fccb969dc..c89f472b658c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP bool config ARCH_DISCARD_MEMBLOCK @@ -1155,7 +1155,7 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ /* - * Generic RCU Fast GUP + * Generic Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -1176,8 +1176,8 @@ struct page *get_dump_page(unsigned long addr) * Before activating this code, please be aware that the following assumptions * are currently made: * - * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free - * pages containing page tables. + * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to + * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. * @@ -1187,36 +1187,59 @@ struct page *get_dump_page(unsigned long addr) * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_GENERIC_RCU_GUP +#ifdef CONFIG_HAVE_GENERIC_GUP + +#ifndef gup_get_pte +/* + * We assume that the PTE can be read atomically. If this is not the case for + * your architecture, please provide the helper. + */ +static inline pte_t gup_get_pte(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} +#endif + +static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +{ + while ((*nr) - nr_start) { + struct page *page = pages[--(*nr)]; + + ClearPageReferenced(page); + put_page(page); + } +} #ifdef __HAVE_ARCH_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { + struct dev_pagemap *pgmap = NULL; + int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; - int ret = 0; ptem = ptep = pte_offset_map(&pmd, addr); do { - /* - * In the line below we are assuming that the pte can be read - * atomically. If this is not the case for your architecture, - * please wrap this in a helper function! - * - * for an example see gup_get_pte in arch/x86/mm/gup.c - */ - pte_t pte = READ_ONCE(*ptep); + pte_t pte = gup_get_pte(ptep); struct page *head, *page; /* * Similar to the PMD case below, NUMA hinting must take slow * path using the pte_protnone check. */ - if (!pte_present(pte) || pte_special(pte) || - pte_protnone(pte) || (write && !pte_write(pte))) + if (pte_protnone(pte)) goto pte_unmap; - if (!arch_pte_access_permitted(pte, write)) + if (!pte_access_permitted(pte, write)) + goto pte_unmap; + + if (pte_devmap(pte)) { + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + goto pte_unmap; + } + } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -1232,6 +1255,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } VM_BUG_ON_PAGE(compound_head(page) != head, page); + + put_dev_pagemap(pgmap); + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -1261,15 +1287,76 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* __HAVE_ARCH_PTE_SPECIAL */ +#ifdef __HAVE_ARCH_PTE_DEVMAP +static int __gup_device_huge(unsigned long pfn, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + int nr_start = *nr; + struct dev_pagemap *pgmap = NULL; + + do { + struct page *page = pfn_to_page(pfn); + + pgmap = get_dev_pagemap(pfn, pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + SetPageReferenced(page); + pages[*nr] = page; + get_page(page); + put_dev_pagemap(pgmap); + (*nr)++; + pfn++; + } while (addr += PAGE_SIZE, addr != end); + return 1; +} + +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} +#else +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + BUILD_BUG(); + return 0; +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + BUILD_BUG(); + return 0; +} +#endif + static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { struct page *head, *page; int refs; - if (write && !pmd_write(orig)) + if (!pmd_access_permitted(orig, write)) return 0; + if (pmd_devmap(orig)) + return __gup_device_huge_pmd(orig, addr, end, pages, nr); + refs = 0; head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1293,6 +1380,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } @@ -1302,9 +1390,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, struct page *head, *page; int refs; - if (write && !pud_write(orig)) + if (!pud_access_permitted(orig, write)) return 0; + if (pud_devmap(orig)) + return __gup_device_huge_pud(orig, addr, end, pages, nr); + refs = 0; head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); @@ -1328,6 +1419,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } @@ -1338,9 +1430,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, int refs; struct page *head, *page; - if (write && !pgd_write(orig)) + if (!pgd_access_permitted(orig, write)) return 0; + BUILD_BUG_ON(pgd_devmap(orig)); refs = 0; head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); @@ -1364,6 +1457,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } @@ -1455,7 +1549,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, P4D_SHIFT, next, write, pages, nr)) return 0; - } else if (!gup_p4d_range(p4d, addr, next, write, pages, nr)) + } else if (!gup_pud_range(p4d, addr, next, write, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); @@ -1520,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } +#ifndef gup_fast_permitted +/* + * Check if it's allowed to use __get_user_pages_fast() for the range, or + * we need to fall back to the slow version: + */ +bool gup_fast_permitted(unsigned long start, int nr_pages, int write) +{ + unsigned long len, end; + + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + return end >= start; +} +#endif + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address @@ -1539,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - int nr, ret; + int nr = 0, ret = 0; start &= PAGE_MASK; - nr = __get_user_pages_fast(start, nr_pages, write, pages); - ret = nr; + + if (gup_fast_permitted(start, nr_pages, write)) { + nr = __get_user_pages_fast(start, nr_pages, write, pages); + ret = nr; + } if (nr < nr_pages) { /* Try to get the remaining pages with get_user_pages */ @@ -1565,4 +1677,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, return ret; } -#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ +#endif /* CONFIG_HAVE_GENERIC_GUP */ diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 538998a137d2..9ac639499bd1 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -21,7 +21,6 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, /** * pcpu_get_pages - get temp pages array - * @chunk: chunk of interest * * Returns pointer to array of pointers to struct page which can be indexed * with pcpu_page_idx(). Note that there is only one array and accesses @@ -30,7 +29,7 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, * RETURNS: * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) +static struct page **pcpu_get_pages(void) { static struct page **pages; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); @@ -275,7 +274,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, { struct page **pages; - pages = pcpu_get_pages(chunk); + pages = pcpu_get_pages(); if (!pages) return -ENOMEM; @@ -313,7 +312,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, * successful population attempt so the temp pages array must * be available now. */ - pages = pcpu_get_pages(chunk); + pages = pcpu_get_pages(); BUG_ON(!pages); /* unmap and free */ diff --git a/mm/percpu.c b/mm/percpu.c index 5696039b5c07..60a6488e9e6d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1011,8 +1011,11 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); } - if (chunk != pcpu_reserved_chunk) + if (chunk != pcpu_reserved_chunk) { + spin_lock_irqsave(&pcpu_lock, flags); pcpu_nr_empty_pop_pages -= occ_pages; + spin_unlock_irqrestore(&pcpu_lock, flags); + } if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); diff --git a/net/atm/svc.c b/net/atm/svc.c index db9794ec61d8..5589de7086af 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -318,7 +318,8 @@ out: return error; } -static int svc_accept(struct socket *sock, struct socket *newsock, int flags) +static int svc_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sock *sk = sock->sk; struct sk_buff *skb; @@ -329,7 +330,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags) lock_sock(sk); - error = svc_create(sock_net(sk), newsock, 0, 0); + error = svc_create(sock_net(sk), newsock, 0, kern); if (error) goto out; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index a8e42cedf1db..b7c486752b3a 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1320,7 +1320,8 @@ out_release: return err; } -static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) +static int ax25_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sk_buff *skb; struct sock *newsk; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index f307b145ea54..507b80d59dec 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -301,7 +301,7 @@ done: } static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, - int flags) + int flags, bool kern) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *nsk; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index aa1a814ceddc..ac3c650cb234 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -471,7 +471,8 @@ done: return err; } -static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags) +static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *nsk; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index e4e9a2da1e7e..728e0c8dc8e7 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -627,7 +627,7 @@ done: } static int sco_sock_accept(struct socket *sock, struct socket *newsock, - int flags) + int flags, bool kern) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 236f34244dbe..013f2290bfa5 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -30,6 +30,7 @@ EXPORT_SYMBOL(br_should_route_hook); static int br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { + br_drop_fake_rtable(skb); return netif_receive_skb(skb); } diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 95087e6e8258..fa87fbd62bb7 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -521,21 +521,6 @@ static unsigned int br_nf_pre_routing(void *priv, } -/* PF_BRIDGE/LOCAL_IN ************************************************/ -/* The packet is locally destined, which requires a real - * dst_entry, so detach the fake one. On the way up, the - * packet would pass through PRE_ROUTING again (which already - * took place when the packet entered the bridge), but we - * register an IPv4 PRE_ROUTING 'sabotage' hook that will - * prevent this from happening. */ -static unsigned int br_nf_local_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - br_drop_fake_rtable(skb); - return NF_ACCEPT; -} - /* PF_BRIDGE/FORWARD *************************************************/ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -908,12 +893,6 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = { .priority = NF_BR_PRI_BRNF, }, { - .hook = br_nf_local_in, - .pf = NFPROTO_BRIDGE, - .hooknum = NF_BR_LOCAL_IN, - .priority = NF_BR_PRI_BRNF, - }, - { .hook = br_nf_forward_ip, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, diff --git a/net/core/dev.c b/net/core/dev.c index 8637b2b71f3d..7869ae3837ca 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1304,6 +1304,7 @@ void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 3945821e9c1f..65ea0ff4017c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -953,7 +953,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct kobject *kobj = &dev->_rx[i].kobj; - if (!list_empty(&dev_net(dev)->exit_list)) + if (!atomic_read(&dev_net(dev)->count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -1371,7 +1371,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!list_empty(&dev_net(dev)->exit_list)) + if (!atomic_read(&dev_net(dev)->count)) queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1558,7 +1558,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &(ndev->dev); - if (!list_empty(&dev_net(ndev)->exit_list)) + if (!atomic_read(&dev_net(ndev)->count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f3557958e9bf..cd4ba8c6b609 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3828,13 +3828,14 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, if (!skb_may_tx_timestamp(sk, false)) return; - /* take a reference to prevent skb_orphan() from freeing the socket */ - sock_hold(sk); - - *skb_hwtstamps(skb) = *hwtstamps; - __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); - - sock_put(sk); + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); + sock_put(sk); + } } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); @@ -3893,7 +3894,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) { struct sock *sk = skb->sk; struct sock_exterr_skb *serr; - int err; + int err = 1; skb->wifi_acked_valid = 1; skb->wifi_acked = acked; @@ -3903,14 +3904,15 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; - /* take a reference to prevent skb_orphan() from freeing the socket */ - sock_hold(sk); - - err = sock_queue_err_skb(sk, skb); + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { + err = sock_queue_err_skb(sk, skb); + sock_put(sk); + } if (err) kfree_skb(skb); - - sock_put(sk); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); diff --git a/net/core/sock.c b/net/core/sock.c index f6fd79f33097..a96d5f7a5734 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -197,66 +197,55 @@ EXPORT_SYMBOL(sk_net_capable); /* * Each address family might have different locking rules, so we have - * one slock key per address family: + * one slock key per address family and separate keys for internal and + * userspace sockets. */ static struct lock_class_key af_family_keys[AF_MAX]; +static struct lock_class_key af_family_kern_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; +static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */ + +#define _sock_locks(x) \ + x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ + x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ + x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ + x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ + x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ + x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ + x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ + x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ + x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ + x "27" , x "28" , x "AF_CAN" , \ + x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ + x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ + x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ + x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ + x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX" + static const char *const af_family_key_strings[AF_MAX+1] = { - "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , - "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK", - "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" , - "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" , - "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , - "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , - "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , - "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , - "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , - "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , - "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , - "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , - "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , - "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , - "sk_lock-AF_QIPCRTR", "sk_lock-AF_SMC" , "sk_lock-AF_MAX" + _sock_locks("sk_lock-") }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { - "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , - "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK", - "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" , - "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" , - "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , - "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , - "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , - "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" , - "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , - "slock-27" , "slock-28" , "slock-AF_CAN" , - "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , - "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , - "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , - "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , - "slock-AF_QIPCRTR", "slock-AF_SMC" , "slock-AF_MAX" + _sock_locks("slock-") }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { - "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , - "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", - "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , - "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , - "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , - "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , - "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , - "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , - "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , - "clock-27" , "clock-28" , "clock-AF_CAN" , - "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , - "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , - "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , - "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , - "clock-AF_QIPCRTR", "clock-AF_SMC" , "clock-AF_MAX" + _sock_locks("clock-") +}; + +static const char *const af_family_kern_key_strings[AF_MAX+1] = { + _sock_locks("k-sk_lock-") +}; +static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { + _sock_locks("k-slock-") +}; +static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { + _sock_locks("k-clock-") }; /* @@ -264,6 +253,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; +static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across @@ -1293,7 +1283,16 @@ lenout: */ static inline void sock_lock_init(struct sock *sk) { - sock_lock_init_class_and_name(sk, + if (sk->sk_kern_sock) + sock_lock_init_class_and_name( + sk, + af_family_kern_slock_key_strings[sk->sk_family], + af_family_kern_slock_keys + sk->sk_family, + af_family_kern_key_strings[sk->sk_family], + af_family_kern_keys + sk->sk_family); + else + sock_lock_init_class_and_name( + sk, af_family_slock_key_strings[sk->sk_family], af_family_slock_keys + sk->sk_family, af_family_key_strings[sk->sk_family], @@ -1399,6 +1398,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; + sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) @@ -2277,7 +2277,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2) } EXPORT_SYMBOL(sock_no_socketpair); -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) +int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { return -EOPNOTSUPP; } @@ -2481,7 +2482,14 @@ void sock_init_data(struct socket *sock, struct sock *sk) } rwlock_init(&sk->sk_callback_lock); - lockdep_set_class_and_name(&sk->sk_callback_lock, + if (sk->sk_kern_sock) + lockdep_set_class_and_name( + &sk->sk_callback_lock, + af_kern_callback_keys + sk->sk_family, + af_family_kern_clock_key_strings[sk->sk_family]); + else + lockdep_set_class_and_name( + &sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index f053198e730c..5e3a7302f774 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -749,6 +749,7 @@ static void ccid2_hc_tx_exit(struct sock *sk) for (i = 0; i < hc->tx_seqbufc; i++) kfree(hc->tx_seqbuf[i]); hc->tx_seqbufc = 0; + dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); } static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 409d0cfd3447..b99168b0fabf 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -289,7 +289,8 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) switch (type) { case ICMP_REDIRECT: - dccp_do_redirect(skb, sk); + if (!sock_owned_by_user(sk)) + dccp_do_redirect(skb, sk); goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 233b57367758..d9b6a4e403e7 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -122,10 +122,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, np = inet6_sk(sk); if (type == NDISC_REDIRECT) { - struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); + if (!sock_owned_by_user(sk)) { + struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - if (dst) - dst->ops->redirect(dst, sk, skb); + if (dst) + dst->ops->redirect(dst, sk, skb); + } goto out; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index e267e6f4c9a5..abd07a443219 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -142,6 +142,13 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, struct dccp_request_sock *dreq = dccp_rsk(req); bool own_req; + /* TCP/DCCP listeners became lockless. + * DCCP stores complex state in its request_sock, so we need + * a protection for them, now this code runs without being protected + * by the parent (listener) lock. + */ + spin_lock_bh(&dreq->dreq_lock); + /* Check for retransmitted REQUEST */ if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { @@ -156,7 +163,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, inet_rtx_syn_ack(sk, req); } /* Network Duplicate, discard packet */ - return NULL; + goto out; } DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; @@ -182,20 +189,20 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req); - if (!child) - goto listen_overflow; - - return inet_csk_complete_hashdance(sk, child, req, own_req); + if (child) { + child = inet_csk_complete_hashdance(sk, child, req, own_req); + goto out; + } -listen_overflow: - dccp_pr_debug("listen_overflow!\n"); DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; drop: if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) req->rsk_ops->send_reset(sk, skb); inet_csk_reqsk_queue_drop(sk, req); - return NULL; +out: + spin_unlock_bh(&dreq->dreq_lock); + return child; } EXPORT_SYMBOL_GPL(dccp_check_req); @@ -246,6 +253,7 @@ int dccp_reqsk_init(struct request_sock *req, { struct dccp_request_sock *dreq = dccp_rsk(req); + spin_lock_init(&dreq->dreq_lock); inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport; inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport); inet_rsk(req)->acked = 0; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index e6e79eda9763..7de5b40a5d0d 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1070,7 +1070,8 @@ static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo) return skb == NULL ? ERR_PTR(err) : skb; } -static int dn_accept(struct socket *sock, struct socket *newsock, int flags) +static int dn_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sock *sk = sock->sk, *newsk; struct sk_buff *skb = NULL; @@ -1099,7 +1100,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags) cb = DN_SKB_CB(skb); sk->sk_ack_backlog--; - newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0); + newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern); if (newsk == NULL) { release_sock(sk); kfree_skb(skb); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 602d40f43687..6b1fc6e4278e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -689,11 +689,12 @@ EXPORT_SYMBOL(inet_stream_connect); * Accept a pending connection. The TCP layer now gives BSD semantics. */ -int inet_accept(struct socket *sock, struct socket *newsock, int flags) +int inet_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sock *sk1 = sock->sk; int err = -EINVAL; - struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); + struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern); if (!sk2) goto do_err; @@ -1487,8 +1488,10 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) int proto = iph->protocol; int err = -ENOSYS; - if (skb->encapsulation) + if (skb->encapsulation) { + skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP)); skb_set_inner_network_header(skb, nhoff); + } csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b4d5980ade3b..5e313c1ac94f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -424,7 +424,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) /* * This will accept the next outstanding connection. */ -struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) +struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 737ce826d7ec..7a3fd25e8913 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -966,7 +966,7 @@ static int __ip_append_data(struct sock *sk, cork->length += length; if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && + (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9a89b8deafae..575e19dcc017 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -279,10 +279,13 @@ EXPORT_SYMBOL(tcp_v4_connect); */ void tcp_v4_mtu_reduced(struct sock *sk) { - struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); - u32 mtu = tcp_sk(sk)->mtu_info; + struct dst_entry *dst; + u32 mtu; + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + return; + mtu = tcp_sk(sk)->mtu_info; dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; @@ -428,7 +431,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) switch (type) { case ICMP_REDIRECT: - do_redirect(icmp_skb, sk); + if (!sock_owned_by_user(sk)) + do_redirect(icmp_skb, sk); goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 40d893556e67..b2ab411c6d37 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -249,7 +249,8 @@ void tcp_delack_timer_handler(struct sock *sk) sk_mem_reclaim_partial(sk); - if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || + !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; if (time_after(icsk->icsk_ack.timeout, jiffies)) { @@ -552,7 +553,8 @@ void tcp_write_timer_handler(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); int event; - if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || + !icsk->icsk_pending) goto out; if (time_after(icsk->icsk_timeout, jiffies)) { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 04db40620ea6..a9a9553ee63d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -920,12 +920,12 @@ static int __init inet6_init(void) err = register_pernet_subsys(&inet6_net_ops); if (err) goto register_pernet_fail; - err = icmpv6_init(); - if (err) - goto icmp_fail; err = ip6_mr_init(); if (err) goto ipmr_fail; + err = icmpv6_init(); + if (err) + goto icmp_fail; err = ndisc_init(); if (err) goto ndisc_fail; @@ -1061,10 +1061,10 @@ igmp_fail: ndisc_cleanup(); ndisc_fail: ip6_mr_cleanup(); -ipmr_fail: - icmpv6_cleanup(); icmp_fail: unregister_pernet_subsys(&inet6_net_ops); +ipmr_fail: + icmpv6_cleanup(); register_pernet_fail: sock_unregister(PF_INET6); rtnl_unregister_all(PF_INET6); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e4266746e4a2..d4bf2c68a545 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -923,6 +923,8 @@ add: ins = &rt->dst.rt6_next; iter = *ins; while (iter) { + if (iter->rt6i_metric > rt->rt6i_metric) + break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; fib6_purge_rt(iter, fn, info->nl_net); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 0838e6d01d2e..93e58a5e1837 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -294,8 +294,10 @@ static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); int err = -ENOSYS; - if (skb->encapsulation) + if (skb->encapsulation) { + skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6)); skb_set_inner_network_header(skb, nhoff); + } iph->payload_len = htons(skb->len - nhoff - sizeof(*iph)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 528b3c1f3fde..58f6288e9ba5 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -768,13 +768,14 @@ slow_path: * Fragment the datagram. */ - *prevhdr = NEXTHDR_FRAGMENT; troom = rt->dst.dev->needed_tailroom; /* * Keep copying data until we run out. */ while (left > 0) { + u8 *fragnexthdr_offset; + len = left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > mtu) @@ -819,6 +820,10 @@ slow_path: */ skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); + fragnexthdr_offset = skb_network_header(frag); + fragnexthdr_offset += prevhdr - skb_network_header(skb); + *fragnexthdr_offset = NEXTHDR_FRAGMENT; + /* * Build fragment header. */ @@ -1385,7 +1390,7 @@ emsgsize: if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && + (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, exthdrlen, diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 644ba59fbd9d..3d8a3b63b4fd 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -485,11 +485,15 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (!skb->ignore_df && skb->len > mtu) { skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); - if (skb->protocol == htons(ETH_P_IPV6)) + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - else + } else { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + } return -EMSGSIZE; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 229bfcc451ef..35c58b669ebd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3299,7 +3299,6 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ - + nla_total_size(4) /* RTA_OIF */ + lwtunnel_get_encap_size(rt->dst.lwtstate); nexthop_len *= rt->rt6i_nsiblings; @@ -3323,7 +3322,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) } static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, - unsigned int *flags) + unsigned int *flags, bool skip_oif) { if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { *flags |= RTNH_F_LINKDOWN; @@ -3336,7 +3335,8 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, goto nla_put_failure; } - if (rt->dst.dev && + /* not needed for multipath encoding b/c it has a rtnexthop struct */ + if (!skip_oif && rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) goto nla_put_failure; @@ -3350,6 +3350,7 @@ nla_put_failure: return -EMSGSIZE; } +/* add multipath next hop */ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) { struct rtnexthop *rtnh; @@ -3362,7 +3363,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) rtnh->rtnh_hops = 0; rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; - if (rt6_nexthop_info(skb, rt, &flags) < 0) + if (rt6_nexthop_info(skb, rt, &flags, true) < 0) goto nla_put_failure; rtnh->rtnh_flags = flags; @@ -3515,7 +3516,7 @@ static int rt6_fill_node(struct net *net, nla_nest_end(skb, mp); } else { - if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags) < 0) + if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) goto nla_put_failure; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 60a5295a7de6..49fa2e8c3fa9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -391,10 +391,12 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, np = inet6_sk(sk); if (type == NDISC_REDIRECT) { - struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); + if (!sock_owned_by_user(sk)) { + struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - if (dst) - dst->ops->redirect(dst, sk, skb); + if (dst) + dst->ops->redirect(dst, sk, skb); + } goto out; } diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 81adc29a448d..8d77ad5cadaf 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -828,7 +828,8 @@ out: * Wait for incoming connection * */ -static int irda_accept(struct socket *sock, struct socket *newsock, int flags) +static int irda_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sock *sk = sock->sk; struct irda_sock *new, *self = irda_sk(sk); @@ -836,7 +837,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) struct sk_buff *skb = NULL; int err; - err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); + err = irda_create(sock_net(sk), newsock, sk->sk_protocol, kern); if (err) return err; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 89bbde1081ce..84de7b6326dc 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -938,7 +938,7 @@ done: /* Accept a pending connection */ static int iucv_sock_accept(struct socket *sock, struct socket *newsock, - int flags) + int flags, bool kern) { DECLARE_WAITQUEUE(wait, current); struct sock *sk = sock->sk, *nsk; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 06186d608a27..cb4fff785cbf 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -641,11 +641,13 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) * @sock: Socket which connections arrive on. * @newsock: Socket to move incoming connection to. * @flags: User specified operational flags. + * @kern: If the socket is kernel internal * * Accept a new incoming connection. * Returns 0 upon success, negative otherwise. */ -static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags) +static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sock *sk = sock->sk, *newsk; struct llc_sock *llc, *newllc; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 3818686182b2..33211f9a2656 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1288,7 +1288,8 @@ static void mpls_ifdown(struct net_device *dev, int event) /* fall through */ case NETDEV_CHANGE: nh->nh_flags |= RTNH_F_LINKDOWN; - ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1; + if (event != NETDEV_UNREGISTER) + ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1; break; } if (event == NETDEV_UNREGISTER) @@ -2028,6 +2029,7 @@ static void mpls_net_exit(struct net *net) for (index = 0; index < platform_labels; index++) { struct mpls_route *rt = rtnl_dereference(platform_label[index]); RCU_INIT_POINTER(platform_label[index], NULL); + mpls_notify_route(net, index, rt, NULL, NULL); mpls_rt_free(rt); } rtnl_unlock(); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 4bbf4526b885..ebf16f7f9089 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -765,7 +765,8 @@ out_release: return err; } -static int nr_accept(struct socket *sock, struct socket *newsock, int flags) +static int nr_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sk_buff *skb; struct sock *newsk; diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 879885b31cce..2ffb18e73df6 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -441,7 +441,7 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent, } static int llcp_sock_accept(struct socket *sock, struct socket *newsock, - int flags) + int flags, bool kern) { DECLARE_WAITQUEUE(wait, current); struct sock *sk = sock->sk, *new_sk; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 222bedcd9575..e81537991ddf 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -772,7 +772,8 @@ static void pep_sock_close(struct sock *sk, long timeout) sock_put(sk); } -static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) +static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, + bool kern) { struct pep_sock *pn = pep_sk(sk), *newpn; struct sock *newsk = NULL; @@ -846,7 +847,8 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) } /* Create a new to-be-accepted sock */ - newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0); + newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, + kern); if (!newsk) { pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); err = -ENOBUFS; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index a6c8da3ee893..64634e3ec2fc 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -305,7 +305,7 @@ out: } static int pn_socket_accept(struct socket *sock, struct socket *newsock, - int flags) + int flags, bool kern) { struct sock *sk = sock->sk; struct sock *newsk; @@ -314,7 +314,7 @@ static int pn_socket_accept(struct socket *sock, struct socket *newsock, if (unlikely(sk->sk_state != TCP_LISTEN)) return -EINVAL; - newsk = sk->sk_prot->accept(sk, flags, &err); + newsk = sk->sk_prot->accept(sk, flags, &err, kern); if (!newsk) return err; diff --git a/net/rds/connection.c b/net/rds/connection.c index 0e04dcceb1d4..1fa75ab7b733 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -429,6 +429,7 @@ void rds_conn_destroy(struct rds_connection *conn) */ rds_cong_remove_conn(conn); + put_net(conn->c_net); kmem_cache_free(rds_conn_slab, conn); spin_lock_irqsave(&rds_conn_lock, flags); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index ce3775abc6e7..1c38d2c7caa8 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -442,7 +442,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_send_cq = NULL; ibdev_put_vector(rds_ibdev, ic->i_scq_vector); rdsdebug("ib_create_cq send failed: %d\n", ret); - goto out; + goto rds_ibdev_out; } ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev); @@ -456,19 +456,19 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_recv_cq = NULL; ibdev_put_vector(rds_ibdev, ic->i_rcq_vector); rdsdebug("ib_create_cq recv failed: %d\n", ret); - goto out; + goto send_cq_out; } ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); if (ret) { rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - goto out; + goto recv_cq_out; } ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); if (ret) { rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); - goto out; + goto recv_cq_out; } /* XXX negotiate max send/recv with remote? */ @@ -494,7 +494,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); if (ret) { rdsdebug("rdma_create_qp failed: %d\n", ret); - goto out; + goto recv_cq_out; } ic->i_send_hdrs = ib_dma_alloc_coherent(dev, @@ -504,7 +504,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); - goto out; + goto qp_out; } ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, @@ -514,7 +514,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); - goto out; + goto send_hdrs_dma_out; } ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), @@ -522,7 +522,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); - goto out; + goto recv_hdrs_dma_out; } ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), @@ -530,7 +530,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); - goto out; + goto ack_dma_out; } ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), @@ -538,7 +538,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); - goto out; + goto sends_out; } rds_ib_recv_init_ack(ic); @@ -546,8 +546,33 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd, ic->i_send_cq, ic->i_recv_cq); -out: + return ret; + +sends_out: + vfree(ic->i_sends); +ack_dma_out: + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); +recv_hdrs_dma_out: + ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, ic->i_recv_hdrs_dma); +send_hdrs_dma_out: + ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, ic->i_send_hdrs_dma); +qp_out: + rdma_destroy_qp(ic->i_cm_id); +recv_cq_out: + if (!ib_destroy_cq(ic->i_recv_cq)) + ic->i_recv_cq = NULL; +send_cq_out: + if (!ib_destroy_cq(ic->i_send_cq)) + ic->i_send_cq = NULL; +rds_ibdev_out: + rds_ib_remove_conn(rds_ibdev, conn); rds_ib_dev_put(rds_ibdev); + return ret; } diff --git a/net/rds/rds.h b/net/rds/rds.h index 39518ef7af4d..82d38ccf5e8b 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -147,7 +147,7 @@ struct rds_connection { /* Protocol version */ unsigned int c_version; - possible_net_t c_net; + struct net *c_net; struct list_head c_map_item; unsigned long c_map_queued; @@ -162,13 +162,13 @@ struct rds_connection { static inline struct net *rds_conn_net(struct rds_connection *conn) { - return read_pnet(&conn->c_net); + return conn->c_net; } static inline void rds_conn_net_set(struct rds_connection *conn, struct net *net) { - write_pnet(&conn->c_net, net); + conn->c_net = get_net(net); } #define RDS_FLAG_CONG_BITMAP 0x01 diff --git a/net/rds/tcp.c b/net/rds/tcp.c index a973d3b4dff0..225690076773 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -484,9 +484,10 @@ static void __net_exit rds_tcp_exit_net(struct net *net) * we do need to clean up the listen socket here. */ if (rtn->rds_tcp_listen_sock) { - rds_tcp_listen_stop(rtn->rds_tcp_listen_sock); + struct socket *lsock = rtn->rds_tcp_listen_sock; + rtn->rds_tcp_listen_sock = NULL; - flush_work(&rtn->rds_tcp_accept_w); + rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); } } @@ -523,13 +524,13 @@ static void rds_tcp_kill_sock(struct net *net) struct rds_tcp_connection *tc, *_tc; LIST_HEAD(tmp_list); struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct socket *lsock = rtn->rds_tcp_listen_sock; - rds_tcp_listen_stop(rtn->rds_tcp_listen_sock); rtn->rds_tcp_listen_sock = NULL; - flush_work(&rtn->rds_tcp_accept_w); + rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); + struct net *c_net = tc->t_cpath->cp_conn->c_net; if (net != c_net || !tc->t_sock) continue; @@ -546,8 +547,12 @@ static void rds_tcp_kill_sock(struct net *net) void *rds_tcp_listen_sock_def_readable(struct net *net) { struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct socket *lsock = rtn->rds_tcp_listen_sock; + + if (!lsock) + return NULL; - return rtn->rds_tcp_listen_sock->sk->sk_user_data; + return lsock->sk->sk_user_data; } static int rds_tcp_dev_event(struct notifier_block *this, @@ -584,7 +589,7 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { - struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); + struct net *c_net = tc->t_cpath->cp_conn->c_net; if (net != c_net || !tc->t_sock) continue; @@ -638,19 +643,19 @@ static int rds_tcp_init(void) goto out; } - ret = register_netdevice_notifier(&rds_tcp_dev_notifier); - if (ret) { - pr_warn("could not register rds_tcp_dev_notifier\n"); + ret = rds_tcp_recv_init(); + if (ret) goto out_slab; - } ret = register_pernet_subsys(&rds_tcp_net_ops); if (ret) - goto out_notifier; + goto out_recv; - ret = rds_tcp_recv_init(); - if (ret) + ret = register_netdevice_notifier(&rds_tcp_dev_notifier); + if (ret) { + pr_warn("could not register rds_tcp_dev_notifier\n"); goto out_pernet; + } rds_trans_register(&rds_tcp_transport); @@ -660,9 +665,8 @@ static int rds_tcp_init(void) out_pernet: unregister_pernet_subsys(&rds_tcp_net_ops); -out_notifier: - if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) - pr_warn("could not unregister rds_tcp_dev_notifier\n"); +out_recv: + rds_tcp_recv_exit(); out_slab: kmem_cache_destroy(rds_tcp_conn_slab); out: diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 9a1cc8906576..56ea6620fcf9 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -66,7 +66,7 @@ void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ struct socket *rds_tcp_listen_init(struct net *); -void rds_tcp_listen_stop(struct socket *); +void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); int rds_tcp_keepalive(struct socket *sock); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 67d0929c7d3d..507678853e6c 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -133,7 +133,7 @@ int rds_tcp_accept_one(struct socket *sock) new_sock->type = sock->type; new_sock->ops = sock->ops; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true); if (ret < 0) goto out; @@ -223,6 +223,9 @@ void rds_tcp_listen_data_ready(struct sock *sk) * before it has been accepted and the accepter has set up their * data_ready.. we only want to queue listen work for our listening * socket + * + * (*ready)() may be null if we are racing with netns delete, and + * the listen socket is being torn down. */ if (sk->sk_state == TCP_LISTEN) rds_tcp_accept_work(sk); @@ -231,7 +234,8 @@ void rds_tcp_listen_data_ready(struct sock *sk) out: read_unlock_bh(&sk->sk_callback_lock); - ready(sk); + if (ready) + ready(sk); } struct socket *rds_tcp_listen_init(struct net *net) @@ -271,7 +275,7 @@ out: return NULL; } -void rds_tcp_listen_stop(struct socket *sock) +void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor) { struct sock *sk; @@ -292,5 +296,6 @@ void rds_tcp_listen_stop(struct socket *sock) /* wait for accepts to stop and close the socket */ flush_workqueue(rds_wq); + flush_work(acceptor); sock_release(sock); } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index b8a1df2c9785..4a9729257023 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -871,7 +871,8 @@ out_release: return err; } -static int rose_accept(struct socket *sock, struct socket *newsock, int flags) +static int rose_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sk_buff *skb; struct sock *newsk; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 9f4cfa25af7c..18b2ad8be8e2 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -420,6 +420,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, u16 skew) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + enum rxrpc_call_state state; unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int ix; rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; @@ -434,14 +435,15 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, _proto("Rx DATA %%%u { #%u f=%02x }", sp->hdr.serial, seq, sp->hdr.flags); - if (call->state >= RXRPC_CALL_COMPLETE) + state = READ_ONCE(call->state); + if (state >= RXRPC_CALL_COMPLETE) return; /* Received data implicitly ACKs all of the request packets we sent * when we're acting as a client. */ - if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST || - call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && + if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST || + state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && !rxrpc_receiving_reply(call)) return; @@ -650,6 +652,7 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_peer *peer; unsigned int mtu; + bool wake = false; u32 rwind = ntohl(ackinfo->rwind); _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", @@ -657,9 +660,14 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), rwind, ntohl(ackinfo->jumbo_max)); - if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) - rwind = RXRPC_RXTX_BUFF_SIZE - 1; - call->tx_winsize = rwind; + if (call->tx_winsize != rwind) { + if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) + rwind = RXRPC_RXTX_BUFF_SIZE - 1; + if (rwind > call->tx_winsize) + wake = true; + call->tx_winsize = rwind; + } + if (call->cong_ssthresh > rwind) call->cong_ssthresh = rwind; @@ -673,6 +681,9 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, spin_unlock_bh(&peer->lock); _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); } + + if (wake) + wake_up(&call->waitq); } /* @@ -799,7 +810,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, return rxrpc_proto_abort("AK0", call, 0); /* Ignore ACKs unless we are or have just been transmitting. */ - switch (call->state) { + switch (READ_ONCE(call->state)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_CLIENT_AWAIT_REPLY: case RXRPC_CALL_SERVER_SEND_REPLY: @@ -940,7 +951,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, struct rxrpc_call *call) { - switch (call->state) { + switch (READ_ONCE(call->state)) { case RXRPC_CALL_SERVER_AWAIT_ACK: rxrpc_call_completed(call); break; diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 6491ca46a03f..3e2f1a8e9c5b 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -527,7 +527,7 @@ try_again: msg->msg_namelen = len; } - switch (call->state) { + switch (READ_ONCE(call->state)) { case RXRPC_CALL_SERVER_ACCEPTING: ret = rxrpc_recvmsg_new_call(rx, call, msg, flags); break; @@ -640,7 +640,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, mutex_lock(&call->user_mutex); - switch (call->state) { + switch (READ_ONCE(call->state)) { case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index bc2d3dcff9de..97ab214ca411 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -488,6 +488,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) __releases(&rx->sk.sk_lock.slock) { + enum rxrpc_call_state state; enum rxrpc_command cmd; struct rxrpc_call *call; unsigned long user_call_ID = 0; @@ -526,13 +527,17 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) return PTR_ERR(call); /* ... and we have the call lock. */ } else { - ret = -EBUSY; - if (call->state == RXRPC_CALL_UNINITIALISED || - call->state == RXRPC_CALL_CLIENT_AWAIT_CONN || - call->state == RXRPC_CALL_SERVER_PREALLOC || - call->state == RXRPC_CALL_SERVER_SECURING || - call->state == RXRPC_CALL_SERVER_ACCEPTING) + switch (READ_ONCE(call->state)) { + case RXRPC_CALL_UNINITIALISED: + case RXRPC_CALL_CLIENT_AWAIT_CONN: + case RXRPC_CALL_SERVER_PREALLOC: + case RXRPC_CALL_SERVER_SECURING: + case RXRPC_CALL_SERVER_ACCEPTING: + ret = -EBUSY; goto error_release_sock; + default: + break; + } ret = mutex_lock_interruptible(&call->user_mutex); release_sock(&rx->sk); @@ -542,10 +547,11 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) } } + state = READ_ONCE(call->state); _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); + call->debug_id, call->user_call_ID, state, call->conn); - if (call->state >= RXRPC_CALL_COMPLETE) { + if (state >= RXRPC_CALL_COMPLETE) { /* it's too late for this call */ ret = -ESHUTDOWN; } else if (cmd == RXRPC_CMD_SEND_ABORT) { @@ -555,12 +561,12 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) } else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; } else if (rxrpc_is_client_call(call) && - call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { + state != RXRPC_CALL_CLIENT_SEND_REQUEST) { /* request phase complete for this client call */ ret = -EPROTO; } else if (rxrpc_is_service_call(call) && - call->state != RXRPC_CALL_SERVER_ACK_REQUEST && - call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + state != RXRPC_CALL_SERVER_ACK_REQUEST && + state != RXRPC_CALL_SERVER_SEND_REPLY) { /* Reply phase not begun or not complete for service call. */ ret = -EPROTO; } else { @@ -605,14 +611,21 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, _debug("CALL %d USR %lx ST %d on CONN %p", call->debug_id, call->user_call_ID, call->state, call->conn); - if (call->state >= RXRPC_CALL_COMPLETE) { - ret = -ESHUTDOWN; /* it's too late for this call */ - } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && - call->state != RXRPC_CALL_SERVER_ACK_REQUEST && - call->state != RXRPC_CALL_SERVER_SEND_REPLY) { - ret = -EPROTO; /* request phase complete for this client call */ - } else { + switch (READ_ONCE(call->state)) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + case RXRPC_CALL_SERVER_SEND_REPLY: ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len); + break; + case RXRPC_CALL_COMPLETE: + read_lock_bh(&call->state_lock); + ret = -call->error; + read_unlock_bh(&call->state_lock); + break; + default: + /* Request phase complete for this client call */ + ret = -EPROTO; + break; } mutex_unlock(&call->user_mutex); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index ab8062909962..f9bb43c25697 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -113,6 +113,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (ret < 0) return ret; + if (!tb[TCA_CONNMARK_PARMS]) + return -EINVAL; + parm = nla_data(tb[TCA_CONNMARK_PARMS]); if (!tcf_hash_check(tn, parm->index, a, bind)) { diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 3b7074e23024..c736627f8f4a 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -228,7 +228,6 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, return skb->len; nla_put_failure: - rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 063baac5b9fe..961ee59f696a 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -640,14 +640,15 @@ static sctp_scope_t sctp_v6_scope(union sctp_addr *addr) /* Create and initialize a new sk for the socket to be returned by accept(). */ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, - struct sctp_association *asoc) + struct sctp_association *asoc, + bool kern) { struct sock *newsk; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; struct ipv6_txoptions *opt; - newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0); + newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern); if (!newsk) goto out; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 1b6d4574d2b0..989a900383b5 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -575,10 +575,11 @@ static int sctp_v4_is_ce(const struct sk_buff *skb) /* Create and initialize a new sk for the socket returned by accept(). */ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, - struct sctp_association *asoc) + struct sctp_association *asoc, + bool kern) { struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, - sk->sk_prot, 0); + sk->sk_prot, kern); struct inet_sock *newinet; if (!newsk) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6f0a9be50f50..0f378ea2ae38 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4116,7 +4116,7 @@ static int sctp_disconnect(struct sock *sk, int flags) * descriptor will be returned from accept() to represent the newly * formed association. */ -static struct sock *sctp_accept(struct sock *sk, int flags, int *err) +static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) { struct sctp_sock *sp; struct sctp_endpoint *ep; @@ -4151,7 +4151,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err) */ asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); - newsk = sp->pf->create_accept_sk(sk, asoc); + newsk = sp->pf->create_accept_sk(sk, asoc, kern); if (!newsk) { error = -ENOMEM; goto out; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 85837ab90e89..093803786eac 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -944,7 +944,7 @@ out: } static int smc_accept(struct socket *sock, struct socket *new_sock, - int flags) + int flags, bool kern) { struct sock *sk = sock->sk, *nsk; DECLARE_WAITQUEUE(wait, current); diff --git a/net/socket.c b/net/socket.c index 2c1e8677ff2d..e034fe4164be 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1506,7 +1506,7 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, if (err) goto out_fd; - err = sock->ops->accept(sock, newsock, sock->file->f_flags); + err = sock->ops->accept(sock, newsock, sock->file->f_flags, false); if (err < 0) goto out_fd; @@ -1731,6 +1731,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, /* We assume all kernel code knows the size of sockaddr_storage */ msg.msg_namelen = 0; msg.msg_iocb = NULL; + msg.msg_flags = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, flags); @@ -3238,7 +3239,7 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) if (err < 0) goto done; - err = sock->ops->accept(sock, *newsock, flags); + err = sock->ops->accept(sock, *newsock, flags, true); if (err < 0) { sock_release(*newsock); *newsock = NULL; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 43e4045e72bc..7130e73bd42c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -115,7 +115,8 @@ static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); -static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); +static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, + bool kern); static void tipc_sk_timeout(unsigned long data); static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); @@ -2029,7 +2030,8 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo) * * Returns 0 on success, errno otherwise */ -static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) +static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, + bool kern) { struct sock *new_sk, *sk = sock->sk; struct sk_buff *buf; @@ -2051,7 +2053,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) buf = skb_peek(&sk->sk_receive_queue); - res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, 0); + res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, kern); if (res) goto exit; security_sk_clone(sock->sk, new_sock->sk); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ee37b390260a..928691c43408 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -636,7 +636,7 @@ static int unix_bind(struct socket *, struct sockaddr *, int); static int unix_stream_connect(struct socket *, struct sockaddr *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); -static int unix_accept(struct socket *, struct socket *, int); +static int unix_accept(struct socket *, struct socket *, int, bool); static int unix_getname(struct socket *, struct sockaddr *, int *, int); static unsigned int unix_poll(struct file *, struct socket *, poll_table *); static unsigned int unix_dgram_poll(struct file *, struct socket *, @@ -1402,7 +1402,8 @@ static void unix_sock_inherit_flags(const struct socket *old, set_bit(SOCK_PASSSEC, &new->flags); } -static int unix_accept(struct socket *sock, struct socket *newsock, int flags) +static int unix_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sock *sk = sock->sk; struct sock *tsk; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9192ead66751..9f770f33c100 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1250,7 +1250,8 @@ out: return err; } -static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) +static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sock *listener; int err; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index fd28a49dbe8f..8b911c29860e 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -852,7 +852,8 @@ static int x25_wait_for_data(struct sock *sk, long timeout) return rc; } -static int x25_accept(struct socket *sock, struct socket *newsock, int flags) +static int x25_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { struct sock *sk = sock->sk; struct sock *newsk; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0806dccdf507..236cbbc0ab9c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1243,7 +1243,7 @@ static inline int policy_to_flow_dir(int dir) } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, - const struct flowi *fl) + const struct flowi *fl, u16 family) { struct xfrm_policy *pol; @@ -1251,8 +1251,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { - bool match = xfrm_selector_match(&pol->selector, fl, - sk->sk_family); + bool match = xfrm_selector_match(&pol->selector, fl, family); int err = 0; if (match) { @@ -2239,7 +2238,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; - pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); + pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) @@ -2518,7 +2517,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, pol = NULL; sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { - pol = xfrm_sk_policy_lookup(sk, dir, &fl); + pol = xfrm_sk_policy_lookup(sk, dir, &fl, family); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; @@ -3069,6 +3068,11 @@ static int __net_init xfrm_net_init(struct net *net) { int rv; + /* Initialize the per-net locks here */ + spin_lock_init(&net->xfrm.xfrm_state_lock); + spin_lock_init(&net->xfrm.xfrm_policy_lock); + mutex_init(&net->xfrm.xfrm_cfg_mutex); + rv = xfrm_statistics_init(net); if (rv < 0) goto out_statistics; @@ -3085,11 +3089,6 @@ static int __net_init xfrm_net_init(struct net *net) if (rv < 0) goto out; - /* Initialize the per-net locks here */ - spin_lock_init(&net->xfrm.xfrm_state_lock); - spin_lock_init(&net->xfrm.xfrm_policy_lock); - mutex_init(&net->xfrm.xfrm_cfg_mutex); - return 0; out: diff --git a/tools/include/uapi/linux/bpf_perf_event.h b/tools/include/uapi/linux/bpf_perf_event.h new file mode 100644 index 000000000000..067427259820 --- /dev/null +++ b/tools/include/uapi/linux/bpf_perf_event.h @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__ +#define _UAPI__LINUX_BPF_PERF_EVENT_H__ + +#include <linux/types.h> +#include <linux/ptrace.h> + +struct bpf_perf_event_data { + struct pt_regs regs; + __u64 sample_period; +}; + +#endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4b498265dae6..67531f47781b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -1,12 +1,14 @@ LIBDIR := ../../../lib BPFOBJ := $(LIBDIR)/bpf/bpf.o -CFLAGS += -Wall -O2 -lcap -I../../../include/uapi -I$(LIBDIR) +CFLAGS += -Wall -O2 -lcap -I../../../include/uapi -I$(LIBDIR) $(BPFOBJ) TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map TEST_PROGS := test_kmod.sh +all: $(TEST_GEN_PROGS) + .PHONY: all clean force # force a rebuild of BPFOBJ when its dependencies are updated diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index e1f5b9eea1e8..d1555e4240c0 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8,6 +8,8 @@ * License as published by the Free Software Foundation. */ +#include <asm/types.h> +#include <linux/types.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> @@ -4583,10 +4585,12 @@ static bool is_admin(void) cap_flag_value_t sysadmin = CAP_CLEAR; const cap_value_t cap_val = CAP_SYS_ADMIN; +#ifdef CAP_IS_SUPPORTED if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) { perror("cap_get_flag"); return false; } +#endif caps = cap_get_proc(); if (!caps) { perror("cap_get_proc"); diff --git a/tools/testing/selftests/powerpc/include/vsx_asm.h b/tools/testing/selftests/powerpc/include/vsx_asm.h index d828bfb6ef2d..54064ced9e95 100644 --- a/tools/testing/selftests/powerpc/include/vsx_asm.h +++ b/tools/testing/selftests/powerpc/include/vsx_asm.h @@ -16,56 +16,56 @@ */ FUNC_START(load_vsx) li r5,0 - lxvx vs20,r5,r3 + lxvd2x vs20,r5,r3 addi r5,r5,16 - lxvx vs21,r5,r3 + lxvd2x vs21,r5,r3 addi r5,r5,16 - lxvx vs22,r5,r3 + lxvd2x vs22,r5,r3 addi r5,r5,16 - lxvx vs23,r5,r3 + lxvd2x vs23,r5,r3 addi r5,r5,16 - lxvx vs24,r5,r3 + lxvd2x vs24,r5,r3 addi r5,r5,16 - lxvx vs25,r5,r3 + lxvd2x vs25,r5,r3 addi r5,r5,16 - lxvx vs26,r5,r3 + lxvd2x vs26,r5,r3 addi r5,r5,16 - lxvx vs27,r5,r3 + lxvd2x vs27,r5,r3 addi r5,r5,16 - lxvx vs28,r5,r3 + lxvd2x vs28,r5,r3 addi r5,r5,16 - lxvx vs29,r5,r3 + lxvd2x vs29,r5,r3 addi r5,r5,16 - lxvx vs30,r5,r3 + lxvd2x vs30,r5,r3 addi r5,r5,16 - lxvx vs31,r5,r3 + lxvd2x vs31,r5,r3 blr FUNC_END(load_vsx) FUNC_START(store_vsx) li r5,0 - stxvx vs20,r5,r3 + stxvd2x vs20,r5,r3 addi r5,r5,16 - stxvx vs21,r5,r3 + stxvd2x vs21,r5,r3 addi r5,r5,16 - stxvx vs22,r5,r3 + stxvd2x vs22,r5,r3 addi r5,r5,16 - stxvx vs23,r5,r3 + stxvd2x vs23,r5,r3 addi r5,r5,16 - stxvx vs24,r5,r3 + stxvd2x vs24,r5,r3 addi r5,r5,16 - stxvx vs25,r5,r3 + stxvd2x vs25,r5,r3 addi r5,r5,16 - stxvx vs26,r5,r3 + stxvd2x vs26,r5,r3 addi r5,r5,16 - stxvx vs27,r5,r3 + stxvd2x vs27,r5,r3 addi r5,r5,16 - stxvx vs28,r5,r3 + stxvd2x vs28,r5,r3 addi r5,r5,16 - stxvx vs29,r5,r3 + stxvd2x vs29,r5,r3 addi r5,r5,16 - stxvx vs30,r5,r3 + stxvd2x vs30,r5,r3 addi r5,r5,16 - stxvx vs31,r5,r3 + stxvd2x vs31,r5,r3 blr FUNC_END(store_vsx) diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index f6121612e769..b9a22f18566a 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -409,6 +409,51 @@ static void *threadproc(void *ctx) } } +#ifdef __i386__ + +#ifndef SA_RESTORE +#define SA_RESTORER 0x04000000 +#endif + +/* + * The UAPI header calls this 'struct sigaction', which conflicts with + * glibc. Sigh. + */ +struct fake_ksigaction { + void *handler; /* the real type is nasty */ + unsigned long sa_flags; + void (*sa_restorer)(void); + unsigned char sigset[8]; +}; + +static void fix_sa_restorer(int sig) +{ + struct fake_ksigaction ksa; + + if (syscall(SYS_rt_sigaction, sig, NULL, &ksa, 8) == 0) { + /* + * glibc has a nasty bug: it sometimes writes garbage to + * sa_restorer. This interacts quite badly with anything + * that fiddles with SS because it can trigger legacy + * stack switching. Patch it up. See: + * + * https://sourceware.org/bugzilla/show_bug.cgi?id=21269 + */ + if (!(ksa.sa_flags & SA_RESTORER) && ksa.sa_restorer) { + ksa.sa_restorer = NULL; + if (syscall(SYS_rt_sigaction, sig, &ksa, NULL, + sizeof(ksa.sigset)) != 0) + err(1, "rt_sigaction"); + } + } +} +#else +static void fix_sa_restorer(int sig) +{ + /* 64-bit glibc works fine. */ +} +#endif + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -420,6 +465,7 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), if (sigaction(sig, &sa, 0)) err(1, "sigaction"); + fix_sa_restorer(sig); } static jmp_buf jmpbuf; diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 571b64a01c50..8d1da1af4b09 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -360,29 +360,6 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) return ret; } -static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, - struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - u32 reg = 0; - - mutex_lock(&its->cmd_lock); - if (its->creadr == its->cwriter) - reg |= GITS_CTLR_QUIESCENT; - if (its->enabled) - reg |= GITS_CTLR_ENABLE; - mutex_unlock(&its->cmd_lock); - - return reg; -} - -static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) -{ - its->enabled = !!(val & GITS_CTLR_ENABLE); -} - static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) @@ -1161,33 +1138,16 @@ static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its, #define ITS_CMD_SIZE 32 #define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5)) -/* - * By writing to CWRITER the guest announces new commands to be processed. - * To avoid any races in the first place, we take the its_cmd lock, which - * protects our ring buffer variables, so that there is only one user - * per ITS handling commands at a given time. - */ -static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len, - unsigned long val) +/* Must be called with the cmd_lock held. */ +static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) { gpa_t cbaser; u64 cmd_buf[4]; - u32 reg; - if (!its) - return; - - mutex_lock(&its->cmd_lock); - - reg = update_64bit_reg(its->cwriter, addr & 7, len, val); - reg = ITS_CMD_OFFSET(reg); - if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { - mutex_unlock(&its->cmd_lock); + /* Commands are only processed when the ITS is enabled. */ + if (!its->enabled) return; - } - its->cwriter = reg; cbaser = CBASER_ADDRESS(its->cbaser); while (its->cwriter != its->creadr) { @@ -1207,6 +1167,34 @@ static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser)) its->creadr = 0; } +} + +/* + * By writing to CWRITER the guest announces new commands to be processed. + * To avoid any races in the first place, we take the its_cmd lock, which + * protects our ring buffer variables, so that there is only one user + * per ITS handling commands at a given time. + */ +static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u64 reg; + + if (!its) + return; + + mutex_lock(&its->cmd_lock); + + reg = update_64bit_reg(its->cwriter, addr & 7, len, val); + reg = ITS_CMD_OFFSET(reg); + if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { + mutex_unlock(&its->cmd_lock); + return; + } + its->cwriter = reg; + + vgic_its_process_commands(kvm, its); mutex_unlock(&its->cmd_lock); } @@ -1287,6 +1275,39 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm, *regptr = reg; } +static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u32 reg = 0; + + mutex_lock(&its->cmd_lock); + if (its->creadr == its->cwriter) + reg |= GITS_CTLR_QUIESCENT; + if (its->enabled) + reg |= GITS_CTLR_ENABLE; + mutex_unlock(&its->cmd_lock); + + return reg; +} + +static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + mutex_lock(&its->cmd_lock); + + its->enabled = !!(val & GITS_CTLR_ENABLE); + + /* + * Try to process any pending commands. This function bails out early + * if the ITS is disabled or no commands have been queued. + */ + vgic_its_process_commands(kvm, its); + + mutex_unlock(&its->cmd_lock); +} + #define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ { \ .reg_offset = off, \ diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 3654b4c835ef..2a5db1352722 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -180,21 +180,37 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool new_active_state) { + struct kvm_vcpu *requester_vcpu; spin_lock(&irq->irq_lock); + + /* + * The vcpu parameter here can mean multiple things depending on how + * this function is called; when handling a trap from the kernel it + * depends on the GIC version, and these functions are also called as + * part of save/restore from userspace. + * + * Therefore, we have to figure out the requester in a reliable way. + * + * When accessing VGIC state from user space, the requester_vcpu is + * NULL, which is fine, because we guarantee that no VCPUs are running + * when accessing VGIC state from user space so irq->vcpu->cpu is + * always -1. + */ + requester_vcpu = kvm_arm_get_running_vcpu(); + /* * If this virtual IRQ was written into a list register, we * have to make sure the CPU that runs the VCPU thread has - * synced back LR state to the struct vgic_irq. We can only - * know this for sure, when either this irq is not assigned to - * anyone's AP list anymore, or the VCPU thread is not - * running on any CPUs. + * synced back the LR state to the struct vgic_irq. * - * In the opposite case, we know the VCPU thread may be on its - * way back from the guest and still has to sync back this - * IRQ, so we release and re-acquire the spin_lock to let the - * other thread sync back the IRQ. + * As long as the conditions below are true, we know the VCPU thread + * may be on its way back from the guest (we kicked the VCPU thread in + * vgic_change_active_prepare) and still has to sync back this IRQ, + * so we release and re-acquire the spin_lock to let the other thread + * sync back the IRQ. */ while (irq->vcpu && /* IRQ may have state in an LR somewhere */ + irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */ irq->vcpu->cpu != -1) /* VCPU thread is running */ cond_resched_lock(&irq->irq_lock); diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index edc6ee2dc852..be0f4c3e0142 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -229,10 +229,13 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) /* * If we are emulating a GICv3, we do it in an non-GICv2-compatible * way, so we force SRE to 1 to demonstrate this to the guest. + * Also, we don't support any form of IRQ/FIQ bypass. * This goes with the spec allowing the value to be RAO/WI. */ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - vgic_v3->vgic_sre = ICC_SRE_EL1_SRE; + vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB | + ICC_SRE_EL1_DFB | + ICC_SRE_EL1_SRE); vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; } else { vgic_v3->vgic_sre = 0; |