diff options
363 files changed, 7014 insertions, 3961 deletions
diff --git a/.gitattributes b/.gitattributes index c9ba5bfc4036..2325c529e185 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,3 +2,4 @@ *.[ch] diff=cpp *.dts diff=dts *.dts[io] diff=dts +*.rs diff=rust @@ -70,6 +70,8 @@ Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang@unisoc.com> Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang7@gmail.com> Bart Van Assche <bvanassche@acm.org> <bart.vanassche@sandisk.com> Bart Van Assche <bvanassche@acm.org> <bart.vanassche@wdc.com> +Ben Dooks <ben-linux@fluff.org> <ben.dooks@simtec.co.uk> +Ben Dooks <ben-linux@fluff.org> <ben.dooks@sifive.com> Ben Gardner <bgardner@wabtec.com> Ben M Cahill <ben.m.cahill@intel.com> Ben Widawsky <bwidawsk@kernel.org> <ben@bwidawsk.net> @@ -181,6 +183,8 @@ Henrik Rydberg <rydberg@bitmath.org> Herbert Xu <herbert@gondor.apana.org.au> Huacai Chen <chenhuacai@kernel.org> <chenhc@lemote.com> Huacai Chen <chenhuacai@kernel.org> <chenhuacai@loongson.cn> +J. Bruce Fields <bfields@fieldses.org> <bfields@redhat.com> +J. Bruce Fields <bfields@fieldses.org> <bfields@citi.umich.edu> Jacob Shin <Jacob.Shin@amd.com> Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk@google.com> Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk.kim@samsung.com> diff --git a/Documentation/devicetree/bindings/firmware/qcom,scm.yaml b/Documentation/devicetree/bindings/firmware/qcom,scm.yaml index 367d04ad1923..83381f3a1341 100644 --- a/Documentation/devicetree/bindings/firmware/qcom,scm.yaml +++ b/Documentation/devicetree/bindings/firmware/qcom,scm.yaml @@ -71,6 +71,8 @@ properties: minItems: 1 maxItems: 3 + dma-coherent: true + interconnects: maxItems: 1 diff --git a/Documentation/devicetree/bindings/i2c/opencores,i2c-ocores.yaml b/Documentation/devicetree/bindings/i2c/opencores,i2c-ocores.yaml index 85d9efb743ee..d9ef86729011 100644 --- a/Documentation/devicetree/bindings/i2c/opencores,i2c-ocores.yaml +++ b/Documentation/devicetree/bindings/i2c/opencores,i2c-ocores.yaml @@ -60,6 +60,7 @@ properties: default: 0 regstep: + $ref: /schemas/types.yaml#/definitions/uint32 description: | deprecated, use reg-shift above deprecated: true diff --git a/Documentation/filesystems/directory-locking.rst b/Documentation/filesystems/directory-locking.rst index 504ba940c36c..dccd61c7c5c3 100644 --- a/Documentation/filesystems/directory-locking.rst +++ b/Documentation/filesystems/directory-locking.rst @@ -22,12 +22,11 @@ exclusive. 3) object removal. Locking rules: caller locks parent, finds victim, locks victim and calls the method. Locks are exclusive. -4) rename() that is _not_ cross-directory. Locking rules: caller locks -the parent and finds source and target. In case of exchange (with -RENAME_EXCHANGE in flags argument) lock both. In any case, -if the target already exists, lock it. If the source is a non-directory, -lock it. If we need to lock both, lock them in inode pointer order. -Then call the method. All locks are exclusive. +4) rename() that is _not_ cross-directory. Locking rules: caller locks the +parent and finds source and target. We lock both (provided they exist). If we +need to lock two inodes of different type (dir vs non-dir), we lock directory +first. If we need to lock two inodes of the same type, lock them in inode +pointer order. Then call the method. All locks are exclusive. NB: we might get away with locking the source (and target in exchange case) shared. @@ -44,15 +43,17 @@ All locks are exclusive. rules: * lock the filesystem - * lock parents in "ancestors first" order. + * lock parents in "ancestors first" order. If one is not ancestor of + the other, lock them in inode pointer order. * find source and target. * if old parent is equal to or is a descendent of target fail with -ENOTEMPTY * if new parent is equal to or is a descendent of source fail with -ELOOP - * If it's an exchange, lock both the source and the target. - * If the target exists, lock it. If the source is a non-directory, - lock it. If we need to lock both, do so in inode pointer order. + * Lock both the source and the target provided they exist. If we + need to lock two inodes of different type (dir vs non-dir), we lock + the directory first. If we need to lock two inodes of the same type, + lock them in inode pointer order. * call the method. All ->i_rwsem are taken exclusive. Again, we might get away with locking @@ -66,8 +67,9 @@ If no directory is its own ancestor, the scheme above is deadlock-free. Proof: - First of all, at any moment we have a partial ordering of the - objects - A < B iff A is an ancestor of B. + First of all, at any moment we have a linear ordering of the + objects - A < B iff (A is an ancestor of B) or (B is not an ancestor + of A and ptr(A) < ptr(B)). That ordering can change. However, the following is true: diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index ede672dedf11..cb845e8e5435 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -38,20 +38,14 @@ fail at runtime. Use cases ========= -By itself, the base fs-verity feature only provides integrity -protection, i.e. detection of accidental (non-malicious) corruption. +By itself, fs-verity only provides integrity protection, i.e. +detection of accidental (non-malicious) corruption. However, because fs-verity makes retrieving the file hash extremely efficient, it's primarily meant to be used as a tool to support authentication (detection of malicious modifications) or auditing (logging file hashes before use). -Trusted userspace code (e.g. operating system code running on a -read-only partition that is itself authenticated by dm-verity) can -authenticate the contents of an fs-verity file by using the -`FS_IOC_MEASURE_VERITY`_ ioctl to retrieve its hash, then verifying a -digital signature of it. - A standard file hash could be used instead of fs-verity. However, this is inefficient if the file is large and only a small portion may be accessed. This is often the case for Android application package @@ -69,24 +63,31 @@ still be used on read-only filesystems. fs-verity is for files that must live on a read-write filesystem because they are independently updated and potentially user-installed, so dm-verity cannot be used. -The base fs-verity feature is a hashing mechanism only; actually -authenticating the files may be done by: - -* Userspace-only - -* Builtin signature verification + userspace policy - - fs-verity optionally supports a simple signature verification - mechanism where users can configure the kernel to require that - all fs-verity files be signed by a key loaded into a keyring; - see `Built-in signature verification`_. - -* Integrity Measurement Architecture (IMA) - - IMA supports including fs-verity file digests and signatures in the - IMA measurement list and verifying fs-verity based file signatures - stored as security.ima xattrs, based on policy. - +fs-verity does not mandate a particular scheme for authenticating its +file hashes. (Similarly, dm-verity does not mandate a particular +scheme for authenticating its block device root hashes.) Options for +authenticating fs-verity file hashes include: + +- Trusted userspace code. Often, the userspace code that accesses + files can be trusted to authenticate them. Consider e.g. an + application that wants to authenticate data files before using them, + or an application loader that is part of the operating system (which + is already authenticated in a different way, such as by being loaded + from a read-only partition that uses dm-verity) and that wants to + authenticate applications before loading them. In these cases, this + trusted userspace code can authenticate a file's contents by + retrieving its fs-verity digest using `FS_IOC_MEASURE_VERITY`_, then + verifying a signature of it using any userspace cryptographic + library that supports digital signatures. + +- Integrity Measurement Architecture (IMA). IMA supports fs-verity + file digests as an alternative to its traditional full file digests. + "IMA appraisal" enforces that files contain a valid, matching + signature in their "security.ima" extended attribute, as controlled + by the IMA policy. For more information, see the IMA documentation. + +- Trusted userspace code in combination with `Built-in signature + verification`_. This approach should be used only with great care. User API ======== @@ -111,8 +112,7 @@ follows:: }; This structure contains the parameters of the Merkle tree to build for -the file, and optionally contains a signature. It must be initialized -as follows: +the file. It must be initialized as follows: - ``version`` must be 1. - ``hash_algorithm`` must be the identifier for the hash algorithm to @@ -129,12 +129,14 @@ as follows: file or device. Currently the maximum salt size is 32 bytes. - ``salt_ptr`` is the pointer to the salt, or NULL if no salt is provided. -- ``sig_size`` is the size of the signature in bytes, or 0 if no - signature is provided. Currently the signature is (somewhat - arbitrarily) limited to 16128 bytes. See `Built-in signature - verification`_ for more information. -- ``sig_ptr`` is the pointer to the signature, or NULL if no - signature is provided. +- ``sig_size`` is the size of the builtin signature in bytes, or 0 if no + builtin signature is provided. Currently the builtin signature is + (somewhat arbitrarily) limited to 16128 bytes. +- ``sig_ptr`` is the pointer to the builtin signature, or NULL if no + builtin signature is provided. A builtin signature is only needed + if the `Built-in signature verification`_ feature is being used. It + is not needed for IMA appraisal, and it is not needed if the file + signature is being handled entirely in userspace. - All reserved fields must be zeroed. FS_IOC_ENABLE_VERITY causes the filesystem to build a Merkle tree for @@ -158,7 +160,7 @@ fatal signal), no changes are made to the file. FS_IOC_ENABLE_VERITY can fail with the following errors: - ``EACCES``: the process does not have write access to the file -- ``EBADMSG``: the signature is malformed +- ``EBADMSG``: the builtin signature is malformed - ``EBUSY``: this ioctl is already running on the file - ``EEXIST``: the file already has verity enabled - ``EFAULT``: the caller provided inaccessible memory @@ -168,10 +170,10 @@ FS_IOC_ENABLE_VERITY can fail with the following errors: reserved bits are set; or the file descriptor refers to neither a regular file nor a directory. - ``EISDIR``: the file descriptor refers to a directory -- ``EKEYREJECTED``: the signature doesn't match the file -- ``EMSGSIZE``: the salt or signature is too long -- ``ENOKEY``: the fs-verity keyring doesn't contain the certificate - needed to verify the signature +- ``EKEYREJECTED``: the builtin signature doesn't match the file +- ``EMSGSIZE``: the salt or builtin signature is too long +- ``ENOKEY``: the ".fs-verity" keyring doesn't contain the certificate + needed to verify the builtin signature - ``ENOPKG``: fs-verity recognizes the hash algorithm, but it's not available in the kernel's crypto API as currently configured (e.g. for SHA-512, missing CONFIG_CRYPTO_SHA512). @@ -180,8 +182,8 @@ FS_IOC_ENABLE_VERITY can fail with the following errors: support; or the filesystem superblock has not had the 'verity' feature enabled on it; or the filesystem does not support fs-verity on this file. (See `Filesystem support`_.) -- ``EPERM``: the file is append-only; or, a signature is required and - one was not provided. +- ``EPERM``: the file is append-only; or, a builtin signature is + required and one was not provided. - ``EROFS``: the filesystem is read-only - ``ETXTBSY``: someone has the file open for writing. This can be the caller's file descriptor, another open file descriptor, or the file @@ -270,9 +272,9 @@ This ioctl takes in a pointer to the following structure:: - ``FS_VERITY_METADATA_TYPE_DESCRIPTOR`` reads the fs-verity descriptor. See `fs-verity descriptor`_. -- ``FS_VERITY_METADATA_TYPE_SIGNATURE`` reads the signature which was - passed to FS_IOC_ENABLE_VERITY, if any. See `Built-in signature - verification`_. +- ``FS_VERITY_METADATA_TYPE_SIGNATURE`` reads the builtin signature + which was passed to FS_IOC_ENABLE_VERITY, if any. See `Built-in + signature verification`_. The semantics are similar to those of ``pread()``. ``offset`` specifies the offset in bytes into the metadata item to read from, and @@ -299,7 +301,7 @@ FS_IOC_READ_VERITY_METADATA can fail with the following errors: overflowed - ``ENODATA``: the file is not a verity file, or FS_VERITY_METADATA_TYPE_SIGNATURE was requested but the file doesn't - have a built-in signature + have a builtin signature - ``ENOTTY``: this type of filesystem does not implement fs-verity, or this ioctl is not yet implemented on it - ``EOPNOTSUPP``: the kernel was not configured with fs-verity @@ -347,8 +349,8 @@ non-verity one, with the following exceptions: with EIO (for read()) or SIGBUS (for mmap() reads). - If the sysctl "fs.verity.require_signatures" is set to 1 and the - file is not signed by a key in the fs-verity keyring, then opening - the file will fail. See `Built-in signature verification`_. + file is not signed by a key in the ".fs-verity" keyring, then + opening the file will fail. See `Built-in signature verification`_. Direct access to the Merkle tree is not supported. Therefore, if a verity file is copied, or is backed up and restored, then it will lose @@ -433,20 +435,25 @@ root hash as well as other fields such as the file size:: Built-in signature verification =============================== -With CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y, fs-verity supports putting -a portion of an authentication policy (see `Use cases`_) in the -kernel. Specifically, it adds support for: +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y adds supports for in-kernel +verification of fs-verity builtin signatures. + +**IMPORTANT**! Please take great care before using this feature. +It is not the only way to do signatures with fs-verity, and the +alternatives (such as userspace signature verification, and IMA +appraisal) can be much better. It's also easy to fall into a trap +of thinking this feature solves more problems than it actually does. + +Enabling this option adds the following: -1. At fs-verity module initialization time, a keyring ".fs-verity" is - created. The root user can add trusted X.509 certificates to this - keyring using the add_key() system call, then (when done) - optionally use keyctl_restrict_keyring() to prevent additional - certificates from being added. +1. At boot time, the kernel creates a keyring named ".fs-verity". The + root user can add trusted X.509 certificates to this keyring using + the add_key() system call. 2. `FS_IOC_ENABLE_VERITY`_ accepts a pointer to a PKCS#7 formatted detached signature in DER format of the file's fs-verity digest. - On success, this signature is persisted alongside the Merkle tree. - Then, any time the file is opened, the kernel will verify the + On success, the ioctl persists the signature alongside the Merkle + tree. Then, any time the file is opened, the kernel verifies the file's actual digest against this signature, using the certificates in the ".fs-verity" keyring. @@ -454,8 +461,8 @@ kernel. Specifically, it adds support for: When set to 1, the kernel requires that all verity files have a correctly signed digest as described in (2). -fs-verity file digests must be signed in the following format, which -is similar to the structure used by `FS_IOC_MEASURE_VERITY`_:: +The data that the signature as described in (2) must be a signature of +is the fs-verity file digest in the following format:: struct fsverity_formatted_digest { char magic[8]; /* must be "FSVerity" */ @@ -464,13 +471,66 @@ is similar to the structure used by `FS_IOC_MEASURE_VERITY`_:: __u8 digest[]; }; -fs-verity's built-in signature verification support is meant as a -relatively simple mechanism that can be used to provide some level of -authenticity protection for verity files, as an alternative to doing -the signature verification in userspace or using IMA-appraisal. -However, with this mechanism, userspace programs still need to check -that the verity bit is set, and there is no protection against verity -files being swapped around. +That's it. It should be emphasized again that fs-verity builtin +signatures are not the only way to do signatures with fs-verity. See +`Use cases`_ for an overview of ways in which fs-verity can be used. +fs-verity builtin signatures have some major limitations that should +be carefully considered before using them: + +- Builtin signature verification does *not* make the kernel enforce + that any files actually have fs-verity enabled. Thus, it is not a + complete authentication policy. Currently, if it is used, the only + way to complete the authentication policy is for trusted userspace + code to explicitly check whether files have fs-verity enabled with a + signature before they are accessed. (With + fs.verity.require_signatures=1, just checking whether fs-verity is + enabled suffices.) But, in this case the trusted userspace code + could just store the signature alongside the file and verify it + itself using a cryptographic library, instead of using this feature. + +- A file's builtin signature can only be set at the same time that + fs-verity is being enabled on the file. Changing or deleting the + builtin signature later requires re-creating the file. + +- Builtin signature verification uses the same set of public keys for + all fs-verity enabled files on the system. Different keys cannot be + trusted for different files; each key is all or nothing. + +- The sysctl fs.verity.require_signatures applies system-wide. + Setting it to 1 only works when all users of fs-verity on the system + agree that it should be set to 1. This limitation can prevent + fs-verity from being used in cases where it would be helpful. + +- Builtin signature verification can only use signature algorithms + that are supported by the kernel. For example, the kernel does not + yet support Ed25519, even though this is often the signature + algorithm that is recommended for new cryptographic designs. + +- fs-verity builtin signatures are in PKCS#7 format, and the public + keys are in X.509 format. These formats are commonly used, + including by some other kernel features (which is why the fs-verity + builtin signatures use them), and are very feature rich. + Unfortunately, history has shown that code that parses and handles + these formats (which are from the 1990s and are based on ASN.1) + often has vulnerabilities as a result of their complexity. This + complexity is not inherent to the cryptography itself. + + fs-verity users who do not need advanced features of X.509 and + PKCS#7 should strongly consider using simpler formats, such as plain + Ed25519 keys and signatures, and verifying signatures in userspace. + + fs-verity users who choose to use X.509 and PKCS#7 anyway should + still consider that verifying those signatures in userspace is more + flexible (for other reasons mentioned earlier in this document) and + eliminates the need to enable CONFIG_FS_VERITY_BUILTIN_SIGNATURES + and its associated increase in kernel attack surface. In some cases + it can even be necessary, since advanced X.509 and PKCS#7 features + do not always work as intended with the kernel. For example, the + kernel does not check X.509 certificate validity times. + + Note: IMA appraisal, which supports fs-verity, does not use PKCS#7 + for its signatures, so it partially avoids the issues discussed + here. IMA appraisal does use X.509. Filesystem support ================== diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index ef540865ad22..5cf6a5f8ca57 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -31,7 +31,7 @@ you probably needn't concern yourself with pcmciautils. ====================== =============== ======================================== GNU C 5.1 gcc --version Clang/LLVM (optional) 11.0.0 clang --version -Rust (optional) 1.62.0 rustc --version +Rust (optional) 1.68.2 rustc --version bindgen (optional) 0.56.0 bindgen --version GNU make 3.82 make --version bash 4.2 bash --version diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index 13b7744b1e27..a8931512ed98 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -38,9 +38,9 @@ and run:: rustup override set $(scripts/min-tool-version.sh rustc) -Otherwise, fetch a standalone installer or install ``rustup`` from: +Otherwise, fetch a standalone installer from: - https://www.rust-lang.org + https://forge.rust-lang.org/infra/other-installation-methods.html#standalone Rust standard library source diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index f79987e16cf4..e7b07313550a 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -14,10 +14,6 @@ Programs can view status of the events via /sys/kernel/tracing/user_events_status and can both register and write data out via /sys/kernel/tracing/user_events_data. -Programs can also use /sys/kernel/tracing/dynamic_events to register and -delete user based events via the u: prefix. The format of the command to -dynamic_events is the same as the ioctl with the u: prefix applied. - Typically programs will register a set of events that they wish to expose to tools that can read trace_events (such as ftrace and perf). The registration process tells the kernel which address and bit to reflect if any tool has @@ -144,6 +140,9 @@ its name. Delete will only succeed if there are no references left to the event (in both user and kernel space). User programs should use a separate file to request deletes than the one used for registration due to this. +**NOTE:** By default events will auto-delete when there are no references left +to the event. Flags in the future may change this logic. + Unregistering ------------- If after registering an event it is no longer wanted to be updated then it can diff --git a/MAINTAINERS b/MAINTAINERS index 6992b7cc7095..233b9a3b3193 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9972,8 +9972,9 @@ M: Miquel Raynal <miquel.raynal@bootlin.com> L: linux-wpan@vger.kernel.org S: Maintained W: https://linux-wpan.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan-next.git +Q: https://patchwork.kernel.org/project/linux-wpan/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git F: Documentation/networking/ieee802154.rst F: drivers/net/ieee802154/ F: include/linux/ieee802154.h @@ -11274,6 +11275,10 @@ W: http://kernelnewbies.org/KernelJanitors KERNEL NFSD, SUNRPC, AND LOCKD SERVERS M: Chuck Lever <chuck.lever@oracle.com> M: Jeff Layton <jlayton@kernel.org> +R: Neil Brown <neilb@suse.de> +R: Olga Kornievskaia <kolga@netapp.com> +R: Dai Ngo <Dai.Ngo@oracle.com> +R: Tom Talpey <tom@talpey.com> L: linux-nfs@vger.kernel.org S: Supported W: http://nfs.sourceforge.net/ @@ -13269,10 +13274,11 @@ F: drivers/memory/mtk-smi.c F: include/soc/mediatek/smi.h MEDIATEK SWITCH DRIVER -M: Sean Wang <sean.wang@mediatek.com> +M: Arınç ÃœNAL <arinc.unal@arinc9.com> +M: Daniel Golle <daniel@makrotopia.org> M: Landen Chao <Landen.Chao@mediatek.com> M: DENG Qingfang <dqfext@gmail.com> -M: Daniel Golle <daniel@makrotopia.org> +M: Sean Wang <sean.wang@mediatek.com> L: netdev@vger.kernel.org S: Maintained F: drivers/net/dsa/mt7530-mdio.c @@ -16384,7 +16390,7 @@ F: Documentation/devicetree/bindings/pci/intel,keembay-pcie* F: drivers/pci/controller/dwc/pcie-keembay.c PCIE DRIVER FOR INTEL LGM GW SOC -M: Rahul Tanwar <rtanwar@maxlinear.com> +M: Chuanhua Lei <lchuanhua@maxlinear.com> L: linux-pci@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pci/intel-gw-pcie.yaml @@ -17827,7 +17833,7 @@ F: tools/testing/selftests/rtc/ Real-time Linux Analysis (RTLA) tools M: Daniel Bristot de Oliveira <bristot@kernel.org> M: Steven Rostedt <rostedt@goodmis.org> -L: linux-trace-devel@vger.kernel.org +L: linux-trace-kernel@vger.kernel.org S: Maintained F: Documentation/tools/rtla/ F: tools/tracing/rtla/ @@ -18397,7 +18403,7 @@ F: drivers/infiniband/ulp/rtrs/ RUNTIME VERIFICATION (RV) M: Daniel Bristot de Oliveira <bristot@kernel.org> M: Steven Rostedt <rostedt@goodmis.org> -L: linux-trace-devel@vger.kernel.org +L: linux-trace-kernel@vger.kernel.org S: Maintained F: Documentation/trace/rv/ F: include/linux/rv.h @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 4 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index f4db3e75d75f..f3cd04ff022d 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -222,6 +222,11 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) return false; } +static inline bool kvm_set_pmuserenr(u64 val) +{ + return false; +} + /* PMU Version in DFR Register */ #define ARMV8_PMU_DFR_VER_NI 0 #define ARMV8_PMU_DFR_VER_V3P4 0x5 diff --git a/arch/arm64/boot/dts/qcom/sc7180-idp.dts b/arch/arm64/boot/dts/qcom/sc7180-idp.dts index 9f052270e090..299ef5dc225a 100644 --- a/arch/arm64/boot/dts/qcom/sc7180-idp.dts +++ b/arch/arm64/boot/dts/qcom/sc7180-idp.dts @@ -393,6 +393,11 @@ qcom,spare-regs = <&tcsr_regs_2 0xb3e4>; }; +&scm { + /* TF-A firmware maps memory cached so mark dma-coherent to match. */ + dma-coherent; +}; + &sdhc_1 { status = "okay"; diff --git a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi index ca6920de7ea8..1472e7f10831 100644 --- a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi @@ -892,6 +892,11 @@ hp_i2c: &i2c9 { qcom,spare-regs = <&tcsr_regs_2 0xb3e4>; }; +&scm { + /* TF-A firmware maps memory cached so mark dma-coherent to match. */ + dma-coherent; +}; + &sdhc_1 { status = "okay"; diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi b/arch/arm64/boot/dts/qcom/sc7180.dtsi index f479cab8ab45..a65be760d1a7 100644 --- a/arch/arm64/boot/dts/qcom/sc7180.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi @@ -369,7 +369,7 @@ }; firmware { - scm { + scm: scm { compatible = "qcom,scm-sc7180", "qcom,scm"; }; }; diff --git a/arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi b/arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi index f562e4d2b655..2e1cd219fc18 100644 --- a/arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7280-chrome-common.dtsi @@ -79,6 +79,11 @@ firmware-name = "ath11k/WCN6750/hw1.0/wpss.mdt"; }; +&scm { + /* TF-A firmware maps memory cached so mark dma-coherent to match. */ + dma-coherent; +}; + &wifi { status = "okay"; diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi b/arch/arm64/boot/dts/qcom/sc7280.dtsi index 2fd1d3c0eb34..36f0bb9b3cbb 100644 --- a/arch/arm64/boot/dts/qcom/sc7280.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi @@ -656,7 +656,7 @@ }; firmware { - scm { + scm: scm { compatible = "qcom,scm-sc7280", "qcom,scm"; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3308.dtsi b/arch/arm64/boot/dts/rockchip/rk3308.dtsi index dd228a256a32..2ae4bb7d5e62 100644 --- a/arch/arm64/boot/dts/rockchip/rk3308.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3308.dtsi @@ -97,6 +97,7 @@ l2: l2-cache { compatible = "cache"; cache-level = <2>; + cache-unified; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts index f69a38f42d2d..0a27fa5271f5 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts @@ -37,7 +37,8 @@ vin-supply = <&vcc_io>; }; - vcc_host_5v: vcc-host-5v-regulator { + /* Common enable line for all of the rails mentioned in the labels */ + vcc_host_5v: vcc_host1_5v: vcc_otg_5v: vcc-host-5v-regulator { compatible = "regulator-fixed"; gpio = <&gpio0 RK_PA2 GPIO_ACTIVE_LOW>; pinctrl-names = "default"; @@ -48,17 +49,6 @@ vin-supply = <&vcc_sys>; }; - vcc_host1_5v: vcc_otg_5v: vcc-host1-5v-regulator { - compatible = "regulator-fixed"; - gpio = <&gpio0 RK_PA2 GPIO_ACTIVE_LOW>; - pinctrl-names = "default"; - pinctrl-0 = <&usb20_host_drv>; - regulator-name = "vcc_host1_5v"; - regulator-always-on; - regulator-boot-on; - vin-supply = <&vcc_sys>; - }; - vcc_sys: vcc-sys { compatible = "regulator-fixed"; regulator-name = "vcc_sys"; diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 6d7a7bf72ac7..e729e7a22b23 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -103,6 +103,7 @@ l2: l2-cache0 { compatible = "cache"; cache-level = <2>; + cache-unified; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3566-soquartz-cm4.dts b/arch/arm64/boot/dts/rockchip/rk3566-soquartz-cm4.dts index 263ce40770dd..cddf6cd2fecb 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-soquartz-cm4.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-soquartz-cm4.dts @@ -28,6 +28,16 @@ regulator-max-microvolt = <5000000>; vin-supply = <&vcc12v_dcin>; }; + + vcc_sd_pwr: vcc-sd-pwr-regulator { + compatible = "regulator-fixed"; + regulator-name = "vcc_sd_pwr"; + regulator-always-on; + regulator-boot-on; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + vin-supply = <&vcc3v3_sys>; + }; }; /* phy for pcie */ @@ -130,13 +140,7 @@ }; &sdmmc0 { - vmmc-supply = <&sdmmc_pwr>; - status = "okay"; -}; - -&sdmmc_pwr { - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; + vmmc-supply = <&vcc_sd_pwr>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi index 102e448bc026..31aa2b8efe39 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-soquartz.dtsi @@ -104,16 +104,6 @@ regulator-max-microvolt = <3300000>; vin-supply = <&vcc5v0_sys>; }; - - sdmmc_pwr: sdmmc-pwr-regulator { - compatible = "regulator-fixed"; - enable-active-high; - gpio = <&gpio0 RK_PA5 GPIO_ACTIVE_HIGH>; - pinctrl-names = "default"; - pinctrl-0 = <&sdmmc_pwr_h>; - regulator-name = "sdmmc_pwr"; - status = "disabled"; - }; }; &cpu0 { @@ -155,6 +145,19 @@ status = "disabled"; }; +&gpio0 { + nextrst-hog { + gpio-hog; + /* + * GPIO_ACTIVE_LOW + output-low here means that the pin is set + * to high, because output-low decides the value pre-inversion. + */ + gpios = <RK_PA5 GPIO_ACTIVE_LOW>; + line-name = "nEXTRST"; + output-low; + }; +}; + &gpu { mali-supply = <&vdd_gpu>; status = "okay"; @@ -538,12 +541,6 @@ rockchip,pins = <2 RK_PC2 RK_FUNC_GPIO &pcfg_pull_none>; }; }; - - sdmmc-pwr { - sdmmc_pwr_h: sdmmc-pwr-h { - rockchip,pins = <0 RK_PA5 RK_FUNC_GPIO &pcfg_pull_none>; - }; - }; }; &pmu_io_domains { diff --git a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5c.dts b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5c.dts index f70ca9f0470a..c718b8dbb9c6 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5c.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5c.dts @@ -106,7 +106,7 @@ rockchip-key { reset_button_pin: reset-button-pin { - rockchip,pins = <4 RK_PA0 RK_FUNC_GPIO &pcfg_pull_up>; + rockchip,pins = <0 RK_PB7 RK_FUNC_GPIO &pcfg_pull_up>; }; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts index 2a1118f15c29..b6ad8328c7eb 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-nanopi-r5s.dts @@ -134,4 +134,3 @@ }; }; }; - diff --git a/arch/arm64/boot/dts/rockchip/rk3568.dtsi b/arch/arm64/boot/dts/rockchip/rk3568.dtsi index ba67b58f05b7..f1be76a54ceb 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3568.dtsi @@ -94,9 +94,10 @@ power-domains = <&power RK3568_PD_PIPE>; reg = <0x3 0xc0400000 0x0 0x00400000>, <0x0 0xfe270000 0x0 0x00010000>, - <0x3 0x7f000000 0x0 0x01000000>; - ranges = <0x01000000 0x0 0x3ef00000 0x3 0x7ef00000 0x0 0x00100000>, - <0x02000000 0x0 0x00000000 0x3 0x40000000 0x0 0x3ef00000>; + <0x0 0xf2000000 0x0 0x00100000>; + ranges = <0x01000000 0x0 0xf2100000 0x0 0xf2100000 0x0 0x00100000>, + <0x02000000 0x0 0xf2200000 0x0 0xf2200000 0x0 0x01e00000>, + <0x03000000 0x0 0x40000000 0x3 0x40000000 0x0 0x40000000>; reg-names = "dbi", "apb", "config"; resets = <&cru SRST_PCIE30X1_POWERUP>; reset-names = "pipe"; @@ -146,9 +147,10 @@ power-domains = <&power RK3568_PD_PIPE>; reg = <0x3 0xc0800000 0x0 0x00400000>, <0x0 0xfe280000 0x0 0x00010000>, - <0x3 0xbf000000 0x0 0x01000000>; - ranges = <0x01000000 0x0 0x3ef00000 0x3 0xbef00000 0x0 0x00100000>, - <0x02000000 0x0 0x00000000 0x3 0x80000000 0x0 0x3ef00000>; + <0x0 0xf0000000 0x0 0x00100000>; + ranges = <0x01000000 0x0 0xf0100000 0x0 0xf0100000 0x0 0x00100000>, + <0x02000000 0x0 0xf0200000 0x0 0xf0200000 0x0 0x01e00000>, + <0x03000000 0x0 0x40000000 0x3 0x80000000 0x0 0x40000000>; reg-names = "dbi", "apb", "config"; resets = <&cru SRST_PCIE30X2_POWERUP>; reset-names = "pipe"; diff --git a/arch/arm64/boot/dts/rockchip/rk356x.dtsi b/arch/arm64/boot/dts/rockchip/rk356x.dtsi index f62e0fd881a9..61680c7ac489 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x.dtsi @@ -952,7 +952,7 @@ compatible = "rockchip,rk3568-pcie"; reg = <0x3 0xc0000000 0x0 0x00400000>, <0x0 0xfe260000 0x0 0x00010000>, - <0x3 0x3f000000 0x0 0x01000000>; + <0x0 0xf4000000 0x0 0x00100000>; reg-names = "dbi", "apb", "config"; interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>, @@ -982,8 +982,9 @@ phys = <&combphy2 PHY_TYPE_PCIE>; phy-names = "pcie-phy"; power-domains = <&power RK3568_PD_PIPE>; - ranges = <0x01000000 0x0 0x3ef00000 0x3 0x3ef00000 0x0 0x00100000 - 0x02000000 0x0 0x00000000 0x3 0x00000000 0x0 0x3ef00000>; + ranges = <0x01000000 0x0 0xf4100000 0x0 0xf4100000 0x0 0x00100000>, + <0x02000000 0x0 0xf4200000 0x0 0xf4200000 0x0 0x01e00000>, + <0x03000000 0x0 0x40000000 0x3 0x00000000 0x0 0x40000000>; resets = <&cru SRST_PCIE20_POWERUP>; reset-names = "pipe"; #address-cells = <3>; diff --git a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi index 657c019d27fa..a3124bd2e092 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi @@ -229,6 +229,7 @@ cache-line-size = <64>; cache-sets = <512>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -238,6 +239,7 @@ cache-line-size = <64>; cache-sets = <512>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -247,6 +249,7 @@ cache-line-size = <64>; cache-sets = <512>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -256,6 +259,7 @@ cache-line-size = <64>; cache-sets = <512>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -265,6 +269,7 @@ cache-line-size = <64>; cache-sets = <1024>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -274,6 +279,7 @@ cache-line-size = <64>; cache-sets = <1024>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -283,6 +289,7 @@ cache-line-size = <64>; cache-sets = <1024>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -292,6 +299,7 @@ cache-line-size = <64>; cache-sets = <1024>; cache-level = <2>; + cache-unified; next-level-cache = <&l3_cache>; }; @@ -301,6 +309,7 @@ cache-line-size = <64>; cache-sets = <4096>; cache-level = <3>; + cache-unified; }; }; diff --git a/arch/arm64/hyperv/mshyperv.c b/arch/arm64/hyperv/mshyperv.c index a406454578f0..f1b8a04ee9f2 100644 --- a/arch/arm64/hyperv/mshyperv.c +++ b/arch/arm64/hyperv/mshyperv.c @@ -67,7 +67,7 @@ static int __init hyperv_init(void) if (ret) return ret; - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm64/hyperv_init:online", + ret = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "arm64/hyperv_init:online", hv_common_cpu_init, hv_common_cpu_die); if (ret < 0) { hv_common_free(); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7e7e19ef6993..9787503ff43f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -699,6 +699,8 @@ struct kvm_vcpu_arch { #define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(4)) /* Software step state is Active-pending */ #define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5)) +/* PMUSERENR for the guest EL0 is on physical CPU */ +#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(6)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ @@ -1065,9 +1067,14 @@ void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr); void kvm_clr_pmu_events(u32 clr); +bool kvm_set_pmuserenr(u64 val); #else static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u32 clr) {} +static inline bool kvm_set_pmuserenr(u64 val) +{ + return false; +} #endif void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 5c15c58f90cc..4fe217efa218 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -82,8 +82,14 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * EL1 instead of being trapped to EL2. */ if (kvm_arm_support_pmu_v3()) { + struct kvm_cpu_context *hctxt; + write_sysreg(0, pmselr_el0); + + hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); } vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); @@ -106,8 +112,13 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2); write_sysreg(0, hstr_el2); - if (kvm_arm_support_pmu_v3()) - write_sysreg(0, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) { + struct kvm_cpu_context *hctxt; + + hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); + vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); + } if (cpus_have_final_cap(ARM64_SME)) { sysreg_clear_set_s(SYS_HFGRTR_EL2, 0, diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 7a1aa511e7da..b37e7c96efea 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -92,14 +92,28 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) } NOKPROBE_SYMBOL(__deactivate_traps); +/* + * Disable IRQs in {activate,deactivate}_traps_vhe_{load,put}() to + * prevent a race condition between context switching of PMUSERENR_EL0 + * in __{activate,deactivate}_traps_common() and IPIs that attempts to + * update PMUSERENR_EL0. See also kvm_set_pmuserenr(). + */ void activate_traps_vhe_load(struct kvm_vcpu *vcpu) { + unsigned long flags; + + local_irq_save(flags); __activate_traps_common(vcpu); + local_irq_restore(flags); } void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) { + unsigned long flags; + + local_irq_save(flags); __deactivate_traps_common(vcpu); + local_irq_restore(flags); } static const exit_handler_fn hyp_exit_handlers[] = { diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 491ca7eb2a4c..560650972478 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -700,7 +700,25 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void) mutex_lock(&arm_pmus_lock); - cpu = smp_processor_id(); + /* + * It is safe to use a stale cpu to iterate the list of PMUs so long as + * the same value is used for the entirety of the loop. Given this, and + * the fact that no percpu data is used for the lookup there is no need + * to disable preemption. + * + * It is still necessary to get a valid cpu, though, to probe for the + * default PMU instance as userspace is not required to specify a PMU + * type. In order to uphold the preexisting behavior KVM selects the + * PMU instance for the core where the first call to the + * KVM_ARM_VCPU_PMU_V3_CTRL attribute group occurs. A dependent use case + * would be a user with disdain of all things big.LITTLE that affines + * the VMM to a particular cluster of cores. + * + * In any case, userspace should just do the sane thing and use the UAPI + * to select a PMU type directly. But, be wary of the baggage being + * carried here. + */ + cpu = raw_smp_processor_id(); list_for_each_entry(entry, &arm_pmus, entry) { tmp = entry->arm_pmu; diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 7887133d15f0..121f1a14c829 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -209,3 +209,30 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) kvm_vcpu_pmu_enable_el0(events_host); kvm_vcpu_pmu_disable_el0(events_guest); } + +/* + * With VHE, keep track of the PMUSERENR_EL0 value for the host EL0 on the pCPU + * where PMUSERENR_EL0 for the guest is loaded, since PMUSERENR_EL0 is switched + * to the value for the guest on vcpu_load(). The value for the host EL0 + * will be restored on vcpu_put(), before returning to userspace. + * This isn't necessary for nVHE, as the register is context switched for + * every guest enter/exit. + * + * Return true if KVM takes care of the register. Otherwise return false. + */ +bool kvm_set_pmuserenr(u64 val) +{ + struct kvm_cpu_context *hctxt; + struct kvm_vcpu *vcpu; + + if (!kvm_arm_support_pmu_v3() || !has_vhe()) + return false; + + vcpu = kvm_get_running_vcpu(); + if (!vcpu || !vcpu_get_flag(vcpu, PMUSERENR_ON_CPU)) + return false; + + hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val; + return true; +} diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 6eafc2c45cfc..c8c3cb812783 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -446,6 +446,7 @@ int vgic_lazy_init(struct kvm *kvm) int kvm_vgic_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + enum vgic_type type; gpa_t dist_base; int ret = 0; @@ -460,10 +461,13 @@ int kvm_vgic_map_resources(struct kvm *kvm) if (!irqchip_in_kernel(kvm)) goto out; - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { ret = vgic_v2_map_resources(kvm); - else + type = VGIC_V2; + } else { ret = vgic_v3_map_resources(kvm); + type = VGIC_V3; + } if (ret) { __kvm_vgic_destroy(kvm); @@ -473,8 +477,7 @@ int kvm_vgic_map_resources(struct kvm *kvm) dist_base = dist->vgic_dist_base; mutex_unlock(&kvm->arch.config_lock); - ret = vgic_register_dist_iodev(kvm, dist_base, - kvm_vgic_global_state.type); + ret = vgic_register_dist_iodev(kvm, dist_base, type); if (ret) { kvm_err("Unable to register VGIC dist MMIO regions\n"); kvm_vgic_destroy(kvm); diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index ecd1657bb2ce..ce6bb8e74271 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -28,10 +28,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, extern pgd_t *pgd_alloc(struct mm_struct *mm); -#define __pte_free_tlb(tlb, pte, addr) \ - do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), (pte)); \ +#define __pte_free_tlb(tlb, pte, addr) \ + do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) #endif /* _ASM_NIOS2_PGALLOC_H */ diff --git a/arch/nios2/kernel/cpuinfo.c b/arch/nios2/kernel/cpuinfo.c index 203870c4b86d..338849c430a5 100644 --- a/arch/nios2/kernel/cpuinfo.c +++ b/arch/nios2/kernel/cpuinfo.c @@ -47,7 +47,7 @@ void __init setup_cpuinfo(void) str = of_get_property(cpu, "altr,implementation", &len); if (str) - strlcpy(cpuinfo.cpu_impl, str, sizeof(cpuinfo.cpu_impl)); + strscpy(cpuinfo.cpu_impl, str, sizeof(cpuinfo.cpu_impl)); else strcpy(cpuinfo.cpu_impl, "<unknown>"); diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index 40bc8fb75e0b..8582ed965844 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -121,7 +121,7 @@ asmlinkage void __init nios2_boot_init(unsigned r4, unsigned r5, unsigned r6, dtb_passed = r6; if (r7) - strlcpy(cmdline_passed, (char *)r7, COMMAND_LINE_SIZE); + strscpy(cmdline_passed, (char *)r7, COMMAND_LINE_SIZE); } #endif @@ -129,10 +129,10 @@ asmlinkage void __init nios2_boot_init(unsigned r4, unsigned r5, unsigned r6, #ifndef CONFIG_CMDLINE_FORCE if (cmdline_passed[0]) - strlcpy(boot_command_line, cmdline_passed, COMMAND_LINE_SIZE); + strscpy(boot_command_line, cmdline_passed, COMMAND_LINE_SIZE); #ifdef CONFIG_NIOS2_CMDLINE_IGNORE_DTB else - strlcpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); + strscpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); #endif #endif diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index e715df5385d6..5656395c95ee 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -472,9 +472,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, #define pte_same(A,B) (pte_val(A) == pte_val(B)) -struct seq_file; -extern void arch_report_meminfo(struct seq_file *m); - #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 9972626ddaf6..6a88bfdaa69b 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -165,9 +165,6 @@ static inline bool is_ioremap_addr(const void *x) return addr >= IOREMAP_BASE && addr < IOREMAP_END; } - -struct seq_file; -void arch_report_meminfo(struct seq_file *m); #endif /* CONFIG_PPC64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index ce804b7bf84e..0bd4866d9824 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -795,12 +795,20 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) goto out; if (current->active_mm == mm) { + unsigned long flags; + WARN_ON_ONCE(current->mm != NULL); - /* Is a kernel thread and is using mm as the lazy tlb */ + /* + * It is a kernel thread and is using mm as the lazy tlb, so + * switch it to init_mm. This is not always called from IPI + * (e.g., flush_type_needed), so must disable irqs. + */ + local_irq_save(flags); mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; switch_mm_irqs_off(mm, &init_mm, current); mmdrop_lazy_tlb(mm); + local_irq_restore(flags); } /* diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index acb1f8b53105..c67f59db7a51 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -45,6 +45,13 @@ static void pgtable_populate(unsigned long addr, unsigned long end, enum populat static pte_t pte_z; +static inline void kasan_populate(unsigned long start, unsigned long end, enum populate_mode mode) +{ + start = PAGE_ALIGN_DOWN(__sha(start)); + end = PAGE_ALIGN(__sha(end)); + pgtable_populate(start, end, mode); +} + static void kasan_populate_shadow(void) { pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); @@ -95,17 +102,17 @@ static void kasan_populate_shadow(void) */ for_each_physmem_usable_range(i, &start, &end) - pgtable_populate(__sha(start), __sha(end), POPULATE_KASAN_MAP_SHADOW); + kasan_populate(start, end, POPULATE_KASAN_MAP_SHADOW); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { untracked_end = VMALLOC_START; /* shallowly populate kasan shadow for vmalloc and modules */ - pgtable_populate(__sha(VMALLOC_START), __sha(MODULES_END), POPULATE_KASAN_SHALLOW); + kasan_populate(VMALLOC_START, MODULES_END, POPULATE_KASAN_SHALLOW); } else { untracked_end = MODULES_VADDR; } /* populate kasan shadow for untracked memory */ - pgtable_populate(__sha(ident_map_size), __sha(untracked_end), POPULATE_KASAN_ZERO_SHADOW); - pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE), POPULATE_KASAN_ZERO_SHADOW); + kasan_populate(ident_map_size, untracked_end, POPULATE_KASAN_ZERO_SHADOW); + kasan_populate(MODULES_END, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW); } static bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index be3bf03bf361..aa95cf6dfabb 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -116,6 +116,7 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_NET_TC_SKB_EXT=y CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 769c7eed8b6a..f041945f9148 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -107,6 +107,7 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_NET_TC_SKB_EXT=y CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 6822a11c2c8a..c55f3c3365af 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -42,9 +42,6 @@ static inline void update_page_count(int level, long count) atomic_long_add(count, &direct_pages_count[level]); } -struct seq_file; -void arch_report_meminfo(struct seq_file *m); - /* * The S390 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h index 8e9c582592b3..9e41a74fce9a 100644 --- a/arch/s390/include/asm/physmem_info.h +++ b/arch/s390/include/asm/physmem_info.h @@ -3,6 +3,7 @@ #define _ASM_S390_MEM_DETECT_H #include <linux/types.h> +#include <asm/page.h> enum physmem_info_source { MEM_DETECT_NONE = 0, @@ -133,7 +134,7 @@ static inline const char *get_rr_type_name(enum reserved_range_type t) #define for_each_physmem_reserved_type_range(t, range, p_start, p_end) \ for (range = &physmem_info.reserved[t], *p_start = range->start, *p_end = range->end; \ - range && range->end; range = range->chain, \ + range && range->end; range = range->chain ? __va(range->chain) : NULL, \ *p_start = range ? range->start : 0, *p_end = range ? range->end : 0) static inline struct reserved_range *__physmem_reserved_next(enum reserved_range_type *t, @@ -145,7 +146,7 @@ static inline struct reserved_range *__physmem_reserved_next(enum reserved_range return range; } if (range->chain) - return range->chain; + return __va(range->chain); while (++*t < RR_MAX) { range = &physmem_info.reserved[*t]; if (range->end) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 8a617be28bb4..7af69948b290 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -568,9 +568,9 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { Elf64_Phdr *phdr_notes, *phdr_loads; + size_t alloc_size; int mem_chunk_cnt; void *ptr, *hdr; - u32 alloc_size; u64 hdr_off; /* If we are not in kdump or zfcp/nvme dump mode return */ diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 5ba3bd8a7b12..ca5a418c58a8 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -4,6 +4,7 @@ * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ #include <linux/hugetlb.h> +#include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <asm/cacheflush.h> diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 5b22c6e24528..b9dcb4ae6c59 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -667,7 +667,15 @@ static void __init memblock_region_swap(void *a, void *b, int size) #ifdef CONFIG_KASAN #define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) + +static inline int set_memory_kasan(unsigned long start, unsigned long end) +{ + start = PAGE_ALIGN_DOWN(__sha(start)); + end = PAGE_ALIGN(__sha(end)); + return set_memory_rwnx(start, (end - start) >> PAGE_SHIFT); +} #endif + /* * map whole physical memory to virtual memory (identity mapping) * we reserve enough space in the vmalloc area for vmemmap to hotplug @@ -737,10 +745,8 @@ void __init vmem_map_init(void) } #ifdef CONFIG_KASAN - for_each_mem_range(i, &base, &end) { - set_memory_rwnx(__sha(base), - (__sha(end) - __sha(base)) >> PAGE_SHIFT); - } + for_each_mem_range(i, &base, &end) + set_memory_kasan(base, end); #endif set_memory_rox((unsigned long)_stext, (unsigned long)(_etext - _stext) >> PAGE_SHIFT); diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b39975977c03..fdc2e3abd615 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -305,6 +305,18 @@ ifeq ($(RETPOLINE_CFLAGS),) endif endif +ifdef CONFIG_UNWINDER_ORC +orc_hash_h := arch/$(SRCARCH)/include/generated/asm/orc_hash.h +orc_hash_sh := $(srctree)/scripts/orc_hash.sh +targets += $(orc_hash_h) +quiet_cmd_orc_hash = GEN $@ + cmd_orc_hash = mkdir -p $(dir $@); \ + $(CONFIG_SHELL) $(orc_hash_sh) < $< > $@ +$(orc_hash_h): $(srctree)/arch/x86/include/asm/orc_types.h $(orc_hash_sh) FORCE + $(call if_changed,orc_hash) +archprepare: $(orc_hash_h) +endif + archclean: $(Q)rm -rf $(objtree)/arch/i386 $(Q)rm -rf $(objtree)/arch/x86_64 diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 89b9c1cebb61..27f3a7b34bd5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -349,6 +349,16 @@ static struct event_constraint intel_spr_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_gnr_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE), + INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); @@ -6496,6 +6506,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_EMERALDRAPIDS_X: x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + x86_pmu.extra_regs = intel_spr_extra_regs; fallthrough; case INTEL_FAM6_GRANITERAPIDS_X: case INTEL_FAM6_GRANITERAPIDS_D: @@ -6506,7 +6517,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_spr_event_constraints; x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; - x86_pmu.extra_regs = intel_spr_extra_regs; + if (!x86_pmu.extra_regs) + x86_pmu.extra_regs = intel_gnr_extra_regs; x86_pmu.limit_period = spr_limit_period; x86_pmu.pebs_ept = 1; x86_pmu.pebs_aliases = NULL; @@ -6650,6 +6662,7 @@ __init int intel_pmu_init(void) pmu->pebs_constraints = intel_grt_pebs_event_constraints; pmu->extra_regs = intel_grt_extra_regs; if (is_mtl(boot_cpu_data.x86_model)) { + x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].extra_regs = intel_gnr_extra_regs; x86_pmu.pebs_latency_data = mtl_latency_data_small; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index a5f9474f08e1..6c04b52f139b 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -416,7 +416,7 @@ void __init hyperv_init(void) goto free_vp_assist_page; } - cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online", hv_cpu_init, hv_cpu_die); if (cpuhp < 0) goto free_ghcb_page; diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 1ba5d3b99b16..85d38b9f3586 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -20,6 +20,8 @@ void __init hv_vtl_init_platform(void) { pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); + x86_platform.realmode_reserve = x86_init_noop; + x86_platform.realmode_init = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.timers.timer_init = x86_init_noop; diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 1e51650b79d7..4f1ce5fc4e19 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +generated-y += orc_hash.h generated-y += syscalls_32.h generated-y += syscalls_64.h generated-y += syscalls_x32.h diff --git a/arch/x86/include/asm/orc_header.h b/arch/x86/include/asm/orc_header.h new file mode 100644 index 000000000000..07bacf3e160e --- /dev/null +++ b/arch/x86/include/asm/orc_header.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ + +#ifndef _ORC_HEADER_H +#define _ORC_HEADER_H + +#include <linux/types.h> +#include <linux/compiler.h> +#include <asm/orc_hash.h> + +/* + * The header is currently a 20-byte hash of the ORC entry definition; see + * scripts/orc_hash.sh. + */ +#define ORC_HEADER \ + __used __section(".orc_header") __aligned(4) \ + static const u8 orc_header[] = { ORC_HASH } + +#endif /* _ORC_HEADER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 15ae4d6ba476..5700bb337987 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -27,6 +27,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); +struct seq_file; void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 447d4bee25c4..ba3e2554799a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -513,9 +513,6 @@ extern void native_pagetable_init(void); #define native_pagetable_init paging_init #endif -struct seq_file; -extern void arch_report_meminfo(struct seq_file *m); - enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6bde05a86b4e..896bc41cb2ba 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -97,7 +97,10 @@ static void init_x2apic_ldr(void) static int x2apic_phys_probe(void) { - if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) + if (!x2apic_mode) + return 0; + + if (x2apic_phys || x2apic_fadt_phys()) return 1; return apic == &apic_x2apic_phys; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 3ac50b7298d1..4d8e518365f4 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -7,6 +7,9 @@ #include <asm/unwind.h> #include <asm/orc_types.h> #include <asm/orc_lookup.h> +#include <asm/orc_header.h> + +ORC_HEADER; #define orc_warn(fmt, ...) \ printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 557f0fe25dff..37db264866b6 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -172,10 +172,10 @@ void __meminit init_trampoline_kaslr(void) set_p4d(p4d_tramp, __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)); } else { - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); } } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 7159cf787613..d1515756e369 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/seq_file.h> +#include <linux/proc_fs.h> #include <linux/debugfs.h> #include <linux/pfn.h> #include <linux/percpu.h> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1056bbf55b17..438adb695daa 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2570,7 +2570,7 @@ out_image: } if (bpf_jit_enable > 1) - bpf_jit_dump(prog->len, proglen, pass + 1, image); + bpf_jit_dump(prog->len, proglen, pass + 1, rw_image); if (image) { if (!prog->is_func || extra_pass) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f0b5c9c41cde..dce1548a7a0c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -970,6 +970,7 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); struct llist_node *lnode; struct blkg_iostat_set *bisc, *next_bisc; + unsigned long flags; rcu_read_lock(); @@ -983,7 +984,7 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) * When flushing from cgroup, cgroup_rstat_lock is always held, so * this lock won't cause contention most of time. */ - raw_spin_lock(&blkg_stat_lock); + raw_spin_lock_irqsave(&blkg_stat_lock, flags); /* * Iterate only the iostat_cpu's queued in the lockless list. @@ -1009,7 +1010,7 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); } - raw_spin_unlock(&blkg_stat_lock); + raw_spin_unlock_irqrestore(&blkg_stat_lock, flags); out: rcu_read_unlock(); } diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index e42c1f9ffff8..e9a1cb779b30 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -23,6 +23,7 @@ #include <linux/wait.h> #include <drm/drm_file.h> #include <drm/drm_gem.h> +#include <drm/drm_prime.h> #include <drm/drm_print.h> #include <uapi/drm/qaic_accel.h> @@ -616,8 +617,7 @@ static void qaic_free_object(struct drm_gem_object *obj) if (obj->import_attach) { /* DMABUF/PRIME Path */ - dma_buf_detach(obj->import_attach->dmabuf, obj->import_attach); - dma_buf_put(obj->import_attach->dmabuf); + drm_prime_gem_destroy(obj, NULL); } else { /* Private buffer allocation path */ qaic_free_sgt(bo->sgt); diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index ebf8fd373cf7..79bbfe00d241 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -101,8 +101,6 @@ acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, acpi_event_status *event_status); -acpi_status acpi_hw_disable_all_gpes(void); - acpi_status acpi_hw_enable_all_runtime_gpes(void); acpi_status acpi_hw_enable_all_wakeup_gpes(void); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 72470b9f16c4..f32570f72b90 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -636,11 +636,19 @@ static int acpi_suspend_enter(suspend_state_t pm_state) } /* - * Disable and clear GPE status before interrupt is enabled. Some GPEs - * (like wakeup GPE) haven't handler, this can avoid such GPE misfire. - * acpi_leave_sleep_state will reenable specific GPEs later + * Disable all GPE and clear their status bits before interrupts are + * enabled. Some GPEs (like wakeup GPEs) have no handlers and this can + * prevent them from producing spurious interrups. + * + * acpi_leave_sleep_state() will reenable specific GPEs later. + * + * Because this code runs on one CPU with disabled interrupts (all of + * the other CPUs are offline at this time), it need not acquire any + * sleeping locks which may trigger an implicit preemption point even + * if there is no contention, so avoid doing that by using a low-level + * library routine here. */ - acpi_disable_all_gpes(); + acpi_hw_disable_all_gpes(); /* Allow EC transactions to happen. */ acpi_ec_unblock_transactions(); diff --git a/drivers/auxdisplay/ht16k33.c b/drivers/auxdisplay/ht16k33.c index 02425991c159..d44814b9562a 100644 --- a/drivers/auxdisplay/ht16k33.c +++ b/drivers/auxdisplay/ht16k33.c @@ -820,7 +820,7 @@ static const struct of_device_id ht16k33_of_match[] = { MODULE_DEVICE_TABLE(of, ht16k33_of_match); static struct i2c_driver ht16k33_driver = { - .probe_new = ht16k33_probe, + .probe = ht16k33_probe, .remove = ht16k33_remove, .driver = { .name = DRIVER_NAME, diff --git a/drivers/auxdisplay/lcd2s.c b/drivers/auxdisplay/lcd2s.c index 135831a16514..6422be0dfe20 100644 --- a/drivers/auxdisplay/lcd2s.c +++ b/drivers/auxdisplay/lcd2s.c @@ -365,7 +365,7 @@ static struct i2c_driver lcd2s_i2c_driver = { .name = "lcd2s", .of_match_table = lcd2s_of_table, }, - .probe_new = lcd2s_i2c_probe, + .probe = lcd2s_i2c_probe, .remove = lcd2s_i2c_remove, .id_table = lcd2s_i2c_id, }; diff --git a/drivers/base/regmap/regmap-spi-avmm.c b/drivers/base/regmap/regmap-spi-avmm.c index 4c2b94b3e30b..6af692844c19 100644 --- a/drivers/base/regmap/regmap-spi-avmm.c +++ b/drivers/base/regmap/regmap-spi-avmm.c @@ -660,7 +660,7 @@ static const struct regmap_bus regmap_spi_avmm_bus = { .reg_format_endian_default = REGMAP_ENDIAN_NATIVE, .val_format_endian_default = REGMAP_ENDIAN_NATIVE, .max_raw_read = SPI_AVMM_VAL_SIZE * MAX_READ_CNT, - .max_raw_write = SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT, + .max_raw_write = SPI_AVMM_REG_SIZE + SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT, .free_context = spi_avmm_bridge_ctx_free, }; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2b918e28acaa..b47358da92a2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -348,63 +348,33 @@ static inline void virtblk_request_done(struct request *req) blk_mq_end_request(req, status); } -static void virtblk_complete_batch(struct io_comp_batch *iob) -{ - struct request *req; - - rq_list_for_each(&iob->req_list, req) { - virtblk_unmap_data(req, blk_mq_rq_to_pdu(req)); - virtblk_cleanup_cmd(req); - } - blk_mq_end_request_batch(iob); -} - -static int virtblk_handle_req(struct virtio_blk_vq *vq, - struct io_comp_batch *iob) -{ - struct virtblk_req *vbr; - int req_done = 0; - unsigned int len; - - while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { - struct request *req = blk_mq_rq_from_pdu(vbr); - - if (likely(!blk_should_fake_timeout(req->q)) && - !blk_mq_complete_request_remote(req) && - !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), - virtblk_complete_batch)) - virtblk_request_done(req); - req_done++; - } - - return req_done; -} - static void virtblk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; - struct virtio_blk_vq *vblk_vq = &vblk->vqs[vq->index]; - int req_done = 0; + bool req_done = false; + int qid = vq->index; + struct virtblk_req *vbr; unsigned long flags; - DEFINE_IO_COMP_BATCH(iob); + unsigned int len; - spin_lock_irqsave(&vblk_vq->lock, flags); + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); do { virtqueue_disable_cb(vq); - req_done += virtblk_handle_req(vblk_vq, &iob); + while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); + req_done = true; + } if (unlikely(virtqueue_is_broken(vq))) break; } while (!virtqueue_enable_cb(vq)); - if (req_done) { - if (!rq_list_empty(iob.req_list)) - iob.complete(&iob); - - /* In case queue is stopped waiting for more buffers. */ + /* In case queue is stopped waiting for more buffers. */ + if (req_done) blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); - } - spin_unlock_irqrestore(&vblk_vq->lock, flags); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx) @@ -1283,15 +1253,37 @@ static void virtblk_map_queues(struct blk_mq_tag_set *set) } } +static void virtblk_complete_batch(struct io_comp_batch *iob) +{ + struct request *req; + + rq_list_for_each(&iob->req_list, req) { + virtblk_unmap_data(req, blk_mq_rq_to_pdu(req)); + virtblk_cleanup_cmd(req); + } + blk_mq_end_request_batch(iob); +} + static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct virtio_blk *vblk = hctx->queue->queuedata; struct virtio_blk_vq *vq = get_virtio_blk_vq(hctx); + struct virtblk_req *vbr; unsigned long flags; + unsigned int len; int found = 0; spin_lock_irqsave(&vq->lock, flags); - found = virtblk_handle_req(vq, iob); + + while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + + found++; + if (!blk_mq_complete_request_remote(req) && + !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), + virtblk_complete_batch)) + virtblk_request_done(req); + } if (found) blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 01f2e86f3f7c..12cf6bb2e3ce 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -12,7 +12,6 @@ #include <linux/shmem_fs.h> #include <linux/slab.h> #include <linux/udmabuf.h> -#include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/iosys-map.h> @@ -207,9 +206,7 @@ static long udmabuf_create(struct miscdevice *device, struct udmabuf *ubuf; struct dma_buf *buf; pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit; - struct page *page, *hpage = NULL; - pgoff_t subpgoff, maxsubpgs; - struct hstate *hpstate; + struct page *page; int seals, ret = -EINVAL; u32 i, flags; @@ -245,7 +242,7 @@ static long udmabuf_create(struct miscdevice *device, if (!memfd) goto err; mapping = memfd->f_mapping; - if (!shmem_mapping(mapping) && !is_file_hugepages(memfd)) + if (!shmem_mapping(mapping)) goto err; seals = memfd_fcntl(memfd, F_GET_SEALS, 0); if (seals == -EINVAL) @@ -256,48 +253,16 @@ static long udmabuf_create(struct miscdevice *device, goto err; pgoff = list[i].offset >> PAGE_SHIFT; pgcnt = list[i].size >> PAGE_SHIFT; - if (is_file_hugepages(memfd)) { - hpstate = hstate_file(memfd); - pgoff = list[i].offset >> huge_page_shift(hpstate); - subpgoff = (list[i].offset & - ~huge_page_mask(hpstate)) >> PAGE_SHIFT; - maxsubpgs = huge_page_size(hpstate) >> PAGE_SHIFT; - } for (pgidx = 0; pgidx < pgcnt; pgidx++) { - if (is_file_hugepages(memfd)) { - if (!hpage) { - hpage = find_get_page_flags(mapping, pgoff, - FGP_ACCESSED); - if (!hpage) { - ret = -EINVAL; - goto err; - } - } - page = hpage + subpgoff; - get_page(page); - subpgoff++; - if (subpgoff == maxsubpgs) { - put_page(hpage); - hpage = NULL; - subpgoff = 0; - pgoff++; - } - } else { - page = shmem_read_mapping_page(mapping, - pgoff + pgidx); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto err; - } + page = shmem_read_mapping_page(mapping, pgoff + pgidx); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto err; } ubuf->pages[pgbuf++] = page; } fput(memfd); memfd = NULL; - if (hpage) { - put_page(hpage); - hpage = NULL; - } } exp_info.ops = &udmabuf_ops; diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index abeff7dc0b58..34b9e7876538 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -361,24 +361,6 @@ static void __init efi_debugfs_init(void) static inline void efi_debugfs_init(void) {} #endif -static void refresh_nv_rng_seed(struct work_struct *work) -{ - u8 seed[EFI_RANDOM_SEED_SIZE]; - - get_random_bytes(seed, sizeof(seed)); - efi.set_variable(L"RandomSeed", &LINUX_EFI_RANDOM_SEED_TABLE_GUID, - EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, sizeof(seed), seed); - memzero_explicit(seed, sizeof(seed)); -} -static int refresh_nv_rng_seed_notification(struct notifier_block *nb, unsigned long action, void *data) -{ - static DECLARE_WORK(work, refresh_nv_rng_seed); - schedule_work(&work); - return NOTIFY_DONE; -} -static struct notifier_block refresh_nv_rng_seed_nb = { .notifier_call = refresh_nv_rng_seed_notification }; - /* * We register the efi subsystem with the firmware subsystem and the * efivars subsystem with the efi subsystem, if the system was booted with @@ -451,9 +433,6 @@ static int __init efisubsys_init(void) platform_device_register_simple("efi_secret", 0, NULL, 0); #endif - if (efi_rt_services_supported(EFI_RT_SUPPORTED_SET_VARIABLE)) - execute_with_initialized_rng(&refresh_nv_rng_seed_nb); - return 0; err_remove_group: diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 98939cd4a71e..745e5f67254e 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -221,8 +221,12 @@ static int sifive_gpio_probe(struct platform_device *pdev) return -ENODEV; } - for (i = 0; i < ngpio; i++) - chip->irq_number[i] = platform_get_irq(pdev, i); + for (i = 0; i < ngpio; i++) { + ret = platform_get_irq(pdev, i); + if (ret < 0) + return ret; + chip->irq_number[i] = ret; + } ret = bgpio_init(&chip->gc, dev, 4, chip->base + SIFIVE_GPIO_INPUT_VAL, diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index a7220e04a93e..5be8ad61523e 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1745,7 +1745,7 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gc) } /* Remove all IRQ mappings and delete the domain */ - if (gc->irq.domain) { + if (!gc->irq.domain_is_allocated_externally && gc->irq.domain) { unsigned int irq; for (offset = 0; offset < gc->ngpio; offset++) { @@ -1791,6 +1791,15 @@ int gpiochip_irqchip_add_domain(struct gpio_chip *gc, gc->to_irq = gpiochip_to_irq; gc->irq.domain = domain; + gc->irq.domain_is_allocated_externally = true; + + /* + * Using barrier() here to prevent compiler from reordering + * gc->irq.initialized before adding irqdomain. + */ + barrier(); + + gc->irq.initialized = true; return 0; } diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c index 38dab76ae69e..e2e21ce79510 100644 --- a/drivers/gpu/drm/display/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c @@ -3404,7 +3404,7 @@ int drm_dp_add_payload_part2(struct drm_dp_mst_topology_mgr *mgr, /* Skip failed payloads */ if (payload->vc_start_slot == -1) { - drm_dbg_kms(state->dev, "Part 1 of payload creation for %s failed, skipping part 2\n", + drm_dbg_kms(mgr->dev, "Part 1 of payload creation for %s failed, skipping part 2\n", payload->port->connector->name); return -EIO; } diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 007f26d5f1a4..2f4d09ce027a 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -829,11 +829,22 @@ static void vmbus_wait_for_unload(void) if (completion_done(&vmbus_connection.unload_event)) goto completed; - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); + /* + * In a CoCo VM the synic_message_page is not allocated + * in hv_synic_alloc(). Instead it is set/cleared in + * hv_synic_enable_regs() and hv_synic_disable_regs() + * such that it is set only when the CPU is online. If + * not all present CPUs are online, the message page + * might be NULL, so skip such CPUs. + */ page_addr = hv_cpu->synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; @@ -867,11 +878,14 @@ completed: * maybe-pending messages on all CPUs to be able to receive new * messages after we reconnect. */ - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); page_addr = hv_cpu->synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; msg->header.message_type = HVMSG_NONE; } diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 64f9ceca887b..542a1d53b303 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -364,13 +364,20 @@ int hv_common_cpu_init(unsigned int cpu) flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL; inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); - if (!(*inputarg)) - return -ENOMEM; - if (hv_root_partition) { - outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE; + /* + * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already + * allocated if this CPU was previously online and then taken offline + */ + if (!*inputarg) { + *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); + if (!(*inputarg)) + return -ENOMEM; + + if (hv_root_partition) { + outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); + *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE; + } } msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX); @@ -385,24 +392,17 @@ int hv_common_cpu_init(unsigned int cpu) int hv_common_cpu_die(unsigned int cpu) { - unsigned long flags; - void **inputarg, **outputarg; - void *mem; - - local_irq_save(flags); - - inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - mem = *inputarg; - *inputarg = NULL; - - if (hv_root_partition) { - outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = NULL; - } - - local_irq_restore(flags); - - kfree(mem); + /* + * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory + * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg + * may be used by the Hyper-V vPCI driver in reassigning interrupts + * as part of the offlining process. The interrupt reassignment + * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and + * called this function. + * + * If a previously offlined CPU is brought back online again, the + * originally allocated memory is reused in hv_common_cpu_init(). + */ return 0; } diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 1c65a6dfb9fa..67f95a29aeca 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1372,7 +1372,7 @@ static int vmbus_bus_init(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", hv_synic_init, hv_synic_cleanup); if (ret < 0) - goto err_cpuhp; + goto err_alloc; hyperv_cpuhp_online = ret; ret = vmbus_connect(); @@ -1392,9 +1392,8 @@ static int vmbus_bus_init(void) err_connect: cpuhp_remove_state(hyperv_cpuhp_online); -err_cpuhp: - hv_synic_free(); err_alloc: + hv_synic_free(); if (vmbus_irq == -1) { hv_remove_vmbus_handler(); } else { diff --git a/drivers/i2c/busses/i2c-imx-lpi2c.c b/drivers/i2c/busses/i2c-imx-lpi2c.c index 1af0a637d7f1..4d24ceb57ee7 100644 --- a/drivers/i2c/busses/i2c-imx-lpi2c.c +++ b/drivers/i2c/busses/i2c-imx-lpi2c.c @@ -201,8 +201,8 @@ static void lpi2c_imx_stop(struct lpi2c_imx_struct *lpi2c_imx) /* CLKLO = I2C_CLK_RATIO * CLKHI, SETHOLD = CLKHI, DATAVD = CLKHI/2 */ static int lpi2c_imx_config(struct lpi2c_imx_struct *lpi2c_imx) { - u8 prescale, filt, sethold, clkhi, clklo, datavd; - unsigned int clk_rate, clk_cycle; + u8 prescale, filt, sethold, datavd; + unsigned int clk_rate, clk_cycle, clkhi, clklo; enum lpi2c_imx_pincfg pincfg; unsigned int temp; diff --git a/drivers/i2c/busses/i2c-qup.c b/drivers/i2c/busses/i2c-qup.c index 2e153f2f71b6..78682388e02e 100644 --- a/drivers/i2c/busses/i2c-qup.c +++ b/drivers/i2c/busses/i2c-qup.c @@ -1752,16 +1752,21 @@ nodma: if (!clk_freq || clk_freq > I2C_MAX_FAST_MODE_PLUS_FREQ) { dev_err(qup->dev, "clock frequency not supported %d\n", clk_freq); - return -EINVAL; + ret = -EINVAL; + goto fail_dma; } qup->base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(qup->base)) - return PTR_ERR(qup->base); + if (IS_ERR(qup->base)) { + ret = PTR_ERR(qup->base); + goto fail_dma; + } qup->irq = platform_get_irq(pdev, 0); - if (qup->irq < 0) - return qup->irq; + if (qup->irq < 0) { + ret = qup->irq; + goto fail_dma; + } if (has_acpi_companion(qup->dev)) { ret = device_property_read_u32(qup->dev, @@ -1775,13 +1780,15 @@ nodma: qup->clk = devm_clk_get(qup->dev, "core"); if (IS_ERR(qup->clk)) { dev_err(qup->dev, "Could not get core clock\n"); - return PTR_ERR(qup->clk); + ret = PTR_ERR(qup->clk); + goto fail_dma; } qup->pclk = devm_clk_get(qup->dev, "iface"); if (IS_ERR(qup->pclk)) { dev_err(qup->dev, "Could not get iface clock\n"); - return PTR_ERR(qup->pclk); + ret = PTR_ERR(qup->pclk); + goto fail_dma; } qup_i2c_enable_clocks(qup); src_clk_freq = clk_get_rate(qup->clk); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index dc1ec6849775..e8a2e5984acb 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2078,10 +2078,6 @@ static struct protection_domain *protection_domain_alloc(unsigned int type) int mode = DEFAULT_PGTABLE_LEVEL; int ret; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - /* * Force IOMMU v1 page table when iommu=pt and * when allocating domain for pass-through devices. @@ -2097,6 +2093,10 @@ static struct protection_domain *protection_domain_alloc(unsigned int type) return NULL; } + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + switch (pgtable) { case AMD_IOMMU_V1: ret = protection_domain_init_v1(domain, mode); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 9e0c69958587..acffed750e3e 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs cmd root_lock). - * - must take shrinker_mutex without holding cmd->root_lock + * - must take shrinker_rwsem without holding cmd->root_lock */ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index b9461faa9f0d..9dd0409848ab 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1891,7 +1891,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs pmd root_lock). - * - must take shrinker_mutex without holding pmd->root_lock + * - must take shrinker_rwsem without holding pmd->root_lock */ new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, THIN_MAX_CONCURRENT_LOCKS); diff --git a/drivers/mmc/host/bcm2835.c b/drivers/mmc/host/bcm2835.c index 8648f7e63ca1..eea208856ce0 100644 --- a/drivers/mmc/host/bcm2835.c +++ b/drivers/mmc/host/bcm2835.c @@ -1403,8 +1403,8 @@ static int bcm2835_probe(struct platform_device *pdev) host->max_clk = clk_get_rate(clk); host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) { - ret = -EINVAL; + if (host->irq < 0) { + ret = host->irq; goto err; } diff --git a/drivers/mmc/host/litex_mmc.c b/drivers/mmc/host/litex_mmc.c index 39c6707fdfdb..9af6b0902efe 100644 --- a/drivers/mmc/host/litex_mmc.c +++ b/drivers/mmc/host/litex_mmc.c @@ -649,6 +649,7 @@ static struct platform_driver litex_mmc_driver = { .driver = { .name = "litex-mmc", .of_match_table = litex_match, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; module_platform_driver(litex_mmc_driver); diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index b8514d9d5e73..ee9a25b900ae 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -991,11 +991,8 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id) if (data && !cmd->error) data->bytes_xfered = data->blksz * data->blocks; - if (meson_mmc_bounce_buf_read(data) || - meson_mmc_get_next_command(cmd)) - ret = IRQ_WAKE_THREAD; - else - ret = IRQ_HANDLED; + + return IRQ_WAKE_THREAD; } out: @@ -1007,9 +1004,6 @@ out: writel(start, host->regs + SD_EMMC_START); } - if (ret == IRQ_HANDLED) - meson_mmc_request_done(host->mmc, cmd->mrq); - return ret; } @@ -1192,8 +1186,8 @@ static int meson_mmc_probe(struct platform_device *pdev) return PTR_ERR(host->regs); host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) - return -EINVAL; + if (host->irq < 0) + return host->irq; cd_irq = platform_get_irq_optional(pdev, 1); mmc_gpio_set_cd_irq(mmc, cd_irq); diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index f2b2e8b0574e..696cbef3ff7d 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -1735,7 +1735,8 @@ static void mmci_set_max_busy_timeout(struct mmc_host *mmc) return; if (host->variant->busy_timeout && mmc->actual_clock) - max_busy_timeout = ~0UL / (mmc->actual_clock / MSEC_PER_SEC); + max_busy_timeout = U32_MAX / DIV_ROUND_UP(mmc->actual_clock, + MSEC_PER_SEC); mmc->max_busy_timeout = max_busy_timeout; } diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index edade0e54a0c..9785ec91654f 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -2680,7 +2680,7 @@ static int msdc_drv_probe(struct platform_device *pdev) host->irq = platform_get_irq(pdev, 0); if (host->irq < 0) { - ret = -EINVAL; + ret = host->irq; goto host_free; } diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index 629efbe639c4..b4f6a0a2fcb5 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -704,7 +704,7 @@ static int mvsd_probe(struct platform_device *pdev) } irq = platform_get_irq(pdev, 0); if (irq < 0) - return -ENXIO; + return irq; mmc = mmc_alloc_host(sizeof(struct mvsd_host), &pdev->dev); if (!mmc) { diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index ce78edfb402b..86454f1182bb 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1343,7 +1343,7 @@ static int mmc_omap_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) - return -ENXIO; + return irq; host->virt_base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(host->virt_base)) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 517dde777413..1e0f2d7774bd 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -1791,9 +1791,11 @@ static int omap_hsmmc_probe(struct platform_device *pdev) } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - irq = platform_get_irq(pdev, 0); - if (res == NULL || irq < 0) + if (!res) return -ENXIO; + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(base)) diff --git a/drivers/mmc/host/owl-mmc.c b/drivers/mmc/host/owl-mmc.c index 6f9d31a886ba..1bf22b08b373 100644 --- a/drivers/mmc/host/owl-mmc.c +++ b/drivers/mmc/host/owl-mmc.c @@ -637,7 +637,7 @@ static int owl_mmc_probe(struct platform_device *pdev) owl_host->irq = platform_get_irq(pdev, 0); if (owl_host->irq < 0) { - ret = -EINVAL; + ret = owl_host->irq; goto err_release_channel; } diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 8f0e639236b1..edf2e6c14dc6 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -829,7 +829,7 @@ static int sdhci_acpi_probe(struct platform_device *pdev) host->ops = &sdhci_acpi_ops_dflt; host->irq = platform_get_irq(pdev, 0); if (host->irq < 0) { - err = -EINVAL; + err = host->irq; goto err_free; } diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 8ac81d57a3df..1877d583fe8c 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -2479,6 +2479,9 @@ static inline void sdhci_msm_get_of_property(struct platform_device *pdev, msm_host->ddr_config = DDR_CONFIG_POR_VAL; of_property_read_u32(node, "qcom,dll-config", &msm_host->dll_config); + + if (of_device_is_compatible(node, "qcom,msm8916-sdhci")) + host->quirks2 |= SDHCI_QUIRK2_BROKEN_64_BIT_DMA; } static int sdhci_msm_gcc_reset(struct device *dev, struct sdhci_host *host) diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c index d463e2fd5b1a..c79035727b20 100644 --- a/drivers/mmc/host/sdhci-spear.c +++ b/drivers/mmc/host/sdhci-spear.c @@ -65,8 +65,8 @@ static int sdhci_probe(struct platform_device *pdev) host->hw_name = "sdhci"; host->ops = &sdhci_pltfm_ops; host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) { - ret = -EINVAL; + if (host->irq < 0) { + ret = host->irq; goto err_host; } host->quirks = SDHCI_QUIRK_BROKEN_ADMA; diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 0fd4c9d644dd..5cf53348372a 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -1400,7 +1400,7 @@ static int sh_mmcif_probe(struct platform_device *pdev) irq[0] = platform_get_irq(pdev, 0); irq[1] = platform_get_irq_optional(pdev, 1); if (irq[0] < 0) - return -ENXIO; + return irq[0]; reg = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(reg)) diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index 3db9f32d6a7b..69dcb8805e05 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -1350,8 +1350,8 @@ static int sunxi_mmc_resource_request(struct sunxi_mmc_host *host, return ret; host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) { - ret = -EINVAL; + if (host->irq < 0) { + ret = host->irq; goto error_disable_mmc; } diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c index 2f59917b105e..2e17903658fc 100644 --- a/drivers/mmc/host/usdhi6rol0.c +++ b/drivers/mmc/host/usdhi6rol0.c @@ -1757,8 +1757,10 @@ static int usdhi6_probe(struct platform_device *pdev) irq_cd = platform_get_irq_byname(pdev, "card detect"); irq_sd = platform_get_irq_byname(pdev, "data"); irq_sdio = platform_get_irq_byname(pdev, "SDIO"); - if (irq_sd < 0 || irq_sdio < 0) - return -ENODEV; + if (irq_sd < 0) + return irq_sd; + if (irq_sdio < 0) + return irq_sdio; mmc = mmc_alloc_host(sizeof(struct usdhi6_host), dev); if (!mmc) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 9bc54e1348cb..7e773c4ba046 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -399,6 +399,20 @@ static void mt7530_pll_setup(struct mt7530_priv *priv) core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN); } +/* If port 6 is available as a CPU port, always prefer that as the default, + * otherwise don't care. + */ +static struct dsa_port * +mt753x_preferred_default_local_cpu_port(struct dsa_switch *ds) +{ + struct dsa_port *cpu_dp = dsa_to_port(ds, 6); + + if (dsa_port_is_cpu(cpu_dp)) + return cpu_dp; + + return NULL; +} + /* Setup port 6 interface mode and TRGMII TX circuit */ static int mt7530_pad_clk_setup(struct dsa_switch *ds, phy_interface_t interface) @@ -985,6 +999,18 @@ unlock_exit: mutex_unlock(&priv->reg_mutex); } +static void +mt753x_trap_frames(struct mt7530_priv *priv) +{ + /* Trap BPDUs to the CPU port(s) */ + mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK, + MT753X_BPDU_CPU_ONLY); + + /* Trap LLDP frames with :0E MAC DA to the CPU port(s) */ + mt7530_rmw(priv, MT753X_RGAC2, MT753X_R0E_PORT_FW_MASK, + MT753X_R0E_PORT_FW(MT753X_BPDU_CPU_ONLY)); +} + static int mt753x_cpu_port_enable(struct dsa_switch *ds, int port) { @@ -1007,9 +1033,16 @@ mt753x_cpu_port_enable(struct dsa_switch *ds, int port) UNU_FFP(BIT(port))); /* Set CPU port number */ - if (priv->id == ID_MT7621) + if (priv->id == ID_MT7530 || priv->id == ID_MT7621) mt7530_rmw(priv, MT7530_MFC, CPU_MASK, CPU_EN | CPU_PORT(port)); + /* Add the CPU port to the CPU port bitmap for MT7531 and the switch on + * the MT7988 SoC. Trapped frames will be forwarded to the CPU port that + * is affine to the inbound user port. + */ + if (priv->id == ID_MT7531 || priv->id == ID_MT7988) + mt7530_set(priv, MT7531_CFC, MT7531_CPU_PMAP(BIT(port))); + /* CPU port gets connected to all user ports of * the switch. */ @@ -2255,6 +2288,8 @@ mt7530_setup(struct dsa_switch *ds) priv->p6_interface = PHY_INTERFACE_MODE_NA; + mt753x_trap_frames(priv); + /* Enable and reset MIB counters */ mt7530_mib_reset(ds); @@ -2352,17 +2387,9 @@ static int mt7531_setup_common(struct dsa_switch *ds) { struct mt7530_priv *priv = ds->priv; - struct dsa_port *cpu_dp; int ret, i; - /* BPDU to CPU port */ - dsa_switch_for_each_cpu_port(cpu_dp, ds) { - mt7530_rmw(priv, MT7531_CFC, MT7531_CPU_PMAP_MASK, - BIT(cpu_dp->index)); - break; - } - mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK, - MT753X_BPDU_CPU_ONLY); + mt753x_trap_frames(priv); /* Enable and reset MIB counters */ mt7530_mib_reset(ds); @@ -3085,6 +3112,7 @@ static int mt7988_setup(struct dsa_switch *ds) const struct dsa_switch_ops mt7530_switch_ops = { .get_tag_protocol = mtk_get_tag_protocol, .setup = mt753x_setup, + .preferred_default_local_cpu_port = mt753x_preferred_default_local_cpu_port, .get_strings = mt7530_get_strings, .get_ethtool_stats = mt7530_get_ethtool_stats, .get_sset_count = mt7530_get_sset_count, diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 5084f48a8869..08045b035e6a 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -54,6 +54,7 @@ enum mt753x_id { #define MT7531_MIRROR_PORT_GET(x) (((x) >> 16) & MIRROR_MASK) #define MT7531_MIRROR_PORT_SET(x) (((x) & MIRROR_MASK) << 16) #define MT7531_CPU_PMAP_MASK GENMASK(7, 0) +#define MT7531_CPU_PMAP(x) FIELD_PREP(MT7531_CPU_PMAP_MASK, x) #define MT753X_MIRROR_REG(id) ((((id) == ID_MT7531) || ((id) == ID_MT7988)) ? \ MT7531_CFC : MT7530_MFC) @@ -66,6 +67,11 @@ enum mt753x_id { #define MT753X_BPC 0x24 #define MT753X_BPDU_PORT_FW_MASK GENMASK(2, 0) +/* Register for :03 and :0E MAC DA frame control */ +#define MT753X_RGAC2 0x2c +#define MT753X_R0E_PORT_FW_MASK GENMASK(18, 16) +#define MT753X_R0E_PORT_FW(x) FIELD_PREP(MT753X_R0E_PORT_FW_MASK, x) + enum mt753x_bpdu_port_fw { MT753X_BPDU_FOLLOW_MFC, MT753X_BPDU_CPU_EXCLUDE = 4, diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 7e408bcc88de..0defd519ba62 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1135,8 +1135,8 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter, eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ? VLAN_ETH_HLEN : ETH_HLEN; if (skb->len <= 60 && - (lancer_chip(adapter) || skb_vlan_tag_present(skb)) && - is_ipv4_pkt(skb)) { + (lancer_chip(adapter) || BE3_chip(adapter) || + skb_vlan_tag_present(skb)) && is_ipv4_pkt(skb)) { ip = (struct iphdr *)ip_hdr(skb); pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len)); } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c index b1871e6c4006..00e50bd30189 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c @@ -54,6 +54,9 @@ static int phy_mode(enum dpmac_eth_if eth_if, phy_interface_t *if_mode) case DPMAC_ETH_IF_XFI: *if_mode = PHY_INTERFACE_MODE_10GBASER; break; + case DPMAC_ETH_IF_CAUI: + *if_mode = PHY_INTERFACE_MODE_25GBASER; + break; default: return -EINVAL; } @@ -79,6 +82,8 @@ static enum dpmac_eth_if dpmac_eth_if_mode(phy_interface_t if_mode) return DPMAC_ETH_IF_XFI; case PHY_INTERFACE_MODE_1000BASEX: return DPMAC_ETH_IF_1000BASEX; + case PHY_INTERFACE_MODE_25GBASER: + return DPMAC_ETH_IF_CAUI; default: return DPMAC_ETH_IF_MII; } @@ -418,7 +423,7 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac) mac->phylink_config.mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE | MAC_10FD | MAC_100FD | MAC_1000FD | MAC_2500FD | MAC_5000FD | - MAC_10000FD; + MAC_10000FD | MAC_25000FD; dpaa2_mac_set_supported_interfaces(mac); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 9c94807097cb..5ce28ff7685f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -732,7 +732,8 @@ static void mlx5e_rx_compute_wqe_bulk_params(struct mlx5e_params *params, static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, - struct mlx5e_rq_frags_info *info) + struct mlx5e_rq_frags_info *info, + u32 *xdp_frag_size) { u32 byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu); int frag_size_max = DEFAULT_FRAG_SIZE; @@ -845,6 +846,8 @@ out: info->log_num_frags = order_base_2(info->num_frags); + *xdp_frag_size = info->num_frags > 1 && params->xdp_prog ? PAGE_SIZE : 0; + return 0; } @@ -989,7 +992,8 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, } default: /* MLX5_WQ_TYPE_CYCLIC */ MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames); - err = mlx5e_build_rq_frags_info(mdev, params, xsk, ¶m->frags_info); + err = mlx5e_build_rq_frags_info(mdev, params, xsk, ¶m->frags_info, + ¶m->xdp_frag_size); if (err) return err; ndsegs = param->frags_info.num_frags; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h index a5d20f6d6d9c..6800949dafbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h @@ -24,6 +24,7 @@ struct mlx5e_rq_param { u32 rqc[MLX5_ST_SZ_DW(rqc)]; struct mlx5_wq_param wq; struct mlx5e_rq_frags_info frags_info; + u32 xdp_frag_size; }; struct mlx5e_sq_param { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index ead38ef69483..a254e728ac95 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -2021,6 +2021,8 @@ void mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv, struct mlx5_flow_attr *attr) { + if (!attr->ct_attr.ft) /* no ct action, return */ + return; if (!attr->ct_attr.nf_ft) /* means only ct clear action, and not ct_clear,ct() */ return; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index ed279f450976..36826b582484 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -86,7 +86,7 @@ static int mlx5e_init_xsk_rq(struct mlx5e_channel *c, if (err) return err; - return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0); + return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, c->napi.napi_id); } static int mlx5e_open_xsk_rq(struct mlx5e_channel *c, struct mlx5e_params *params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 55b38544422f..891d39b4bfd4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -61,16 +61,19 @@ static void mlx5e_ipsec_handle_tx_limit(struct work_struct *_work) struct mlx5e_ipsec_sa_entry *sa_entry = dwork->sa_entry; struct xfrm_state *x = sa_entry->x; - spin_lock(&x->lock); + if (sa_entry->attrs.drop) + return; + + spin_lock_bh(&x->lock); xfrm_state_check_expire(x); if (x->km.state == XFRM_STATE_EXPIRED) { sa_entry->attrs.drop = true; - mlx5e_accel_ipsec_fs_modify(sa_entry); - } - spin_unlock(&x->lock); + spin_unlock_bh(&x->lock); - if (sa_entry->attrs.drop) + mlx5e_accel_ipsec_fs_modify(sa_entry); return; + } + spin_unlock_bh(&x->lock); queue_delayed_work(sa_entry->ipsec->wq, &dwork->dwork, MLX5_IPSEC_RESCHED); @@ -1040,11 +1043,17 @@ err_fs: return err; } -static void mlx5e_xfrm_free_policy(struct xfrm_policy *x) +static void mlx5e_xfrm_del_policy(struct xfrm_policy *x) { struct mlx5e_ipsec_pol_entry *pol_entry = to_ipsec_pol_entry(x); mlx5e_accel_ipsec_fs_del_pol(pol_entry); +} + +static void mlx5e_xfrm_free_policy(struct xfrm_policy *x) +{ + struct mlx5e_ipsec_pol_entry *pol_entry = to_ipsec_pol_entry(x); + kfree(pol_entry); } @@ -1065,6 +1074,7 @@ static const struct xfrmdev_ops mlx5e_ipsec_packet_xfrmdev_ops = { .xdo_dev_state_update_curlft = mlx5e_xfrm_update_curlft, .xdo_dev_policy_add = mlx5e_xfrm_add_policy, + .xdo_dev_policy_delete = mlx5e_xfrm_del_policy, .xdo_dev_policy_free = mlx5e_xfrm_free_policy, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c index df90e19066bc..a3554bde3e07 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c @@ -305,7 +305,17 @@ static void mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry, } mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &attrs); + + /* It is safe to execute the modify below unlocked since the only flows + * that could affect this HW object, are create, destroy and this work. + * + * Creation flow can't co-exist with this modify work, the destruction + * flow would cancel this work, and this work is a single entity that + * can't conflict with it self. + */ + spin_unlock_bh(&sa_entry->x->lock); mlx5_accel_esp_modify_xfrm(sa_entry, &attrs); + spin_lock_bh(&sa_entry->x->lock); data.data_offset_condition_operand = MLX5_IPSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET; @@ -431,7 +441,7 @@ static void mlx5e_ipsec_handle_event(struct work_struct *_work) aso = sa_entry->ipsec->aso; attrs = &sa_entry->attrs; - spin_lock(&sa_entry->x->lock); + spin_lock_bh(&sa_entry->x->lock); ret = mlx5e_ipsec_aso_query(sa_entry, NULL); if (ret) goto unlock; @@ -447,7 +457,7 @@ static void mlx5e_ipsec_handle_event(struct work_struct *_work) mlx5e_ipsec_handle_limits(sa_entry); unlock: - spin_unlock(&sa_entry->x->lock); + spin_unlock_bh(&sa_entry->x->lock); kfree(work); } @@ -596,7 +606,8 @@ int mlx5e_ipsec_aso_query(struct mlx5e_ipsec_sa_entry *sa_entry, do { ret = mlx5_aso_poll_cq(aso->aso, false); if (ret) - usleep_range(2, 10); + /* We are in atomic context */ + udelay(10); } while (ret && time_is_after_jiffies(expires)); spin_unlock_bh(&aso->lock); return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a7c526ee5024..a5bdf78955d7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -641,7 +641,7 @@ static void mlx5e_free_mpwqe_rq_drop_page(struct mlx5e_rq *rq) } static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *params, - struct mlx5e_rq *rq) + u32 xdp_frag_size, struct mlx5e_rq *rq) { struct mlx5_core_dev *mdev = c->mdev; int err; @@ -665,7 +665,8 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param if (err) return err; - return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, c->napi.napi_id); + return __xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, c->napi.napi_id, + xdp_frag_size); } static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, @@ -2240,7 +2241,7 @@ static int mlx5e_open_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param { int err; - err = mlx5e_init_rxq_rq(c, params, &c->rq); + err = mlx5e_init_rxq_rq(c, params, rq_params->xdp_frag_size, &c->rq); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 8a5a8703f0a3..b9b1da751a3b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1439,6 +1439,7 @@ static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv, mlx5e_hairpin_flow_del(priv, flow); free_flow_post_acts(flow); + mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), attr); kvfree(attr->parse_attr); kfree(flow->attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 144e59480686..ec83e6483d1a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -511,10 +511,11 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, struct mlx5_flow_rule *dst; void *in_flow_context, *vlan; void *in_match_value; + int reformat_id = 0; unsigned int inlen; int dst_cnt_size; + u32 *in, action; void *in_dests; - u32 *in; int err; if (mlx5_set_extended_dest(dev, fte, &extended_dest)) @@ -553,22 +554,42 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, MLX5_SET(flow_context, in_flow_context, extended_destination, extended_dest); - if (extended_dest) { - u32 action; - action = fte->action.action & - ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; - MLX5_SET(flow_context, in_flow_context, action, action); - } else { - MLX5_SET(flow_context, in_flow_context, action, - fte->action.action); - if (fte->action.pkt_reformat) - MLX5_SET(flow_context, in_flow_context, packet_reformat_id, - fte->action.pkt_reformat->id); + action = fte->action.action; + if (extended_dest) + action &= ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + + MLX5_SET(flow_context, in_flow_context, action, action); + + if (!extended_dest && fte->action.pkt_reformat) { + struct mlx5_pkt_reformat *pkt_reformat = fte->action.pkt_reformat; + + if (pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_SW) { + reformat_id = mlx5_fs_dr_action_get_pkt_reformat_id(pkt_reformat); + if (reformat_id < 0) { + mlx5_core_err(dev, + "Unsupported SW-owned pkt_reformat type (%d) in FW-owned table\n", + pkt_reformat->reformat_type); + err = reformat_id; + goto err_out; + } + } else { + reformat_id = fte->action.pkt_reformat->id; + } } - if (fte->action.modify_hdr) + + MLX5_SET(flow_context, in_flow_context, packet_reformat_id, (u32)reformat_id); + + if (fte->action.modify_hdr) { + if (fte->action.modify_hdr->owner == MLX5_FLOW_RESOURCE_OWNER_SW) { + mlx5_core_err(dev, "Can't use SW-owned modify_hdr in FW-owned table\n"); + err = -EOPNOTSUPP; + goto err_out; + } + MLX5_SET(flow_context, in_flow_context, modify_header_id, fte->action.modify_hdr->id); + } MLX5_SET(flow_context, in_flow_context, encrypt_decrypt_type, fte->action.crypto.type); @@ -885,6 +906,8 @@ static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, pkt_reformat->id = MLX5_GET(alloc_packet_reformat_context_out, out, packet_reformat_id); + pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_FW; + kfree(in); return err; } @@ -969,6 +992,7 @@ static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); modify_hdr->id = MLX5_GET(alloc_modify_header_context_out, out, modify_header_id); + modify_hdr->owner = MLX5_FLOW_RESOURCE_OWNER_FW; kfree(in); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index f137a0611b77..b043190e50a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -54,8 +54,14 @@ struct mlx5_flow_definer { u32 id; }; +enum mlx5_flow_resource_owner { + MLX5_FLOW_RESOURCE_OWNER_FW, + MLX5_FLOW_RESOURCE_OWNER_SW, +}; + struct mlx5_modify_hdr { enum mlx5_flow_namespace_type ns_type; + enum mlx5_flow_resource_owner owner; union { struct mlx5_fs_dr_action action; u32 id; @@ -65,6 +71,7 @@ struct mlx5_modify_hdr { struct mlx5_pkt_reformat { enum mlx5_flow_namespace_type ns_type; int reformat_type; /* from mlx5_ifc */ + enum mlx5_flow_resource_owner owner; union { struct mlx5_fs_dr_action action; u32 id; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 843da89a9035..98412bd5a696 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -126,14 +126,22 @@ out: return ret; } -static void irq_release(struct mlx5_irq *irq) +/* mlx5_system_free_irq - Free an IRQ + * @irq: IRQ to free + * + * Free the IRQ and other resources such as rmap from the system. + * BUT doesn't free or remove reference from mlx5. + * This function is very important for the shutdown flow, where we need to + * cleanup system resoruces but keep mlx5 objects alive, + * see mlx5_irq_table_free_irqs(). + */ +static void mlx5_system_free_irq(struct mlx5_irq *irq) { struct mlx5_irq_pool *pool = irq->pool; #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rmap; #endif - xa_erase(&pool->irqs, irq->pool_index); /* free_irq requires that affinity_hint and rmap will be cleared before * calling it. To satisfy this requirement, we call * irq_cpu_rmap_remove() to remove the notifier @@ -145,10 +153,18 @@ static void irq_release(struct mlx5_irq *irq) irq_cpu_rmap_remove(rmap, irq->map.virq); #endif - free_cpumask_var(irq->mask); free_irq(irq->map.virq, &irq->nh); if (irq->map.index && pci_msix_can_alloc_dyn(pool->dev->pdev)) pci_msix_free_irq(pool->dev->pdev, irq->map); +} + +static void irq_release(struct mlx5_irq *irq) +{ + struct mlx5_irq_pool *pool = irq->pool; + + xa_erase(&pool->irqs, irq->pool_index); + mlx5_system_free_irq(irq); + free_cpumask_var(irq->mask); kfree(irq); } @@ -565,15 +581,21 @@ void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs) int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs, struct mlx5_irq **irqs, struct cpu_rmap **rmap) { + struct mlx5_irq_table *table = mlx5_irq_table_get(dev); + struct mlx5_irq_pool *pool = table->pcif_pool; struct irq_affinity_desc af_desc; struct mlx5_irq *irq; + int offset = 1; int i; + if (!pool->xa_num_irqs.max) + offset = 0; + af_desc.is_managed = false; for (i = 0; i < nirqs; i++) { cpumask_clear(&af_desc.mask); cpumask_set_cpu(cpus[i], &af_desc.mask); - irq = mlx5_irq_request(dev, i + 1, &af_desc, rmap); + irq = mlx5_irq_request(dev, i + offset, &af_desc, rmap); if (IS_ERR(irq)) break; irqs[i] = irq; @@ -699,7 +721,8 @@ static void mlx5_irq_pool_free_irqs(struct mlx5_irq_pool *pool) unsigned long index; xa_for_each(&pool->irqs, index, irq) - free_irq(irq->map.virq, &irq->nh); + mlx5_system_free_irq(irq); + } static void mlx5_irq_pools_free_irqs(struct mlx5_irq_table *table) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 0eb9a8d7f282..0f783e7906cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -1421,9 +1421,13 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn, } case DR_ACTION_TYP_TNL_L3_TO_L2: { - u8 hw_actions[DR_ACTION_CACHE_LINE_SIZE] = {}; + u8 *hw_actions; int ret; + hw_actions = kzalloc(DR_ACTION_CACHE_LINE_SIZE, GFP_KERNEL); + if (!hw_actions) + return -ENOMEM; + ret = mlx5dr_ste_set_action_decap_l3_list(dmn->ste_ctx, data, data_sz, hw_actions, @@ -1431,6 +1435,7 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn, &action->rewrite->num_of_actions); if (ret) { mlx5dr_dbg(dmn, "Failed creating decap l3 action list\n"); + kfree(hw_actions); return ret; } @@ -1440,6 +1445,7 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn, ret = mlx5dr_ste_alloc_modify_hdr(action); if (ret) { mlx5dr_dbg(dmn, "Failed preparing reformat data\n"); + kfree(hw_actions); return ret; } return 0; @@ -2129,6 +2135,11 @@ mlx5dr_action_create_aso(struct mlx5dr_domain *dmn, u32 obj_id, return action; } +u32 mlx5dr_action_get_pkt_reformat_id(struct mlx5dr_action *action) +{ + return action->reformat->id; +} + int mlx5dr_action_destroy(struct mlx5dr_action *action) { if (WARN_ON_ONCE(refcount_read(&action->refcount) > 1)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 984653756779..cc215beb7436 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -331,8 +331,16 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, } if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) { - bool is_decap = fte->action.pkt_reformat->reformat_type == - MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + bool is_decap; + + if (fte->action.pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_FW) { + err = -EINVAL; + mlx5dr_err(domain, "FW-owned reformat can't be used in SW rule\n"); + goto free_actions; + } + + is_decap = fte->action.pkt_reformat->reformat_type == + MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; if (is_decap) actions[num_actions++] = @@ -661,6 +669,7 @@ static int mlx5_cmd_dr_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns return -EINVAL; } + pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_SW; pkt_reformat->action.dr_action = action; return 0; @@ -691,6 +700,7 @@ static int mlx5_cmd_dr_modify_header_alloc(struct mlx5_flow_root_namespace *ns, return -EINVAL; } + modify_hdr->owner = MLX5_FLOW_RESOURCE_OWNER_SW; modify_hdr->action.dr_action = action; return 0; @@ -816,6 +826,19 @@ static u32 mlx5_cmd_dr_get_capabilities(struct mlx5_flow_root_namespace *ns, return steering_caps; } +int mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat) +{ + switch (pkt_reformat->reformat_type) { + case MLX5_REFORMAT_TYPE_L2_TO_VXLAN: + case MLX5_REFORMAT_TYPE_L2_TO_NVGRE: + case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + case MLX5_REFORMAT_TYPE_INSERT_HDR: + return mlx5dr_action_get_pkt_reformat_id(pkt_reformat->action.dr_action); + } + return -EOPNOTSUPP; +} + bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev) { return mlx5dr_is_supported(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h index d168622063d5..99a3b2eff6b8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h @@ -38,6 +38,8 @@ struct mlx5_fs_dr_table { bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev); +int mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat); + const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void); #else @@ -47,6 +49,11 @@ static inline const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void) return NULL; } +static inline u32 mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat) +{ + return 0; +} + static inline bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev) { return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h index 9afd268a2573..d1c04f43d86d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h @@ -150,6 +150,8 @@ mlx5dr_action_create_dest_match_range(struct mlx5dr_domain *dmn, int mlx5dr_action_destroy(struct mlx5dr_action *action); +u32 mlx5dr_action_get_pkt_reformat_id(struct mlx5dr_action *action); + int mlx5dr_definer_get(struct mlx5dr_domain *dmn, u16 format_id, u8 *dw_selectors, u8 *byte_selectors, u8 *match_mask, u32 *definer_id); diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index c865a4be05ee..4a1b94e5a8ea 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -582,8 +582,7 @@ qcaspi_spi_thread(void *data) while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); if ((qca->intr_req == qca->intr_svc) && - (qca->txr.skb[qca->txr.head] == NULL) && - (qca->sync == QCASPI_SYNC_READY)) + !qca->txr.skb[qca->txr.head]) schedule(); set_current_state(TASK_RUNNING); diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index d30459dbfe8f..b63e47af6365 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -2950,7 +2950,7 @@ static u32 efx_ef10_extract_event_ts(efx_qword_t *event) return tstamp; } -static void +static int efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) { struct efx_nic *efx = channel->efx; @@ -2958,13 +2958,14 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) unsigned int tx_ev_desc_ptr; unsigned int tx_ev_q_label; unsigned int tx_ev_type; + int work_done; u64 ts_part; if (unlikely(READ_ONCE(efx->reset_pending))) - return; + return 0; if (unlikely(EFX_QWORD_FIELD(*event, ESF_DZ_TX_DROP_EVENT))) - return; + return 0; /* Get the transmit queue */ tx_ev_q_label = EFX_QWORD_FIELD(*event, ESF_DZ_TX_QLABEL); @@ -2973,8 +2974,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) if (!tx_queue->timestamping) { /* Transmit completion */ tx_ev_desc_ptr = EFX_QWORD_FIELD(*event, ESF_DZ_TX_DESCR_INDX); - efx_xmit_done(tx_queue, tx_ev_desc_ptr & tx_queue->ptr_mask); - return; + return efx_xmit_done(tx_queue, tx_ev_desc_ptr & tx_queue->ptr_mask); } /* Transmit timestamps are only available for 8XXX series. They result @@ -3000,6 +3000,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) * fields in the event. */ tx_ev_type = EFX_QWORD_FIELD(*event, ESF_EZ_TX_SOFT1); + work_done = 0; switch (tx_ev_type) { case TX_TIMESTAMP_EVENT_TX_EV_COMPLETION: @@ -3016,6 +3017,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) tx_queue->completed_timestamp_major = ts_part; efx_xmit_done_single(tx_queue); + work_done = 1; break; default: @@ -3026,6 +3028,8 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) EFX_QWORD_VAL(*event)); break; } + + return work_done; } static void @@ -3081,13 +3085,16 @@ static void efx_ef10_handle_driver_generated_event(struct efx_channel *channel, } } +#define EFX_NAPI_MAX_TX 512 + static int efx_ef10_ev_process(struct efx_channel *channel, int quota) { struct efx_nic *efx = channel->efx; efx_qword_t event, *p_event; unsigned int read_ptr; - int ev_code; + int spent_tx = 0; int spent = 0; + int ev_code; if (quota <= 0) return spent; @@ -3126,7 +3133,11 @@ static int efx_ef10_ev_process(struct efx_channel *channel, int quota) } break; case ESE_DZ_EV_CODE_TX_EV: - efx_ef10_handle_tx_event(channel, &event); + spent_tx += efx_ef10_handle_tx_event(channel, &event); + if (spent_tx >= EFX_NAPI_MAX_TX) { + spent = quota; + goto out; + } break; case ESE_DZ_EV_CODE_DRIVER_EV: efx_ef10_handle_driver_event(channel, &event); diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index 4dc643b0d2db..7adde9639c8a 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -253,6 +253,8 @@ static void ef100_ev_read_ack(struct efx_channel *channel) efx_reg(channel->efx, ER_GZ_EVQ_INT_PRIME)); } +#define EFX_NAPI_MAX_TX 512 + static int ef100_ev_process(struct efx_channel *channel, int quota) { struct efx_nic *efx = channel->efx; @@ -260,6 +262,7 @@ static int ef100_ev_process(struct efx_channel *channel, int quota) bool evq_phase, old_evq_phase; unsigned int read_ptr; efx_qword_t *p_event; + int spent_tx = 0; int spent = 0; bool ev_phase; int ev_type; @@ -295,7 +298,9 @@ static int ef100_ev_process(struct efx_channel *channel, int quota) efx_mcdi_process_event(channel, p_event); break; case ESE_GZ_EF100_EV_TX_COMPLETION: - ef100_ev_tx(channel, p_event); + spent_tx += ef100_ev_tx(channel, p_event); + if (spent_tx >= EFX_NAPI_MAX_TX) + spent = quota; break; case ESE_GZ_EF100_EV_DRIVER: netif_info(efx, drv, efx->net_dev, diff --git a/drivers/net/ethernet/sfc/ef100_tx.c b/drivers/net/ethernet/sfc/ef100_tx.c index 29ffaf35559d..849e5555bd12 100644 --- a/drivers/net/ethernet/sfc/ef100_tx.c +++ b/drivers/net/ethernet/sfc/ef100_tx.c @@ -346,7 +346,7 @@ void ef100_tx_write(struct efx_tx_queue *tx_queue) ef100_tx_push_buffers(tx_queue); } -void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) +int ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) { unsigned int tx_done = EFX_QWORD_FIELD(*p_event, ESF_GZ_EV_TXCMPL_NUM_DESC); @@ -357,7 +357,7 @@ void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) unsigned int tx_index = (tx_queue->read_count + tx_done - 1) & tx_queue->ptr_mask; - efx_xmit_done(tx_queue, tx_index); + return efx_xmit_done(tx_queue, tx_index); } /* Add a socket buffer to a TX queue diff --git a/drivers/net/ethernet/sfc/ef100_tx.h b/drivers/net/ethernet/sfc/ef100_tx.h index e9e11540fcde..d9a0819c5a72 100644 --- a/drivers/net/ethernet/sfc/ef100_tx.h +++ b/drivers/net/ethernet/sfc/ef100_tx.h @@ -20,7 +20,7 @@ void ef100_tx_init(struct efx_tx_queue *tx_queue); void ef100_tx_write(struct efx_tx_queue *tx_queue); unsigned int ef100_tx_max_skb_descs(struct efx_nic *efx); -void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event); +int ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event); netdev_tx_t ef100_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb); int __ef100_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb, diff --git a/drivers/net/ethernet/sfc/tx_common.c b/drivers/net/ethernet/sfc/tx_common.c index 67e789b96c43..755aa92bf823 100644 --- a/drivers/net/ethernet/sfc/tx_common.c +++ b/drivers/net/ethernet/sfc/tx_common.c @@ -249,7 +249,7 @@ void efx_xmit_done_check_empty(struct efx_tx_queue *tx_queue) } } -void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) +int efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) { unsigned int fill_level, pkts_compl = 0, bytes_compl = 0; unsigned int efv_pkts_compl = 0; @@ -279,6 +279,8 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) } efx_xmit_done_check_empty(tx_queue); + + return pkts_compl + efv_pkts_compl; } /* Remove buffers put into a tx_queue for the current packet. diff --git a/drivers/net/ethernet/sfc/tx_common.h b/drivers/net/ethernet/sfc/tx_common.h index d87aecbc7bf1..1e9f42938aac 100644 --- a/drivers/net/ethernet/sfc/tx_common.h +++ b/drivers/net/ethernet/sfc/tx_common.h @@ -28,7 +28,7 @@ static inline bool efx_tx_buffer_in_use(struct efx_tx_buffer *buffer) } void efx_xmit_done_check_empty(struct efx_tx_queue *tx_queue); -void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); +int efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); void efx_enqueue_unwind(struct efx_tx_queue *tx_queue, unsigned int insert_count); diff --git a/drivers/net/ieee802154/adf7242.c b/drivers/net/ieee802154/adf7242.c index f9972b8140f9..a03490ba2e5b 100644 --- a/drivers/net/ieee802154/adf7242.c +++ b/drivers/net/ieee802154/adf7242.c @@ -1348,3 +1348,5 @@ module_spi_driver(adf7242_driver); MODULE_AUTHOR("Michael Hennerich <michael.hennerich@analog.com>"); MODULE_DESCRIPTION("ADF7242 IEEE802.15.4 Transceiver Driver"); MODULE_LICENSE("GPL"); + +MODULE_FIRMWARE(FIRMWARE); diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 8445c2189d11..31cba9aa7636 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -685,7 +685,7 @@ static int hwsim_del_edge_nl(struct sk_buff *msg, struct genl_info *info) static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; - struct hwsim_edge_info *einfo; + struct hwsim_edge_info *einfo, *einfo_old; struct hwsim_phy *phy_v0; struct hwsim_edge *e; u32 v0, v1; @@ -723,8 +723,10 @@ static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { einfo->lqi = lqi; - rcu_assign_pointer(e->info, einfo); + einfo_old = rcu_replace_pointer(e->info, einfo, + lockdep_is_held(&hwsim_phys_lock)); rcu_read_unlock(); + kfree_rcu(einfo_old, rcu); mutex_unlock(&hwsim_phys_lock); return 0; } diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 76f5a2402fb0..e397e7d642d9 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -936,7 +936,7 @@ static int dp83867_phy_reset(struct phy_device *phydev) { int err; - err = phy_write(phydev, DP83867_CTRL, DP83867_SW_RESTART); + err = phy_write(phydev, DP83867_CTRL, DP83867_SW_RESET); if (err < 0) return err; diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 389f33a12534..8b3618d3da4a 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -1287,7 +1287,7 @@ EXPORT_SYMBOL_GPL(mdiobus_modify_changed); * @mask: bit mask of bits to clear * @set: bit mask of bits to set */ -int mdiobus_c45_modify_changed(struct mii_bus *bus, int devad, int addr, +int mdiobus_c45_modify_changed(struct mii_bus *bus, int addr, int devad, u32 regnum, u16 mask, u16 set) { int err; diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 17d0d0555a79..53598210be6c 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3021,6 +3021,15 @@ static int phy_led_blink_set(struct led_classdev *led_cdev, return err; } +static void phy_leds_unregister(struct phy_device *phydev) +{ + struct phy_led *phyled; + + list_for_each_entry(phyled, &phydev->leds, list) { + led_classdev_unregister(&phyled->led_cdev); + } +} + static int of_phy_led(struct phy_device *phydev, struct device_node *led) { @@ -3054,7 +3063,7 @@ static int of_phy_led(struct phy_device *phydev, init_data.fwnode = of_fwnode_handle(led); init_data.devname_mandatory = true; - err = devm_led_classdev_register_ext(dev, cdev, &init_data); + err = led_classdev_register_ext(dev, cdev, &init_data); if (err) return err; @@ -3083,6 +3092,7 @@ static int of_phy_leds(struct phy_device *phydev) err = of_phy_led(phydev, led); if (err) { of_node_put(led); + phy_leds_unregister(phydev); return err; } } @@ -3305,6 +3315,9 @@ static int phy_remove(struct device *dev) cancel_delayed_work_sync(&phydev->state_queue); + if (IS_ENABLED(CONFIG_PHYLIB_LEDS)) + phy_leds_unregister(phydev); + phydev->state = PHY_DOWN; sfp_bus_del_upstream(phydev->sfp_bus); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index dba112394838..79115eb1c285 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -548,6 +548,8 @@ static const struct iwl_dev_info iwl_dev_info_table[] = { IWL_DEV_INFO(0x54F0, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), IWL_DEV_INFO(0x7A70, 0x1691, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690s_name), IWL_DEV_INFO(0x7A70, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), + IWL_DEV_INFO(0x7AF0, 0x1691, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690s_name), + IWL_DEV_INFO(0x7AF0, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), IWL_DEV_INFO(0x271C, 0x0214, iwl9260_2ac_cfg, iwl9260_1_name), IWL_DEV_INFO(0x7E40, 0x1691, iwl_cfg_ma_a0_gf4_a0, iwl_ax411_killer_1690s_name), diff --git a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c index d6b166fc5c0e..bff46f7ca59f 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c +++ b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c @@ -626,14 +626,12 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux, if (adth->signature != cpu_to_le32(IOSM_AGGR_MUX_SIG_ADTH)) goto adb_decode_err; - if (le16_to_cpu(adth->table_length) < (sizeof(struct mux_adth) - - sizeof(struct mux_adth_dg))) + if (le16_to_cpu(adth->table_length) < sizeof(struct mux_adth)) goto adb_decode_err; /* Calculate the number of datagrams. */ nr_of_dg = (le16_to_cpu(adth->table_length) - - sizeof(struct mux_adth) + - sizeof(struct mux_adth_dg)) / + sizeof(struct mux_adth)) / sizeof(struct mux_adth_dg); /* Is the datagram table empty ? */ @@ -649,7 +647,7 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux, } /* New aggregated datagram table. */ - dg = &adth->dg; + dg = adth->dg; if (mux_dl_process_dg(ipc_mux, adbh, dg, skb, if_id, nr_of_dg) < 0) goto adb_decode_err; @@ -849,7 +847,7 @@ static void ipc_mux_ul_encode_adth(struct iosm_mux *ipc_mux, adth->if_id = i; adth->table_length = cpu_to_le16(adth_dg_size); adth_dg_size -= offsetof(struct mux_adth, dg); - memcpy(&adth->dg, ul_adb->dg[i], adth_dg_size); + memcpy(adth->dg, ul_adb->dg[i], adth_dg_size); ul_adb->if_cnt++; } @@ -1426,14 +1424,13 @@ static int ipc_mux_get_payload_from_adb(struct iosm_mux *ipc_mux, if (adth->signature == cpu_to_le32(IOSM_AGGR_MUX_SIG_ADTH)) { nr_of_dg = (le16_to_cpu(adth->table_length) - - sizeof(struct mux_adth) + - sizeof(struct mux_adth_dg)) / + sizeof(struct mux_adth)) / sizeof(struct mux_adth_dg); if (nr_of_dg <= 0) return payload_size; - dg = &adth->dg; + dg = adth->dg; for (i = 0; i < nr_of_dg; i++, dg++) { if (le32_to_cpu(dg->datagram_index) < diff --git a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h index 5d4e3b89542c..f8df88f816c4 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h +++ b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h @@ -161,7 +161,7 @@ struct mux_adth { u8 opt_ipv4v6; __le32 next_table_index; __le32 reserved2; - struct mux_adth_dg dg; + struct mux_adth_dg dg[]; }; /** diff --git a/drivers/nfc/fdp/fdp.c b/drivers/nfc/fdp/fdp.c index f12f903a9dd1..da3e2dce8e70 100644 --- a/drivers/nfc/fdp/fdp.c +++ b/drivers/nfc/fdp/fdp.c @@ -762,3 +762,6 @@ EXPORT_SYMBOL(fdp_nci_remove); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("NFC NCI driver for Intel Fields Peak NFC controller"); MODULE_AUTHOR("Robert Dolca <robert.dolca@intel.com>"); + +MODULE_FIRMWARE(FDP_OTP_PATCH_NAME); +MODULE_FIRMWARE(FDP_RAM_PATCH_NAME); diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index bc32662c6bb7..2d93d0c4f10d 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -489,7 +489,10 @@ struct hv_pcibus_device { struct fwnode_handle *fwnode; /* Protocol version negotiated with the host */ enum pci_protocol_version_t protocol_version; + + struct mutex state_lock; enum hv_pcibus_state state; + struct hv_device *hdev; resource_size_t low_mmio_space; resource_size_t high_mmio_space; @@ -545,19 +548,10 @@ struct hv_dr_state { struct hv_pcidev_description func[]; }; -enum hv_pcichild_state { - hv_pcichild_init = 0, - hv_pcichild_requirements, - hv_pcichild_resourced, - hv_pcichild_ejecting, - hv_pcichild_maximum -}; - struct hv_pci_dev { /* List protected by pci_rescan_remove_lock */ struct list_head list_entry; refcount_t refs; - enum hv_pcichild_state state; struct pci_slot *pci_slot; struct hv_pcidev_description desc; bool reported_missing; @@ -635,6 +629,11 @@ static void hv_arch_irq_unmask(struct irq_data *data) pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); int_desc = data->chip_data; + if (!int_desc) { + dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n", + __func__, data->irq); + return; + } local_irq_save(flags); @@ -2004,12 +2003,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) hv_pci_onchannelcallback(hbus); spin_unlock_irqrestore(&channel->sched_lock, flags); - if (hpdev->state == hv_pcichild_ejecting) { - dev_err_once(&hbus->hdev->device, - "the device is being ejected\n"); - goto enable_tasklet; - } - udelay(100); } @@ -2615,6 +2608,8 @@ static void pci_devices_present_work(struct work_struct *work) if (!dr) return; + mutex_lock(&hbus->state_lock); + /* First, mark all existing children as reported missing. */ spin_lock_irqsave(&hbus->device_list_lock, flags); list_for_each_entry(hpdev, &hbus->children, list_entry) { @@ -2696,6 +2691,8 @@ static void pci_devices_present_work(struct work_struct *work) break; } + mutex_unlock(&hbus->state_lock); + kfree(dr); } @@ -2844,7 +2841,7 @@ static void hv_eject_device_work(struct work_struct *work) hpdev = container_of(work, struct hv_pci_dev, wrk); hbus = hpdev->hbus; - WARN_ON(hpdev->state != hv_pcichild_ejecting); + mutex_lock(&hbus->state_lock); /* * Ejection can come before or after the PCI bus has been set up, so @@ -2882,6 +2879,8 @@ static void hv_eject_device_work(struct work_struct *work) put_pcichild(hpdev); put_pcichild(hpdev); /* hpdev has been freed. Do not use it any more. */ + + mutex_unlock(&hbus->state_lock); } /** @@ -2902,7 +2901,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev) return; } - hpdev->state = hv_pcichild_ejecting; get_pcichild(hpdev); INIT_WORK(&hpdev->wrk, hv_eject_device_work); queue_work(hbus->wq, &hpdev->wrk); @@ -3331,8 +3329,10 @@ static int hv_pci_enter_d0(struct hv_device *hdev) struct pci_bus_d0_entry *d0_entry; struct hv_pci_compl comp_pkt; struct pci_packet *pkt; + bool retry = true; int ret; +enter_d0_retry: /* * Tell the host that the bus is ready to use, and moved into the * powered-on state. This includes telling the host which region @@ -3359,6 +3359,38 @@ static int hv_pci_enter_d0(struct hv_device *hdev) if (ret) goto exit; + /* + * In certain case (Kdump) the pci device of interest was + * not cleanly shut down and resource is still held on host + * side, the host could return invalid device status. + * We need to explicitly request host to release the resource + * and try to enter D0 again. + */ + if (comp_pkt.completion_status < 0 && retry) { + retry = false; + + dev_err(&hdev->device, "Retrying D0 Entry\n"); + + /* + * Hv_pci_bus_exit() calls hv_send_resource_released() + * to free up resources of its child devices. + * In the kdump kernel we need to set the + * wslot_res_allocated to 255 so it scans all child + * devices to release resources allocated in the + * normal kernel before panic happened. + */ + hbus->wslot_res_allocated = 255; + + ret = hv_pci_bus_exit(hdev, true); + + if (ret == 0) { + kfree(pkt); + goto enter_d0_retry; + } + dev_err(&hdev->device, + "Retrying D0 failed with ret %d\n", ret); + } + if (comp_pkt.completion_status < 0) { dev_err(&hdev->device, "PCI Pass-through VSP failed D0 Entry with status %x\n", @@ -3401,6 +3433,24 @@ static int hv_pci_query_relations(struct hv_device *hdev) if (!ret) ret = wait_for_response(hdev, &comp); + /* + * In the case of fast device addition/removal, it's possible that + * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we + * already got a PCI_BUS_RELATIONS* message from the host and the + * channel callback already scheduled a work to hbus->wq, which can be + * running pci_devices_present_work() -> survey_child_resources() -> + * complete(&hbus->survey_event), even after hv_pci_query_relations() + * exits and the stack variable 'comp' is no longer valid; as a result, + * a hang or a page fault may happen when the complete() calls + * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from + * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is + * -ENODEV, there can't be any more work item scheduled to hbus->wq + * after the flush_workqueue(): see vmbus_onoffer_rescind() -> + * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() -> + * channel->rescind = true. + */ + flush_workqueue(hbus->wq); + return ret; } @@ -3586,7 +3636,6 @@ static int hv_pci_probe(struct hv_device *hdev, struct hv_pcibus_device *hbus; u16 dom_req, dom; char *name; - bool enter_d0_retry = true; int ret; bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); @@ -3598,6 +3647,7 @@ static int hv_pci_probe(struct hv_device *hdev, return -ENOMEM; hbus->bridge = bridge; + mutex_init(&hbus->state_lock); hbus->state = hv_pcibus_init; hbus->wslot_res_allocated = -1; @@ -3703,49 +3753,15 @@ static int hv_pci_probe(struct hv_device *hdev, if (ret) goto free_fwnode; -retry: ret = hv_pci_query_relations(hdev); if (ret) goto free_irq_domain; - ret = hv_pci_enter_d0(hdev); - /* - * In certain case (Kdump) the pci device of interest was - * not cleanly shut down and resource is still held on host - * side, the host could return invalid device status. - * We need to explicitly request host to release the resource - * and try to enter D0 again. - * Since the hv_pci_bus_exit() call releases structures - * of all its child devices, we need to start the retry from - * hv_pci_query_relations() call, requesting host to send - * the synchronous child device relations message before this - * information is needed in hv_send_resources_allocated() - * call later. - */ - if (ret == -EPROTO && enter_d0_retry) { - enter_d0_retry = false; - - dev_err(&hdev->device, "Retrying D0 Entry\n"); - - /* - * Hv_pci_bus_exit() calls hv_send_resources_released() - * to free up resources of its child devices. - * In the kdump kernel we need to set the - * wslot_res_allocated to 255 so it scans all child - * devices to release resources allocated in the - * normal kernel before panic happened. - */ - hbus->wslot_res_allocated = 255; - ret = hv_pci_bus_exit(hdev, true); - - if (ret == 0) - goto retry; + mutex_lock(&hbus->state_lock); - dev_err(&hdev->device, - "Retrying D0 failed with ret %d\n", ret); - } + ret = hv_pci_enter_d0(hdev); if (ret) - goto free_irq_domain; + goto release_state_lock; ret = hv_pci_allocate_bridge_windows(hbus); if (ret) @@ -3763,12 +3779,15 @@ retry: if (ret) goto free_windows; + mutex_unlock(&hbus->state_lock); return 0; free_windows: hv_pci_free_bridge_windows(hbus); exit_d0: (void) hv_pci_bus_exit(hdev, true); +release_state_lock: + mutex_unlock(&hbus->state_lock); free_irq_domain: irq_domain_remove(hbus->irq_domain); free_fwnode: @@ -4018,20 +4037,26 @@ static int hv_pci_resume(struct hv_device *hdev) if (ret) goto out; + mutex_lock(&hbus->state_lock); + ret = hv_pci_enter_d0(hdev); if (ret) - goto out; + goto release_state_lock; ret = hv_send_resources_allocated(hdev); if (ret) - goto out; + goto release_state_lock; prepopulate_bars(hbus); hv_pci_restore_msi_state(hbus); hbus->state = hv_pcibus_installed; + mutex_unlock(&hbus->state_lock); return 0; + +release_state_lock: + mutex_unlock(&hbus->state_lock); out: vmbus_close(hdev->channel); return ret; diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index c98e4039386d..93b7edb5f1e7 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -677,9 +677,25 @@ static inline u32 armv8pmu_getreset_flags(void) return value; } +static void update_pmuserenr(u64 val) +{ + lockdep_assert_irqs_disabled(); + + /* + * The current PMUSERENR_EL0 value might be the value for the guest. + * If that's the case, have KVM keep tracking of the register value + * for the host EL0 so that KVM can restore it before returning to + * the host EL0. Otherwise, update the register now. + */ + if (kvm_set_pmuserenr(val)) + return; + + write_pmuserenr(val); +} + static void armv8pmu_disable_user_access(void) { - write_pmuserenr(0); + update_pmuserenr(0); } static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) @@ -695,8 +711,7 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) armv8pmu_write_evcntr(i, 0); } - write_pmuserenr(0); - write_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR); + update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR); } static void armv8pmu_enable_event(struct perf_event *event) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index ee5f124f78b6..7780705917b7 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -297,6 +297,8 @@ static void amd_pmf_init_features(struct amd_pmf_dev *dev) /* Enable Static Slider */ if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) { amd_pmf_init_sps(dev); + dev->pwr_src_notifier.notifier_call = amd_pmf_pwr_src_notify_call; + power_supply_reg_notifier(&dev->pwr_src_notifier); dev_dbg(dev->dev, "SPS enabled and Platform Profiles registered\n"); } @@ -315,8 +317,10 @@ static void amd_pmf_init_features(struct amd_pmf_dev *dev) static void amd_pmf_deinit_features(struct amd_pmf_dev *dev) { - if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) + if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) { + power_supply_unreg_notifier(&dev->pwr_src_notifier); amd_pmf_deinit_sps(dev); + } if (is_apmf_func_supported(dev, APMF_FUNC_AUTO_MODE)) { amd_pmf_deinit_auto_mode(dev); @@ -399,9 +403,6 @@ static int amd_pmf_probe(struct platform_device *pdev) apmf_install_handler(dev); amd_pmf_dbgfs_register(dev); - dev->pwr_src_notifier.notifier_call = amd_pmf_pwr_src_notify_call; - power_supply_reg_notifier(&dev->pwr_src_notifier); - dev_info(dev->dev, "registered PMF device successfully\n"); return 0; @@ -411,7 +412,6 @@ static void amd_pmf_remove(struct platform_device *pdev) { struct amd_pmf_dev *dev = platform_get_drvdata(pdev); - power_supply_unreg_notifier(&dev->pwr_src_notifier); amd_pmf_deinit_features(dev); apmf_acpi_deinit(dev); amd_pmf_dbgfs_unregister(dev); diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c index a98b781b103a..b293428760bc 100644 --- a/drivers/spi/spi-geni-qcom.c +++ b/drivers/spi/spi-geni-qcom.c @@ -646,6 +646,8 @@ static int spi_geni_init(struct spi_geni_master *mas) geni_se_select_mode(se, GENI_GPI_DMA); dev_dbg(mas->dev, "Using GPI DMA mode for SPI\n"); break; + } else if (ret == -EPROBE_DEFER) { + goto out_pm; } /* * in case of failure to get gpi dma channel, we can still do the diff --git a/drivers/thermal/intel/intel_soc_dts_iosf.c b/drivers/thermal/intel/intel_soc_dts_iosf.c index f99dc7e4ae89..db97499f4f0a 100644 --- a/drivers/thermal/intel/intel_soc_dts_iosf.c +++ b/drivers/thermal/intel/intel_soc_dts_iosf.c @@ -398,7 +398,7 @@ struct intel_soc_dts_sensors *intel_soc_dts_iosf_init( spin_lock_init(&sensors->intr_notify_lock); mutex_init(&sensors->dts_update_lock); sensors->intr_type = intr_type; - sensors->tj_max = tj_max; + sensors->tj_max = tj_max * 1000; if (intr_type == INTEL_SOC_DTS_INTERRUPT_NONE) notification = false; else diff --git a/fs/afs/write.c b/fs/afs/write.c index c822d6006033..8750b99c3f56 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -731,6 +731,7 @@ static int afs_writepages_region(struct address_space *mapping, * (changing page->mapping to NULL), or even swizzled * back from swapper_space to tmpfs file mapping */ +try_again: if (wbc->sync_mode != WB_SYNC_NONE) { ret = folio_lock_killable(folio); if (ret < 0) { @@ -757,12 +758,14 @@ static int afs_writepages_region(struct address_space *mapping, #ifdef CONFIG_AFS_FSCACHE folio_wait_fscache(folio); #endif - } else { - start += folio_size(folio); + goto try_again; } + + start += folio_size(folio); if (wbc->sync_mode == WB_SYNC_NONE) { if (skips >= 5 || need_resched()) { *_next = start; + folio_batch_release(&fbatch); _leave(" = 0 [%llx]", *_next); return 0; } @@ -530,7 +530,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) for (i = 0; i < nr_pages; i++) { struct page *page; page = find_or_create_page(file->f_mapping, - i, GFP_HIGHUSER | __GFP_ZERO); + i, GFP_USER | __GFP_ZERO); if (!page) break; pr_debug("pid(%d) page[%d]->count=%d\n", @@ -571,7 +571,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ~0U; ring->head = ring->tail = 0; @@ -579,7 +579,6 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); - kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); return 0; @@ -682,9 +681,8 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) * we are protected from page migration * changes ring_pages by ->ring_lock. */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); ring->id = ctx->id; - kunmap_atomic(ring); return 0; } @@ -1025,9 +1023,8 @@ static void user_refill_reqs_available(struct kioctx *ctx) * against ctx->completed_events below will make sure we do the * safe/right thing. */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); head = ring->head; - kunmap_atomic(ring); refill_reqs_available(ctx, head, ctx->tail); } @@ -1133,12 +1130,11 @@ static void aio_complete(struct aio_kiocb *iocb) if (++tail >= ctx->nr_events) tail = 0; - ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + ev_page = page_address(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; *event = iocb->ki_res; - kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, @@ -1152,10 +1148,9 @@ static void aio_complete(struct aio_kiocb *iocb) ctx->tail = tail; - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); head = ring->head; ring->tail = tail; - kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); ctx->completed_events++; @@ -1215,10 +1210,9 @@ static long aio_read_events_ring(struct kioctx *ctx, mutex_lock(&ctx->ring_lock); /* Access to ->ring_pages here is protected by ctx->ring_lock. */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); head = ring->head; tail = ring->tail; - kunmap_atomic(ring); /* * Ensure that once we've read the current tail pointer, that @@ -1250,10 +1244,9 @@ static long aio_read_events_ring(struct kioctx *ctx, avail = min(avail, nr - ret); avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); - ev = kmap(page); + ev = page_address(page); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); - kunmap(page); if (unlikely(copy_ret)) { ret = -EFAULT; @@ -1265,9 +1258,8 @@ static long aio_read_events_ring(struct kioctx *ctx, head %= ctx->nr_events; } - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); ring->head = head; - kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, tail); diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 6baf90b08e0e..93046c9dc461 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; - dir->i_mtime = current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); return 0; } @@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); - dir->i_mtime = current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); @@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); - dir->i_mtime = current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); return 0; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6753524b146c..48ae509f2ac2 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2006,7 +2006,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); if (!buf) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2c7fdbb60314..4cae41bd6de0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1255,7 +1255,7 @@ static int get_raid56_logic_offset(u64 physical, int num, u32 stripe_index; u32 rot; - *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT); + *offset = last_offset + btrfs_stripe_nr_to_offset(i); stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; @@ -1270,7 +1270,7 @@ static int get_raid56_logic_offset(u64 physical, int num, if (stripe_index < num) j++; } - *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT); + *offset = last_offset + btrfs_stripe_nr_to_offset(j); return 1; } @@ -1666,7 +1666,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, - nr_stripes << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(nr_stripes)); for (int i = 0; i < nr_stripes; i++) { stripe = &sctx->stripes[i]; scrub_submit_initial_read(sctx, stripe); @@ -1789,7 +1789,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bool all_empty = true; const int data_stripes = nr_data_stripes(map); unsigned long extent_bitmap = 0; - u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT; + u64 length = btrfs_stripe_nr_to_offset(data_stripes); int ret; ASSERT(sctx->raid56_data_stripes); @@ -1804,13 +1804,13 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; stripe_index = (i + rot) % map->num_stripes; physical = map->stripes[stripe_index].physical + - (rot << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(rot); scrub_reset_stripe(stripe); set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); ret = scrub_find_fill_first_stripe(bg, map->stripes[stripe_index].dev, physical, 1, - full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT), + full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); if (ret < 0) goto out; @@ -1820,7 +1820,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, */ if (ret > 0) { stripe->logical = full_stripe_start + - (i << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(i); stripe->dev = map->stripes[stripe_index].dev; stripe->mirror_num = 1; set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); @@ -2020,7 +2020,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); - return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; + return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes); } /* Get the logical bytenr for the stripe */ @@ -2036,7 +2036,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, * (stripe_index / sub_stripes) gives how many data stripes we need to * skip. */ - return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) + + return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) + bg->start; } @@ -2162,7 +2162,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); - offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; + offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes); goto out; } @@ -2177,7 +2177,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Initialize @offset in case we need to go to out: label */ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); - increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); /* * Due to the rotation, for RAID56 it's better to iterate each stripe diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 351ba9e90675..038dfa8f1788 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -857,10 +857,10 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, * * Thus it should be a good way to catch obvious bitflips. */ - if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) { + if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { chunk_err(leaf, chunk, logical, "chunk length too large: have %llu limit %llu", - length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT); + length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b8540af6e136..4193ace3fb5a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5135,7 +5135,7 @@ static void init_alloc_chunk_ctl_policy_regular( /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); - ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT; + ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes); } static void init_alloc_chunk_ctl_policy_zoned( @@ -5811,7 +5811,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); free_extent_map(em); } return len; @@ -5985,12 +5985,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr); stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> BTRFS_STRIPE_LEN_SHIFT; stripe_cnt = stripe_nr_end - stripe_nr; - stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) - + stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) - (offset + length); /* * after this, stripe_nr is the number of stripes on this @@ -6033,12 +6033,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, for (i = 0; i < *num_stripes; i++) { stripes[i].physical = map->stripes[stripe_index].physical + - stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT; + stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev); if (i / sub_stripes < remaining_stripes) stripes[i].length += BTRFS_STRIPE_LEN; @@ -6186,8 +6186,8 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, ASSERT(*stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = nr_data_stripes(map) << - BTRFS_STRIPE_LEN_SHIFT; + unsigned long full_stripe_len = + btrfs_stripe_nr_to_offset(nr_data_stripes(map)); /* * For full stripe start, we use previously calculated @@ -6199,9 +6199,11 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * not ensured to be power of 2. */ *full_stripe_start = - rounddown(*stripe_nr, nr_data_stripes(map)) << - BTRFS_STRIPE_LEN_SHIFT; + btrfs_stripe_nr_to_offset( + rounddown(*stripe_nr, nr_data_stripes(map))); + ASSERT(*full_stripe_start + full_stripe_len > offset); + ASSERT(*full_stripe_start <= offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. @@ -6224,7 +6226,7 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * { dst->dev = map->stripes[stripe_index].dev; dst->physical = map->stripes[stripe_index].physical + - stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); } int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -6345,7 +6347,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* Return the length to the full stripe end */ *length = min(logical + *length, raid56_full_stripe_start + em->start + - (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical; + btrfs_stripe_nr_to_offset(data_stripes)) - + logical; stripe_index = 0; stripe_offset = 0; } else { @@ -6435,7 +6438,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * modulo, to reduce one modulo call. */ bioc->full_stripe_logical = em->start + - ((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); for (i = 0; i < num_stripes; i++) set_io_stripe(&bioc->stripes[i], map, (i + stripe_nr) % num_stripes, @@ -8015,7 +8018,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc, for (i = 0; i < data_stripes; i++) { u64 stripe_start = bioc->full_stripe_logical + - (i << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(i); if (logical >= stripe_start && logical < stripe_start + BTRFS_STRIPE_LEN) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3930ee01d696..d44cb9d22d63 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -580,6 +580,17 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } +/* + * Do the type safe converstion from stripe_nr to offset inside the chunk. + * + * @stripe_nr is u32, with left shift it can overflow u32 for chunks larger + * than 4G. This does the proper type cast to avoid overflow. + */ +static inline u64 btrfs_stripe_nr_to_offset(u32 stripe_nr) +{ + return (u64)stripe_nr << BTRFS_STRIPE_LEN_SHIFT; +} + void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, diff --git a/fs/buffer.c b/fs/buffer.c index a7fc561758b1..fe64356e89b8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -111,7 +111,6 @@ void buffer_check_dirty_writeback(struct folio *folio, bh = bh->b_this_page; } while (bh != head); } -EXPORT_SYMBOL(buffer_check_dirty_writeback); /* * Block until a buffer comes unlocked. This doesn't stop it diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 82219a8f6084..d9d22d0ec38a 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -451,9 +451,10 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) ret = cachefiles_inject_write_error(); if (ret == 0) { - file = vfs_tmpfile_open(&nop_mnt_idmap, &parentpath, S_IFREG, - O_RDWR | O_LARGEFILE | O_DIRECT, - cache->cache_cred); + file = kernel_tmpfile_open(&nop_mnt_idmap, &parentpath, + S_IFREG | 0600, + O_RDWR | O_LARGEFILE | O_DIRECT, + cache->cache_cred); ret = PTR_ERR_OR_ZERO(file); } if (ret) { @@ -560,8 +561,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, */ path.mnt = cache->mnt; path.dentry = dentry; - file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_backing_inode(dentry), cache->cache_cred); + file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, + d_backing_inode(dentry), cache->cache_cred); if (IS_ERR(file)) { trace_cachefiles_vfs_error(object, d_backing_inode(dentry), PTR_ERR(file), diff --git a/fs/char_dev.c b/fs/char_dev.c index 13deb45f1ec6..950b6919fb87 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -150,7 +150,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; - strlcpy(cd->name, name, sizeof(cd->name)); + strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; diff --git a/fs/coredump.c b/fs/coredump.c index 88740c51b942..9d235fa14ab9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -648,7 +648,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) } else { struct mnt_idmap *idmap; struct inode *inode; - int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | + int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_LARGEFILE | O_EXCL; if (cprm.limit < binfmt->min_coredump) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 7ab5a7b7eef8..2d63da48635a 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -171,7 +171,7 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) */ struct fscrypt_symlink_data { __le16 len; - char encrypted_path[1]; + char encrypted_path[]; } __packed; /** diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 9e786ae66a13..6238dbcadcad 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -255,10 +255,10 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target, * for now since filesystems will assume it is there and subtract it. */ if (!__fscrypt_fname_encrypted_size(policy, len, - max_len - sizeof(struct fscrypt_symlink_data), + max_len - sizeof(struct fscrypt_symlink_data) - 1, &disk_link->len)) return -ENAMETOOLONG; - disk_link->len += sizeof(struct fscrypt_symlink_data); + disk_link->len += sizeof(struct fscrypt_symlink_data) + 1; disk_link->name = NULL; return 0; @@ -289,7 +289,7 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, if (!sd) return -ENOMEM; } - ciphertext_len = disk_link->len - sizeof(*sd); + ciphertext_len = disk_link->len - sizeof(*sd) - 1; sd->len = cpu_to_le16(ciphertext_len); err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path, @@ -367,7 +367,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, * the ciphertext length, even though this is redundant with i_size. */ - if (max_size < sizeof(*sd)) + if (max_size < sizeof(*sd) + 1) return ERR_PTR(-EUCLEAN); sd = caddr; cstr.name = (unsigned char *)sd->encrypted_path; @@ -376,7 +376,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, if (cstr.len == 0) return ERR_PTR(-EUCLEAN); - if (cstr.len + sizeof(*sd) - 1 > max_size) + if (cstr.len + sizeof(*sd) > max_size) return ERR_PTR(-EUCLEAN); err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); diff --git a/fs/d_path.c b/fs/d_path.c index 56a6ee4c6331..5f4da5c8d5db 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/prefetch.h> #include "mount.h" +#include "internal.h" struct prepend_buffer { char *buf; diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 26fa170090b8..b1b846504027 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -89,8 +89,7 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize); -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); +extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 7021e2cf6146..2a29943fa5cc 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -363,7 +363,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, return 0; } -static struct z_erofs_decompressor decompressors[] = { +const struct z_erofs_decompressor erofs_decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { .decompress = z_erofs_transform_plain, .name = "shifted" @@ -383,9 +383,3 @@ static struct z_erofs_decompressor decompressors[] = { }, #endif }; - -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) -{ - return decompressors[rq->alg].decompress(rq, pagepool); -} diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1e39c03357d1..36e32fa542f0 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -208,46 +208,12 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) - /* basic unit of the workstation of a super_block */ struct erofs_workgroup { - /* the workgroup index in the workstation */ pgoff_t index; - - /* overall workgroup reference count */ - atomic_t refcount; + struct lockref lockref; }; -static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, - int val) -{ - preempt_disable(); - if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) { - preempt_enable(); - return false; - } - return true; -} - -static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, - int orig_val) -{ - /* - * other observers should notice all modifications - * in the freezing period. - */ - smp_mb(); - atomic_set(&grp->refcount, orig_val); - preempt_enable(); -} - -static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) -{ - return atomic_cond_read_relaxed(&grp->refcount, - VAL != EROFS_LOCKED_MAGIC); -} - enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ EROFS_KMAP, /* use kmap_local_page() to map the buffer */ @@ -486,7 +452,7 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP -int erofs_workgroup_put(struct erofs_workgroup *grp); +void erofs_workgroup_put(struct erofs_workgroup *grp); struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, pgoff_t index); struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, @@ -500,7 +466,6 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); @@ -511,6 +476,7 @@ void erofs_put_pcpubuf(void *ptr); int erofs_pcpubuf_growsize(unsigned int nrpages); void __init erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); +int erofs_init_managed_cache(struct super_block *sb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -530,6 +496,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } static inline void erofs_pcpubuf_init(void) {} static inline void erofs_pcpubuf_exit(void) {} +static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 811ab66d805e..ff18f6b14de5 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -599,68 +599,6 @@ static int erofs_fc_parse_param(struct fs_context *fc, return 0; } -#ifdef CONFIG_EROFS_FS_ZIP -static const struct address_space_operations managed_cache_aops; - -static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp) -{ - bool ret = true; - struct address_space *const mapping = folio->mapping; - - DBG_BUGON(!folio_test_locked(folio)); - DBG_BUGON(mapping->a_ops != &managed_cache_aops); - - if (folio_test_private(folio)) - ret = erofs_try_to_free_cached_page(&folio->page); - - return ret; -} - -/* - * It will be called only on inode eviction. In case that there are still some - * decompression requests in progress, wait with rescheduling for a bit here. - * We could introduce an extra locking instead but it seems unnecessary. - */ -static void erofs_managed_cache_invalidate_folio(struct folio *folio, - size_t offset, size_t length) -{ - const size_t stop = length + offset; - - DBG_BUGON(!folio_test_locked(folio)); - - /* Check for potential overflow in debug mode */ - DBG_BUGON(stop > folio_size(folio) || stop < length); - - if (offset == 0 && stop == folio_size(folio)) - while (!erofs_managed_cache_release_folio(folio, GFP_NOFS)) - cond_resched(); -} - -static const struct address_space_operations managed_cache_aops = { - .release_folio = erofs_managed_cache_release_folio, - .invalidate_folio = erofs_managed_cache_invalidate_folio, -}; - -static int erofs_init_managed_cache(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct inode *const inode = new_inode(sb); - - if (!inode) - return -ENOMEM; - - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - - inode->i_mapping->a_ops = &managed_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - sbi->managed_cache = inode; - return 0; -} -#else -static int erofs_init_managed_cache(struct super_block *sb) { return 0; } -#endif - static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -1016,10 +954,8 @@ static int __init erofs_module_init(void) sizeof(struct erofs_inode), 0, SLAB_RECLAIM_ACCOUNT, erofs_inode_init_once); - if (!erofs_inode_cachep) { - err = -ENOMEM; - goto icache_err; - } + if (!erofs_inode_cachep) + return -ENOMEM; err = erofs_init_shrinker(); if (err) @@ -1054,7 +990,6 @@ lzma_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); -icache_err: return err; } diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 46627cb69abe..cc6fb9e98899 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -4,7 +4,6 @@ * https://www.huawei.com/ */ #include "internal.h" -#include <linux/pagevec.h> struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) { @@ -33,22 +32,21 @@ void erofs_release_pages(struct page **pagepool) /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; -static int erofs_workgroup_get(struct erofs_workgroup *grp) +static bool erofs_workgroup_get(struct erofs_workgroup *grp) { - int o; + if (lockref_get_not_zero(&grp->lockref)) + return true; -repeat: - o = erofs_wait_on_workgroup_freezed(grp); - if (o <= 0) - return -1; - - if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o) - goto repeat; + spin_lock(&grp->lockref.lock); + if (__lockref_is_dead(&grp->lockref)) { + spin_unlock(&grp->lockref.lock); + return false; + } - /* decrease refcount paired by erofs_workgroup_put */ - if (o == 1) + if (!grp->lockref.count++) atomic_long_dec(&erofs_global_shrink_cnt); - return 0; + spin_unlock(&grp->lockref.lock); + return true; } struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, @@ -61,7 +59,7 @@ repeat: rcu_read_lock(); grp = xa_load(&sbi->managed_pslots, index); if (grp) { - if (erofs_workgroup_get(grp)) { + if (!erofs_workgroup_get(grp)) { /* prefer to relax rcu read side */ rcu_read_unlock(); goto repeat; @@ -80,11 +78,10 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, struct erofs_workgroup *pre; /* - * Bump up a reference count before making this visible - * to others for the XArray in order to avoid potential - * UAF without serialized by xa_lock. + * Bump up before making this visible to others for the XArray in order + * to avoid potential UAF without serialized by xa_lock. */ - atomic_inc(&grp->refcount); + lockref_get(&grp->lockref); repeat: xa_lock(&sbi->managed_pslots); @@ -93,13 +90,13 @@ repeat: if (pre) { if (xa_is_err(pre)) { pre = ERR_PTR(xa_err(pre)); - } else if (erofs_workgroup_get(pre)) { + } else if (!erofs_workgroup_get(pre)) { /* try to legitimize the current in-tree one */ xa_unlock(&sbi->managed_pslots); cond_resched(); goto repeat; } - atomic_dec(&grp->refcount); + lockref_put_return(&grp->lockref); grp = pre; } xa_unlock(&sbi->managed_pslots); @@ -112,38 +109,34 @@ static void __erofs_workgroup_free(struct erofs_workgroup *grp) erofs_workgroup_free_rcu(grp); } -int erofs_workgroup_put(struct erofs_workgroup *grp) +void erofs_workgroup_put(struct erofs_workgroup *grp) { - int count = atomic_dec_return(&grp->refcount); + if (lockref_put_or_lock(&grp->lockref)) + return; - if (count == 1) + DBG_BUGON(__lockref_is_dead(&grp->lockref)); + if (grp->lockref.count == 1) atomic_long_inc(&erofs_global_shrink_cnt); - else if (!count) - __erofs_workgroup_free(grp); - return count; + --grp->lockref.count; + spin_unlock(&grp->lockref.lock); } static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, struct erofs_workgroup *grp) { - /* - * If managed cache is on, refcount of workgroups - * themselves could be < 0 (freezed). In other words, - * there is no guarantee that all refcounts > 0. - */ - if (!erofs_workgroup_try_to_freeze(grp, 1)) - return false; + int free = false; + + spin_lock(&grp->lockref.lock); + if (grp->lockref.count) + goto out; /* - * Note that all cached pages should be unattached - * before deleted from the XArray. Otherwise some - * cached pages could be still attached to the orphan - * old workgroup when the new one is available in the tree. + * Note that all cached pages should be detached before deleted from + * the XArray. Otherwise some cached pages could be still attached to + * the orphan old workgroup when the new one is available in the tree. */ - if (erofs_try_to_free_all_cached_pages(sbi, grp)) { - erofs_workgroup_unfreeze(grp, 1); - return false; - } + if (erofs_try_to_free_all_cached_pages(sbi, grp)) + goto out; /* * It's impossible to fail after the workgroup is freezed, @@ -152,10 +145,13 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, */ DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); - /* last refcount should be connected with its managed pslot. */ - erofs_workgroup_unfreeze(grp, 0); - __erofs_workgroup_free(grp); - return true; + lockref_mark_dead(&grp->lockref); + free = true; +out: + spin_unlock(&grp->lockref.lock); + if (free) + __erofs_workgroup_free(grp); + return free; } static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index bbfe7ce170d2..40178b6e0688 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -7,32 +7,27 @@ #include <linux/security.h> #include "xattr.h" -static inline erofs_blk_t erofs_xattr_blkaddr(struct super_block *sb, - unsigned int xattr_id) -{ - return EROFS_SB(sb)->xattr_blkaddr + - erofs_blknr(sb, xattr_id * sizeof(__u32)); -} - -static inline unsigned int erofs_xattr_blkoff(struct super_block *sb, - unsigned int xattr_id) -{ - return erofs_blkoff(sb, xattr_id * sizeof(__u32)); -} - -struct xattr_iter { +struct erofs_xattr_iter { struct super_block *sb; struct erofs_buf buf; + erofs_off_t pos; void *kaddr; - erofs_blk_t blkaddr; - unsigned int ofs; + char *buffer; + int buffer_size, buffer_ofs; + + /* getxattr */ + int index, infix_len; + struct qstr name; + + /* listxattr */ + struct dentry *dentry; }; static int erofs_init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); - struct xattr_iter it; + struct erofs_xattr_iter it; unsigned int i; struct erofs_xattr_ibody_header *ih; struct super_block *sb = inode->i_sb; @@ -81,17 +76,17 @@ static int erofs_init_inode_xattrs(struct inode *inode) } it.buf = __EROFS_BUF_INITIALIZER; - it.blkaddr = erofs_blknr(sb, erofs_iloc(inode) + vi->inode_isize); - it.ofs = erofs_blkoff(sb, erofs_iloc(inode) + vi->inode_isize); + erofs_init_metabuf(&it.buf, sb); + it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; } - ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + ih = it.kaddr + erofs_blkoff(sb, it.pos); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); @@ -102,26 +97,20 @@ static int erofs_init_inode_xattrs(struct inode *inode) } /* let's skip ibody header */ - it.ofs += sizeof(struct erofs_xattr_ibody_header); + it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - if (it.ofs >= sb->s_blocksize) { - /* cannot be unaligned */ - DBG_BUGON(it.ofs != sb->s_blocksize); - - it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr, - EROFS_KMAP); - if (IS_ERR(it.kaddr)) { - kfree(vi->xattr_shared_xattrs); - vi->xattr_shared_xattrs = NULL; - ret = PTR_ERR(it.kaddr); - goto out_unlock; - } - it.ofs = 0; + it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), + EROFS_KMAP); + if (IS_ERR(it.kaddr)) { + kfree(vi->xattr_shared_xattrs); + vi->xattr_shared_xattrs = NULL; + ret = PTR_ERR(it.kaddr); + goto out_unlock; } - vi->xattr_shared_xattrs[i] = - le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); - it.ofs += sizeof(__le32); + vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *) + (it.kaddr + erofs_blkoff(sb, it.pos))); + it.pos += sizeof(__le32); } erofs_put_metabuf(&it.buf); @@ -134,287 +123,6 @@ out_unlock: return ret; } -/* - * the general idea for these return values is - * if 0 is returned, go on processing the current xattr; - * 1 (> 0) is returned, skip this round to process the next xattr; - * -err (< 0) is returned, an error (maybe ENOXATTR) occurred - * and need to be handled - */ -struct xattr_iter_handlers { - int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); - int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, - unsigned int len); - int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz); - void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, - unsigned int len); -}; - -static inline int xattr_iter_fixup(struct xattr_iter *it) -{ - if (it->ofs < it->sb->s_blocksize) - return 0; - - it->blkaddr += erofs_blknr(it->sb, it->ofs); - it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, - EROFS_KMAP); - if (IS_ERR(it->kaddr)) - return PTR_ERR(it->kaddr); - it->ofs = erofs_blkoff(it->sb, it->ofs); - return 0; -} - -static int inline_xattr_iter_begin(struct xattr_iter *it, - struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - unsigned int xattr_header_sz, inline_xattr_ofs; - - xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + - sizeof(u32) * vi->xattr_shared_count; - if (xattr_header_sz >= vi->xattr_isize) { - DBG_BUGON(xattr_header_sz > vi->xattr_isize); - return -ENOATTR; - } - - inline_xattr_ofs = vi->inode_isize + xattr_header_sz; - - it->blkaddr = erofs_blknr(it->sb, erofs_iloc(inode) + inline_xattr_ofs); - it->ofs = erofs_blkoff(it->sb, erofs_iloc(inode) + inline_xattr_ofs); - it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, - EROFS_KMAP); - if (IS_ERR(it->kaddr)) - return PTR_ERR(it->kaddr); - return vi->xattr_isize - xattr_header_sz; -} - -/* - * Regardless of success or failure, `xattr_foreach' will end up with - * `ofs' pointing to the next xattr item rather than an arbitrary position. - */ -static int xattr_foreach(struct xattr_iter *it, - const struct xattr_iter_handlers *op, - unsigned int *tlimit) -{ - struct erofs_xattr_entry entry; - unsigned int value_sz, processed, slice; - int err; - - /* 0. fixup blkaddr, ofs, ipage */ - err = xattr_iter_fixup(it); - if (err) - return err; - - /* - * 1. read xattr entry to the memory, - * since we do EROFS_XATTR_ALIGN - * therefore entry should be in the page - */ - entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); - if (tlimit) { - unsigned int entry_sz = erofs_xattr_entry_size(&entry); - - /* xattr on-disk corruption: xattr entry beyond xattr_isize */ - if (*tlimit < entry_sz) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - *tlimit -= entry_sz; - } - - it->ofs += sizeof(struct erofs_xattr_entry); - value_sz = le16_to_cpu(entry.e_value_size); - - /* handle entry */ - err = op->entry(it, &entry); - if (err) { - it->ofs += entry.e_name_len + value_sz; - goto out; - } - - /* 2. handle xattr name (ofs will finally be at the end of name) */ - processed = 0; - - while (processed < entry.e_name_len) { - if (it->ofs >= it->sb->s_blocksize) { - DBG_BUGON(it->ofs > it->sb->s_blocksize); - - err = xattr_iter_fixup(it); - if (err) - goto out; - it->ofs = 0; - } - - slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs, - entry.e_name_len - processed); - - /* handle name */ - err = op->name(it, processed, it->kaddr + it->ofs, slice); - if (err) { - it->ofs += entry.e_name_len - processed + value_sz; - goto out; - } - - it->ofs += slice; - processed += slice; - } - - /* 3. handle xattr value */ - processed = 0; - - if (op->alloc_buffer) { - err = op->alloc_buffer(it, value_sz); - if (err) { - it->ofs += value_sz; - goto out; - } - } - - while (processed < value_sz) { - if (it->ofs >= it->sb->s_blocksize) { - DBG_BUGON(it->ofs > it->sb->s_blocksize); - - err = xattr_iter_fixup(it); - if (err) - goto out; - it->ofs = 0; - } - - slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs, - value_sz - processed); - op->value(it, processed, it->kaddr + it->ofs, slice); - it->ofs += slice; - processed += slice; - } - -out: - /* xattrs should be 4-byte aligned (on-disk constraint) */ - it->ofs = EROFS_XATTR_ALIGN(it->ofs); - return err < 0 ? err : 0; -} - -struct getxattr_iter { - struct xattr_iter it; - - char *buffer; - int buffer_size, index, infix_len; - struct qstr name; -}; - -static int erofs_xattr_long_entrymatch(struct getxattr_iter *it, - struct erofs_xattr_entry *entry) -{ - struct erofs_sb_info *sbi = EROFS_SB(it->it.sb); - struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + - (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); - - if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) - return -ENOATTR; - - if (it->index != pf->prefix->base_index || - it->name.len != entry->e_name_len + pf->infix_len) - return -ENOATTR; - - if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) - return -ENOATTR; - - it->infix_len = pf->infix_len; - return 0; -} - -static int xattr_entrymatch(struct xattr_iter *_it, - struct erofs_xattr_entry *entry) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - /* should also match the infix for long name prefixes */ - if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) - return erofs_xattr_long_entrymatch(it, entry); - - if (it->index != entry->e_name_index || - it->name.len != entry->e_name_len) - return -ENOATTR; - it->infix_len = 0; - return 0; -} - -static int xattr_namematch(struct xattr_iter *_it, - unsigned int processed, char *buf, unsigned int len) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - if (memcmp(buf, it->name.name + it->infix_len + processed, len)) - return -ENOATTR; - return 0; -} - -static int xattr_checkbuffer(struct xattr_iter *_it, - unsigned int value_sz) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - int err = it->buffer_size < value_sz ? -ERANGE : 0; - - it->buffer_size = value_sz; - return !it->buffer ? 1 : err; -} - -static void xattr_copyvalue(struct xattr_iter *_it, - unsigned int processed, - char *buf, unsigned int len) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - memcpy(it->buffer + processed, buf, len); -} - -static const struct xattr_iter_handlers find_xattr_handlers = { - .entry = xattr_entrymatch, - .name = xattr_namematch, - .alloc_buffer = xattr_checkbuffer, - .value = xattr_copyvalue -}; - -static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) -{ - int ret; - unsigned int remaining; - - ret = inline_xattr_iter_begin(&it->it, inode); - if (ret < 0) - return ret; - - remaining = ret; - while (remaining) { - ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); - if (ret != -ENOATTR) - break; - } - return ret ? ret : it->buffer_size; -} - -static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = it->it.sb; - unsigned int i, xsid; - int ret = -ENOATTR; - - for (i = 0; i < vi->xattr_shared_count; ++i) { - xsid = vi->xattr_shared_xattrs[i]; - it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid); - it->it.ofs = erofs_xattr_blkoff(sb, xsid); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, - it->it.blkaddr, EROFS_KMAP); - if (IS_ERR(it->it.kaddr)) - return PTR_ERR(it->it.kaddr); - - ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); - if (ret != -ENOATTR) - break; - } - return ret ? ret : it->buffer_size; -} - static bool erofs_xattr_user_list(struct dentry *dentry) { return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); @@ -425,39 +133,6 @@ static bool erofs_xattr_trusted_list(struct dentry *dentry) return capable(CAP_SYS_ADMIN); } -int erofs_getxattr(struct inode *inode, int index, - const char *name, - void *buffer, size_t buffer_size) -{ - int ret; - struct getxattr_iter it; - - if (!name) - return -EINVAL; - - ret = erofs_init_inode_xattrs(inode); - if (ret) - return ret; - - it.index = index; - it.name.len = strlen(name); - if (it.name.len > EROFS_NAME_LEN) - return -ERANGE; - - it.it.buf = __EROFS_BUF_INITIALIZER; - it.name.name = name; - - it.buffer = buffer; - it.buffer_size = buffer_size; - - it.it.sb = inode->i_sb; - ret = inline_getxattr(inode, &it); - if (ret == -ENOATTR) - ret = shared_getxattr(inode, &it); - erofs_put_metabuf(&it.it.buf); - return ret; -} - static int erofs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) @@ -500,30 +175,49 @@ const struct xattr_handler *erofs_xattr_handlers[] = { NULL, }; -struct listxattr_iter { - struct xattr_iter it; - - struct dentry *dentry; - char *buffer; - int buffer_size, buffer_ofs; -}; +static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, + unsigned int len) +{ + unsigned int slice, processed; + struct super_block *sb = it->sb; + void *src; + + for (processed = 0; processed < len; processed += slice) { + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + src = it->kaddr + erofs_blkoff(sb, it->pos); + slice = min_t(unsigned int, sb->s_blocksize - + erofs_blkoff(sb, it->pos), len - processed); + memcpy(it->buffer + it->buffer_ofs, src, slice); + it->buffer_ofs += slice; + it->pos += slice; + } + return 0; +} -static int xattr_entrylist(struct xattr_iter *_it, - struct erofs_xattr_entry *entry) +static int erofs_listxattr_foreach(struct erofs_xattr_iter *it) { - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); - unsigned int base_index = entry->e_name_index; - unsigned int prefix_len, infix_len = 0; + struct erofs_xattr_entry entry; + unsigned int base_index, name_total, prefix_len, infix_len = 0; const char *prefix, *infix = NULL; + int err; + + /* 1. handle xattr entry */ + entry = *(struct erofs_xattr_entry *) + (it->kaddr + erofs_blkoff(it->sb, it->pos)); + it->pos += sizeof(struct erofs_xattr_entry); - if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) { - struct erofs_sb_info *sbi = EROFS_SB(_it->sb); + base_index = entry.e_name_index; + if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(it->sb); struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + - (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) - return 1; + return 0; infix = pf->prefix->infix; infix_len = pf->infix_len; base_index = pf->prefix->base_index; @@ -531,120 +225,228 @@ static int xattr_entrylist(struct xattr_iter *_it, prefix = erofs_xattr_prefix(base_index, it->dentry); if (!prefix) - return 1; + return 0; prefix_len = strlen(prefix); + name_total = prefix_len + infix_len + entry.e_name_len + 1; if (!it->buffer) { - it->buffer_ofs += prefix_len + infix_len + - entry->e_name_len + 1; - return 1; + it->buffer_ofs += name_total; + return 0; } - if (it->buffer_ofs + prefix_len + infix_len + - + entry->e_name_len + 1 > it->buffer_size) + if (it->buffer_ofs + name_total > it->buffer_size) return -ERANGE; memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len); it->buffer_ofs += prefix_len + infix_len; - return 0; -} -static int xattr_namelist(struct xattr_iter *_it, - unsigned int processed, char *buf, unsigned int len) -{ - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); + /* 2. handle xattr name */ + err = erofs_xattr_copy_to_buffer(it, entry.e_name_len); + if (err) + return err; - memcpy(it->buffer + it->buffer_ofs, buf, len); - it->buffer_ofs += len; + it->buffer[it->buffer_ofs++] = '\0'; return 0; } -static int xattr_skipvalue(struct xattr_iter *_it, - unsigned int value_sz) +static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) { - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); + struct super_block *sb = it->sb; + struct erofs_xattr_entry entry; + unsigned int slice, processed, value_sz; - it->buffer[it->buffer_ofs++] = '\0'; - return 1; -} + /* 1. handle xattr entry */ + entry = *(struct erofs_xattr_entry *) + (it->kaddr + erofs_blkoff(sb, it->pos)); + it->pos += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); -static const struct xattr_iter_handlers list_xattr_handlers = { - .entry = xattr_entrylist, - .name = xattr_namelist, - .alloc_buffer = xattr_skipvalue, - .value = NULL -}; + /* should also match the infix for long name prefixes */ + if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return -ENOATTR; + + if (it->index != pf->prefix->base_index || + it->name.len != entry.e_name_len + pf->infix_len) + return -ENOATTR; + + if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) + return -ENOATTR; + + it->infix_len = pf->infix_len; + } else { + if (it->index != entry.e_name_index || + it->name.len != entry.e_name_len) + return -ENOATTR; -static int inline_listxattr(struct listxattr_iter *it) + it->infix_len = 0; + } + + /* 2. handle xattr name */ + for (processed = 0; processed < entry.e_name_len; processed += slice) { + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + slice = min_t(unsigned int, + sb->s_blocksize - erofs_blkoff(sb, it->pos), + entry.e_name_len - processed); + if (memcmp(it->name.name + it->infix_len + processed, + it->kaddr + erofs_blkoff(sb, it->pos), slice)) + return -ENOATTR; + it->pos += slice; + } + + /* 3. handle xattr value */ + if (!it->buffer) { + it->buffer_ofs = value_sz; + return 0; + } + + if (it->buffer_size < value_sz) + return -ERANGE; + + return erofs_xattr_copy_to_buffer(it, value_sz); +} + +static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, + struct inode *inode, bool getxattr) { + struct erofs_inode *const vi = EROFS_I(inode); + unsigned int xattr_header_sz, remaining, entry_sz; + erofs_off_t next_pos; int ret; - unsigned int remaining; - ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); - if (ret < 0) - return ret; + xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * vi->xattr_shared_count; + if (xattr_header_sz >= vi->xattr_isize) { + DBG_BUGON(xattr_header_sz > vi->xattr_isize); + return -ENOATTR; + } + + remaining = vi->xattr_isize - xattr_header_sz; + it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; - remaining = ret; while (remaining) { - ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); - if (ret) + it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + entry_sz = erofs_xattr_entry_size(it->kaddr + + erofs_blkoff(it->sb, it->pos)); + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (remaining < entry_sz) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + remaining -= entry_sz; + next_pos = it->pos + entry_sz; + + if (getxattr) + ret = erofs_getxattr_foreach(it); + else + ret = erofs_listxattr_foreach(it); + if ((getxattr && ret != -ENOATTR) || (!getxattr && ret)) break; + + it->pos = next_pos; } - return ret ? ret : it->buffer_ofs; + return ret; } -static int shared_listxattr(struct listxattr_iter *it) +static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, + struct inode *inode, bool getxattr) { - struct inode *const inode = d_inode(it->dentry); struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = it->it.sb; - unsigned int i, xsid; - int ret = 0; + struct super_block *const sb = it->sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int i; + int ret = -ENOATTR; for (i = 0; i < vi->xattr_shared_count; ++i) { - xsid = vi->xattr_shared_xattrs[i]; - it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid); - it->it.ofs = erofs_xattr_blkoff(sb, xsid); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, - it->it.blkaddr, EROFS_KMAP); - if (IS_ERR(it->it.kaddr)) - return PTR_ERR(it->it.kaddr); - - ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); - if (ret) + it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + + vi->xattr_shared_xattrs[i] * sizeof(__le32); + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + if (getxattr) + ret = erofs_getxattr_foreach(it); + else + ret = erofs_listxattr_foreach(it); + if ((getxattr && ret != -ENOATTR) || (!getxattr && ret)) break; } - return ret ? ret : it->buffer_ofs; + return ret; +} + +int erofs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size) +{ + int ret; + struct erofs_xattr_iter it; + + if (!name) + return -EINVAL; + + ret = erofs_init_inode_xattrs(inode); + if (ret) + return ret; + + it.index = index; + it.name = (struct qstr)QSTR_INIT(name, strlen(name)); + if (it.name.len > EROFS_NAME_LEN) + return -ERANGE; + + it.sb = inode->i_sb; + it.buf = __EROFS_BUF_INITIALIZER; + erofs_init_metabuf(&it.buf, it.sb); + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + ret = erofs_xattr_iter_inline(&it, inode, true); + if (ret == -ENOATTR) + ret = erofs_xattr_iter_shared(&it, inode, true); + erofs_put_metabuf(&it.buf); + return ret ? ret : it.buffer_ofs; } -ssize_t erofs_listxattr(struct dentry *dentry, - char *buffer, size_t buffer_size) +ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret; - struct listxattr_iter it; + struct erofs_xattr_iter it; + struct inode *inode = d_inode(dentry); - ret = erofs_init_inode_xattrs(d_inode(dentry)); + ret = erofs_init_inode_xattrs(inode); if (ret == -ENOATTR) return 0; if (ret) return ret; - it.it.buf = __EROFS_BUF_INITIALIZER; + it.sb = dentry->d_sb; + it.buf = __EROFS_BUF_INITIALIZER; + erofs_init_metabuf(&it.buf, it.sb); it.dentry = dentry; it.buffer = buffer; it.buffer_size = buffer_size; it.buffer_ofs = 0; - it.it.sb = dentry->d_sb; - - ret = inline_listxattr(&it); - if (ret >= 0 || ret == -ENOATTR) - ret = shared_listxattr(&it); - erofs_put_metabuf(&it.it.buf); - return ret; + ret = erofs_xattr_iter_inline(&it, inode, false); + if (!ret || ret == -ENOATTR) + ret = erofs_xattr_iter_shared(&it, inode, false); + if (ret == -ENOATTR) + ret = 0; + erofs_put_metabuf(&it.buf); + return ret ? ret : it.buffer_ofs; } void erofs_xattr_prefixes_cleanup(struct super_block *sb) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 160b3da43aec..5f1890e309c6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -5,7 +5,6 @@ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" -#include <linux/prefetch.h> #include <linux/psi.h> #include <linux/cpuhotplug.h> #include <trace/events/erofs.h> @@ -92,13 +91,8 @@ struct z_erofs_pcluster { struct z_erofs_bvec compressed_bvecs[]; }; -/* let's avoid the valid 32-bit kernel addresses */ - -/* the chained workgroup has't submitted io (still open) */ -#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) -/* the chained workgroup has already submitted io */ -#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) - +/* the end of a chain of pclusters */ +#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) #define Z_EROFS_PCLUSTER_NIL (NULL) struct z_erofs_decompressqueue { @@ -241,14 +235,20 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, - struct page **candidate_bvpage) + struct page **candidate_bvpage, + struct page **pagepool) { - if (iter->cur == iter->nr) { - if (!*candidate_bvpage) - return -EAGAIN; - + if (iter->cur >= iter->nr) { + struct page *nextpage = *candidate_bvpage; + + if (!nextpage) { + nextpage = erofs_allocpage(pagepool, GFP_NOFS); + if (!nextpage) + return -ENOMEM; + set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); + } DBG_BUGON(iter->bvset->nextpage); - iter->bvset->nextpage = *candidate_bvpage; + iter->bvset->nextpage = nextpage; z_erofs_bvset_flip(iter); iter->bvset->nextpage = NULL; @@ -500,20 +500,6 @@ out_error_pcluster_pool: enum z_erofs_pclustermode { Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current pclusters was the tail of an exist chain, in addition - * that the previous processed chained pclusters are all decided to - * be hooked up to it. - * A new chain will be created for the remaining pclusters which are - * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, - * the next pcluster cannot reuse the whole page safely for inplace I/O - * in the following scenario: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (belongs to the next pcl) | (belongs to the current pcl) | - * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| - */ - Z_EROFS_PCLUSTER_HOOKED, - /* * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or @@ -530,8 +516,8 @@ enum z_erofs_pclustermode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PCLUSTER_FOLLOWED or | | - * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| + * | | | + * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. ] */ @@ -543,12 +529,12 @@ struct z_erofs_decompress_frontend { struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; + struct page *pagepool; struct page *candidate_bvpage; - struct z_erofs_pcluster *pcl, *tailpcl; + struct z_erofs_pcluster *pcl; z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -578,8 +564,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - struct page **pagepool) +static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; @@ -620,7 +605,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * succeeds or fallback to in-place I/O instead * to avoid any direct reclaim. */ - newpage = erofs_allocpage(pagepool, gfp); + newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); @@ -633,7 +618,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (page) put_page(page); else if (newpage) - erofs_pagepool_add(pagepool, newpage); + erofs_pagepool_add(&fe->pagepool, newpage); } /* @@ -654,7 +639,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); /* - * refcount of workgroup is now freezed as 1, + * refcount of workgroup is now freezed as 0, * therefore no need to worry about available decompression users. */ for (i = 0; i < pcl->pclusterpages; ++i) { @@ -678,29 +663,73 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, return 0; } -int erofs_try_to_free_cached_page(struct page *page) +static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { - struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret, i; + struct z_erofs_pcluster *pcl = folio_get_private(folio); + bool ret; + int i; - if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) - return 0; + if (!folio_test_private(folio)) + return true; + + ret = false; + spin_lock(&pcl->obj.lockref.lock); + if (pcl->obj.lockref.count > 0) + goto out; - ret = 0; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_bvecs[i].page == page) { + if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - ret = 1; + ret = true; break; } } - erofs_workgroup_unfreeze(&pcl->obj, 1); if (ret) - detach_page_private(page); + folio_detach_private(folio); +out: + spin_unlock(&pcl->obj.lockref.lock); return ret; } +/* + * It will be called only on inode eviction. In case that there are still some + * decompression requests in progress, wait with rescheduling for a bit here. + * An extra lock could be introduced instead but it seems unnecessary. + */ +static void z_erofs_cache_invalidate_folio(struct folio *folio, + size_t offset, size_t length) +{ + const size_t stop = length + offset; + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > folio_size(folio) || stop < length); + + if (offset == 0 && stop == folio_size(folio)) + while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations z_erofs_cache_aops = { + .release_folio = z_erofs_cache_release_folio, + .invalidate_folio = z_erofs_cache_invalidate_folio, +}; + +int erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *const inode = new_inode(sb); + + if (!inode) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &z_erofs_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + EROFS_SB(sb)->managed_cache = inode; + return 0; +} + static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec) { @@ -731,7 +760,8 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } - ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, + &fe->pagepool); fe->pcl->vcnt += (ret >= 0); return ret; } @@ -750,19 +780,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) return; } - /* - * type 2, link to the end of an existing open chain, be careful - * that its submission is controlled by the original attached chain. - */ - if (*owned_head != &pcl->next && pcl != f->tailpcl && - cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - *owned_head) == Z_EROFS_PCLUSTER_TAIL) { - *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = Z_EROFS_PCLUSTER_HOOKED; - f->tailpcl = NULL; - return; - } - /* type 3, it belongs to a chain, but it isn't the end of the chain */ + /* type 2, it belongs to an ongoing chain */ f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } @@ -786,7 +804,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (IS_ERR(pcl)) return PTR_ERR(pcl); - atomic_set(&pcl->obj.refcount, 1); + spin_lock_init(&pcl->obj.lockref.lock); pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; @@ -823,9 +841,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) goto err_out; } } - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = pcl; fe->owned_head = &pcl->next; fe->pcl = pcl; return 0; @@ -846,7 +861,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); - DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); if (!(map->m_flags & EROFS_MAP_META)) { grp = erofs_find_workgroup(fe->inode->i_sb, @@ -865,10 +879,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = fe->pcl; - z_erofs_try_to_claim_pcluster(fe); } else if (ret) { return ret; @@ -908,10 +918,8 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); - if (fe->candidate_bvpage) { - DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - } /* * if all pending pages are added, don't hold its reference @@ -958,7 +966,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct page **pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; @@ -1016,7 +1024,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe, pagepool); + z_erofs_bind_cache(fe); } hitted: /* @@ -1025,8 +1033,7 @@ hitted: * those chains are handled asynchronously thus the page cannot be used * for inplace I/O or bvpage (should be processed in a strict order.) */ - tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && - fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -1056,24 +1063,13 @@ hitted: if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); -retry: err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { .page = page, .offset = offset - map->m_la, .end = end, }), exclusive); - /* should allocate an additional short-lived page for bvset */ - if (err == -EAGAIN && !fe->candidate_bvpage) { - fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); - set_page_private(fe->candidate_bvpage, - Z_EROFS_SHORTLIVED_PAGE); - goto retry; - } - - if (err) { - DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + if (err) goto out; - } z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ @@ -1104,7 +1100,7 @@ out: return err; } -static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, +static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, unsigned int readahead_pages) { /* auto: enable for read_folio, disable for readahead */ @@ -1283,6 +1279,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + const struct z_erofs_decompressor *decompressor = + &erofs_decompressors[pcl->algorithmformat]; unsigned int i, inputsize; int err2; struct page *page; @@ -1326,7 +1324,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, else inputsize = pclusterpages * PAGE_SIZE; - err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, @@ -1404,10 +1402,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, }; z_erofs_next_pcluster_t owned = io->head; - while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ - DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ + while (owned != Z_EROFS_PCLUSTER_TAIL) { DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); be.pcl = container_of(owned, struct z_erofs_pcluster, next); @@ -1424,7 +1419,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) container_of(work, struct z_erofs_decompressqueue, u.work); struct page *pagepool = NULL; - DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL); z_erofs_decompress_queue(bgq, &pagepool); erofs_release_pages(&pagepool); kvfree(bgq); @@ -1452,7 +1447,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; /* Use (kthread_)work and sync decompression for atomic contexts only */ - if (in_atomic() || irqs_disabled()) { + if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) { #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD struct kthread_worker *worker; @@ -1612,7 +1607,7 @@ fg_out: q->sync = true; } q->sb = sb; - q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + q->head = Z_EROFS_PCLUSTER_TAIL; return q; } @@ -1630,11 +1625,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (owned_head == Z_EROFS_PCLUSTER_TAIL) - owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED; - - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED); + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); WRITE_ONCE(*submit_qtail, owned_head); WRITE_ONCE(*bypass_qtail, &pcl->next); @@ -1668,9 +1659,8 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, struct z_erofs_decompressqueue *fgq, - bool *force_fg) + bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); @@ -1705,15 +1695,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, unsigned int i = 0; bool bypass = true; - /* no possible 'owned_head' equals the following */ - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned_head, struct z_erofs_pcluster, next); + owned_head = READ_ONCE(pcl->next); - /* close the main owned chain at first */ - owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - Z_EROFS_PCLUSTER_TAIL_CLOSED); if (z_erofs_is_inline_pcluster(pcl)) { move_to_bypass_jobqueue(pcl, qtail, owned_head); continue; @@ -1731,8 +1716,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct page *page; - page = pickup_page_for_submission(pcl, i++, pagepool, - mc); + page = pickup_page_for_submission(pcl, i++, + &f->pagepool, mc); if (!page) continue; @@ -1761,7 +1746,7 @@ submit_bio_retry: bio->bi_iter.bi_sector = (sector_t)cur << (sb->s_blocksize_bits - 9); bio->bi_private = q[JQ_SUBMIT]; - if (f->readahead) + if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } @@ -1797,16 +1782,16 @@ submit_bio_retry: } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg) + bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); + z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) return; @@ -1815,7 +1800,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); + z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); } /* @@ -1823,29 +1808,28 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, * approximate readmore strategies as a start. */ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, - struct readahead_control *rac, - erofs_off_t end, - struct page **pagepool, - bool backmost) + struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; - erofs_off_t cur; + erofs_off_t cur, end, headoffset = f->headoffset; int err; if (backmost) { + if (rac) + end = headoffset + readahead_length(rac) - 1; + else + end = headoffset + PAGE_SIZE - 1; map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); if (err) return; - /* expend ra for the trailing edge if readahead */ + /* expand ra for the trailing edge if readahead */ if (rac) { - loff_t newstart = readahead_pos(rac); - cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); - readahead_expand(rac, newstart, cur - newstart); + readahead_expand(rac, headoffset, cur - headoffset); return; } end = round_up(end, PAGE_SIZE); @@ -1866,7 +1850,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) { unlock_page(page); } else { - err = z_erofs_do_read_page(f, page, pagepool); + err = z_erofs_do_read_page(f, page); if (err) erofs_err(inode->i_sb, "readmore error at page %lu @ nid %llu", @@ -1887,28 +1871,24 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) struct inode *const inode = page->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL; int err; trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, - &pagepool, true); - err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); - + z_erofs_pcluster_readmore(&f, NULL, true); + err = z_erofs_do_read_page(&f, page); + z_erofs_pcluster_readmore(&f, NULL, false); (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, 0)); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); return err; } @@ -1917,14 +1897,12 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL, *head = NULL, *page; + struct page *head = NULL, *page; unsigned int nr_pages; - f.readahead = true; f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, f.headoffset + - readahead_length(rac) - 1, &pagepool, true); + z_erofs_pcluster_readmore(&f, rac, true); nr_pages = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); @@ -1940,20 +1918,19 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", page->index, EROFS_I(inode)->nid); put_page(page); } - z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); + z_erofs_pcluster_readmore(&f, rac, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, nr_pages)); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); } const struct address_space_operations z_erofs_aops = { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index d37c5c89c728..1909ddafd9c7 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -22,8 +22,8 @@ struct z_erofs_maprecorder { bool partialref; }; -static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn) +static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); @@ -129,7 +129,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, u8 *in, type; bool big_pcluster; - if (1 << amortizedshift == 4) + if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; else if (1 << amortizedshift == 2 && lclusterbits == 12) vcnt = 16; @@ -226,12 +226,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, return 0; } -static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn, bool lookahead) +static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn, bool lookahead) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const unsigned int lclusterbits = vi->z_logical_clusterbits; const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); unsigned int totalidx = erofs_iblks(inode); @@ -239,9 +238,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int amortizedshift; erofs_off_t pos; - if (lclusterbits != 12) - return -EOPNOTSUPP; - if (lcn >= totalidx) return -EINVAL; @@ -281,23 +277,23 @@ out: return unpack_compacted_index(m, amortizedshift, pos, lookahead); } -static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned int lcn, bool lookahead) +static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn, bool lookahead) { - const unsigned int datamode = EROFS_I(m->inode)->datalayout; - - if (datamode == EROFS_INODE_COMPRESSED_FULL) - return legacy_load_cluster_from_disk(m, lcn); - - if (datamode == EROFS_INODE_COMPRESSED_COMPACT) - return compacted_load_cluster_from_disk(m, lcn, lookahead); - - return -EINVAL; + switch (EROFS_I(m->inode)->datalayout) { + case EROFS_INODE_COMPRESSED_FULL: + return z_erofs_load_full_lcluster(m, lcn); + case EROFS_INODE_COMPRESSED_COMPACT: + return z_erofs_load_compact_lcluster(m, lcn, lookahead); + default: + return -EINVAL; + } } static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned int lookback_distance) { + struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; @@ -305,21 +301,15 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned long lcn = m->lcn - lookback_distance; int err; - /* load extent head logical cluster if needed */ - err = z_erofs_load_cluster_from_disk(m, lcn, false); + err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; switch (m->type) { case Z_EROFS_LCLUSTER_TYPE_NONHEAD: - if (!m->delta[0]) { - erofs_err(m->inode->i_sb, - "invalid lookback distance 0 @ nid %llu", - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } lookback_distance = m->delta[0]; + if (!lookback_distance) + goto err_bogus; continue; case Z_EROFS_LCLUSTER_TYPE_PLAIN: case Z_EROFS_LCLUSTER_TYPE_HEAD1: @@ -328,16 +318,15 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; default: - erofs_err(m->inode->i_sb, - "unknown type %u @ lcn %lu of nid %llu", + erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", m->type, lcn, vi->nid); DBG_BUGON(1); return -EOPNOTSUPP; } } - - erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu", - vi->nid); +err_bogus: + erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", + lookback_distance, m->lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } @@ -369,7 +358,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, if (m->compressedblks) goto out; - err = z_erofs_load_cluster_from_disk(m, lcn, false); + err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; @@ -401,9 +390,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, break; fallthrough; default: - erofs_err(m->inode->i_sb, - "cannot found CBLKCNT @ lcn %lu of nid %llu", - lcn, vi->nid); + erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, + vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } @@ -411,9 +399,7 @@ out: map->m_plen = erofs_pos(sb, m->compressedblks); return 0; err_bonus_cblkcnt: - erofs_err(m->inode->i_sb, - "bogus CBLKCNT @ lcn %lu of nid %llu", - lcn, vi->nid); + erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } @@ -434,7 +420,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) return 0; } - err = z_erofs_load_cluster_from_disk(m, lcn, true); + err = z_erofs_load_lcluster_from_disk(m, lcn, true); if (err) return err; @@ -481,7 +467,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false); + err = z_erofs_load_lcluster_from_disk(&m, initial_lcn, false); if (err) goto unmap_out; @@ -539,8 +525,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (flags & EROFS_GET_BLOCKS_FINDTAIL) { vi->z_tailextent_headlcn = m.lcn; /* for non-compact indexes, fragmentoff is 64 bits */ - if (fragment && - vi->datalayout == EROFS_INODE_COMPRESSED_FULL) + if (fragment && vi->datalayout == EROFS_INODE_COMPRESSED_FULL) vi->z_fragmentoff |= (u64)m.pblk << 32; } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { diff --git a/fs/eventfd.c b/fs/eventfd.c index 95850a13ce8d..8aa36cd37351 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -33,17 +33,17 @@ struct eventfd_ctx { /* * Every time that a write(2) is performed on an eventfd, the * value of the __u64 being written is added to "count" and a - * wakeup is performed on "wqh". A read(2) will return the "count" - * value to userspace, and will reset "count" to zero. The kernel - * side eventfd_signal() also, adds to the "count" counter and - * issue a wakeup. + * wakeup is performed on "wqh". If EFD_SEMAPHORE flag was not + * specified, a read(2) will return the "count" value to userspace, + * and will reset "count" to zero. The kernel side eventfd_signal() + * also, adds to the "count" counter and issue a wakeup. */ __u64 count; unsigned int flags; int id; }; -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask) +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) { unsigned long flags; @@ -301,6 +301,8 @@ static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) (unsigned long long)ctx->count); spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, "eventfd-id: %d\n", ctx->id); + seq_printf(m, "eventfd-semaphore: %d\n", + !!(ctx->flags & EFD_SEMAPHORE)); } #endif diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 266d45c7685b..4b1b3362f697 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -536,7 +536,7 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, - unsigned pollflags) + __poll_t pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 45b579805c95..0caf6c730ce3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3834,19 +3834,10 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, return retval; } - /* - * We need to protect against old.inode directory getting converted - * from inline directory format into a normal one. - */ - if (S_ISDIR(old.inode->i_mode)) - inode_lock_nested(old.inode, I_MUTEX_NONDIR2); - old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); - if (IS_ERR(old.bh)) { - retval = PTR_ERR(old.bh); - goto unlock_moved_dir; - } + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. @@ -4043,10 +4034,6 @@ release_bh: brelse(old.bh); brelse(new.bh); -unlock_moved_dir: - if (S_ISDIR(old.inode->i_mode)) - inode_unlock(old.inode); - return retval; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 77a71276ecb1..ad597b417fea 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -995,20 +995,12 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out; } - /* - * Copied from ext4_rename: we need to protect against old.inode - * directory getting converted from inline directory format into - * a normal one. - */ - if (S_ISDIR(old_inode->i_mode)) - inode_lock_nested(old_inode, I_MUTEX_NONDIR2); - err = -ENOENT; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) err = PTR_ERR(old_page); - goto out_unlock_old; + goto out; } if (S_ISDIR(old_inode->i_mode)) { @@ -1116,9 +1108,6 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_unlock_op(sbi); - if (S_ISDIR(old_inode->i_mode)) - inode_unlock(old_inode); - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); @@ -1133,9 +1122,6 @@ out_dir: f2fs_put_page(old_dir_page, 0); out_old: f2fs_put_page(old_page, 0); -out_unlock_old: - if (S_ISDIR(old_inode->i_mode)) - inode_unlock(old_inode); out: iput(whiteout); return err; diff --git a/fs/file_table.c b/fs/file_table.c index 372653b92617..e06c68e2d757 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -44,18 +44,40 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; +/* Container for backing file with optional real path */ +struct backing_file { + struct file file; + struct path real_path; +}; + +static inline struct backing_file *backing_file(struct file *f) +{ + return container_of(f, struct backing_file, file); +} + +struct path *backing_file_real_path(struct file *f) +{ + return &backing_file(f)->real_path; +} +EXPORT_SYMBOL_GPL(backing_file_real_path); + static void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_rcuhead); put_cred(f->f_cred); - kmem_cache_free(filp_cachep, f); + if (unlikely(f->f_mode & FMODE_BACKING)) + kfree(backing_file(f)); + else + kmem_cache_free(filp_cachep, f); } static inline void file_free(struct file *f) { security_file_free(f); - if (!(f->f_mode & FMODE_NOACCOUNT)) + if (unlikely(f->f_mode & FMODE_BACKING)) + path_put(backing_file_real_path(f)); + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); call_rcu(&f->f_rcuhead, file_free_rcu); } @@ -131,20 +153,15 @@ static int __init init_fs_stat_sysctls(void) fs_initcall(init_fs_stat_sysctls); #endif -static struct file *__alloc_file(int flags, const struct cred *cred) +static int init_file(struct file *f, int flags, const struct cred *cred) { - struct file *f; int error; - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); - if (unlikely(!f)) - return ERR_PTR(-ENOMEM); - f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { file_free_rcu(&f->f_rcuhead); - return ERR_PTR(error); + return error; } atomic_long_set(&f->f_count, 1); @@ -155,7 +172,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred) f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ - return f; + return 0; } /* Find an unused file structure and return a pointer to it. @@ -172,6 +189,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; + int error; /* * Privileged users can go above max_files @@ -185,9 +203,15 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) goto over; } - f = __alloc_file(flags, cred); - if (!IS_ERR(f)) - percpu_counter_inc(&nr_files); + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + error = init_file(f, flags, cred); + if (unlikely(error)) + return ERR_PTR(error); + + percpu_counter_inc(&nr_files); return f; @@ -203,18 +227,51 @@ over: /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * - * Should not be used unless there's a very good reason to do so. + * This is only for kernel internal use, and the allocate file must not be + * installed into file tables or such. */ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { - struct file *f = __alloc_file(flags, cred); + struct file *f; + int error; - if (!IS_ERR(f)) - f->f_mode |= FMODE_NOACCOUNT; + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + error = init_file(f, flags, cred); + if (unlikely(error)) + return ERR_PTR(error); + + f->f_mode |= FMODE_NOACCOUNT; return f; } +/* + * Variant of alloc_empty_file() that allocates a backing_file container + * and doesn't check and modify nr_files. + * + * This is only for kernel internal use, and the allocate file must not be + * installed into file tables or such. + */ +struct file *alloc_empty_backing_file(int flags, const struct cred *cred) +{ + struct backing_file *ff; + int error; + + ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); + if (unlikely(!ff)) + return ERR_PTR(-ENOMEM); + + error = init_file(&ff->file, flags, cred); + if (unlikely(error)) + return ERR_PTR(error); + + ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; + return &ff->file; +} + /** * alloc_file - allocate and initialize a 'struct file' * diff --git a/fs/fs_context.c b/fs/fs_context.c index 24ce12f0db32..851214d1d013 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -561,7 +561,8 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) return -ENOMEM; } - ctx->legacy_data[size++] = ','; + if (size) + ctx->legacy_data[size++] = ','; len = strlen(param->key); memcpy(ctx->legacy_data + size, param->key, len); size += len; diff --git a/fs/inode.c b/fs/inode.c index 577799b7855f..53ae3b76d232 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1104,9 +1104,51 @@ void discard_new_inode(struct inode *inode) EXPORT_SYMBOL(discard_new_inode); /** + * lock_two_inodes - lock two inodes (may be regular files but also dirs) + * + * Lock any non-NULL argument. The caller must make sure that if he is passing + * in two directories, one is not ancestor of the other. Zero, one or two + * objects may be locked by this function. + * + * @inode1: first inode to lock + * @inode2: second inode to lock + * @subclass1: inode lock subclass for the first lock obtained + * @subclass2: inode lock subclass for the second lock obtained + */ +void lock_two_inodes(struct inode *inode1, struct inode *inode2, + unsigned subclass1, unsigned subclass2) +{ + if (!inode1 || !inode2) { + /* + * Make sure @subclass1 will be used for the acquired lock. + * This is not strictly necessary (no current caller cares) but + * let's keep things consistent. + */ + if (!inode1) + swap(inode1, inode2); + goto lock; + } + + /* + * If one object is directory and the other is not, we must make sure + * to lock directory first as the other object may be its child. + */ + if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) { + if (inode1 > inode2) + swap(inode1, inode2); + } else if (!S_ISDIR(inode1->i_mode)) + swap(inode1, inode2); +lock: + if (inode1) + inode_lock_nested(inode1, subclass1); + if (inode2 && inode2 != inode1) + inode_lock_nested(inode2, subclass2); +} + +/** * lock_two_nondirectories - take two i_mutexes on non-directory objects * - * Lock any non-NULL argument that is not a directory. + * Lock any non-NULL argument. Passed objects must not be directories. * Zero, one or two objects may be locked by this function. * * @inode1: first inode to lock @@ -1114,13 +1156,9 @@ EXPORT_SYMBOL(discard_new_inode); */ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) { - if (inode1 > inode2) - swap(inode1, inode2); - - if (inode1 && !S_ISDIR(inode1->i_mode)) - inode_lock(inode1); - if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - inode_lock_nested(inode2, I_MUTEX_NONDIR2); + WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); + WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); + lock_two_inodes(inode1, inode2, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); @@ -1131,10 +1169,14 @@ EXPORT_SYMBOL(lock_two_nondirectories); */ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { - if (inode1 && !S_ISDIR(inode1->i_mode)) + if (inode1) { + WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); inode_unlock(inode1); - if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + } + if (inode2 && inode2 != inode1) { + WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); inode_unlock(inode2); + } } EXPORT_SYMBOL(unlock_two_nondirectories); diff --git a/fs/internal.h b/fs/internal.h index bd3b2810a36b..f7a3dc111026 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -97,8 +97,9 @@ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ -extern struct file *alloc_empty_file(int, const struct cred *); -extern struct file *alloc_empty_file_noaccount(int, const struct cred *); +struct file *alloc_empty_file(int flags, const struct cred *cred); +struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); +struct file *alloc_empty_backing_file(int flags, const struct cred *cred); static inline void put_file_access(struct file *file) { @@ -121,6 +122,47 @@ extern bool mount_capable(struct fs_context *); int sb_init_dio_done_wq(struct super_block *sb); /* + * Prepare superblock for changing its read-only state (i.e., either remount + * read-write superblock read-only or vice versa). After this function returns + * mnt_is_readonly() will return true for any mount of the superblock if its + * caller is able to observe any changes done by the remount. This holds until + * sb_end_ro_state_change() is called. + */ +static inline void sb_start_ro_state_change(struct super_block *sb) +{ + WRITE_ONCE(sb->s_readonly_remount, 1); + /* + * For RO->RW transition, the barrier pairs with the barrier in + * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY + * cleared, it will see s_readonly_remount set. + * For RW->RO transition, the barrier pairs with the barrier in + * __mnt_want_write() before the mnt_is_readonly() check. The barrier + * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already + * cleared, it will see s_readonly_remount set. + */ + smp_wmb(); +} + +/* + * Ends section changing read-only state of the superblock. After this function + * returns if mnt_is_readonly() returns false, the caller will be able to + * observe all the changes remount did to the superblock. + */ +static inline void sb_end_ro_state_change(struct super_block *sb) +{ + /* + * This barrier provides release semantics that pairs with + * the smp_rmb() acquire semantics in mnt_is_readonly(). + * This barrier pair ensure that when mnt_is_readonly() sees + * 0 for sb->s_readonly_remount, it will also see all the + * preceding flag changes that were made during the RO state + * change. + */ + smp_wmb(); + WRITE_ONCE(sb->s_readonly_remount, 0); +} + +/* * open.c */ struct open_flags { @@ -152,6 +194,8 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); +void lock_two_inodes(struct inode *inode1, struct inode *inode2, + unsigned subclass1, unsigned subclass2); /* * fs-writeback.c diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 837cd55fd4c5..6ae9d6fefb86 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -211,7 +211,10 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) ic->scan_dents = NULL; cond_resched(); } - jffs2_build_xattr_subsystem(c); + ret = jffs2_build_xattr_subsystem(c); + if (ret) + goto exit; + c->flags &= ~JFFS2_SB_FLAG_BUILDING; dbg_fsbuild("FS build complete\n"); diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index aa4048a27f31..3b6bdc9a49e1 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -772,10 +772,10 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c) } #define XREF_TMPHASH_SIZE (128) -void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) +int jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) { struct jffs2_xattr_ref *ref, *_ref; - struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE]; + struct jffs2_xattr_ref **xref_tmphash; struct jffs2_xattr_datum *xd, *_xd; struct jffs2_inode_cache *ic; struct jffs2_raw_node_ref *raw; @@ -784,9 +784,12 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING)); + xref_tmphash = kcalloc(XREF_TMPHASH_SIZE, + sizeof(struct jffs2_xattr_ref *), GFP_KERNEL); + if (!xref_tmphash) + return -ENOMEM; + /* Phase.1 : Merge same xref */ - for (i=0; i < XREF_TMPHASH_SIZE; i++) - xref_tmphash[i] = NULL; for (ref=c->xref_temp; ref; ref=_ref) { struct jffs2_xattr_ref *tmp; @@ -884,6 +887,8 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) "%u of xref (%u dead, %u orphan) found.\n", xdatum_count, xdatum_unchecked_count, xdatum_orphan_count, xref_count, xref_dead_count, xref_orphan_count); + kfree(xref_tmphash); + return 0; } struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index 720007b2fd65..1b5030a3349d 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -71,7 +71,7 @@ static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref) #ifdef CONFIG_JFFS2_FS_XATTR extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c); -extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c); +extern int jffs2_build_xattr_subsystem(struct jffs2_sb_info *c); extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c); extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, @@ -103,7 +103,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #else #define jffs2_init_xattr_subsystem(c) -#define jffs2_build_xattr_subsystem(c) +#define jffs2_build_xattr_subsystem(c) (0) #define jffs2_clear_xattr_subsystem(c) #define jffs2_xattr_do_crccheck_inode(c, ic) diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index b29d68b5eec5..494b9f4043cf 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -876,7 +876,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, tid_t tid; ino_t ino = 0; struct component_name dname; - int ssize; /* source pathname size */ + u32 ssize; /* source pathname size */ struct btstack btstack; struct inode *ip = d_inode(dentry); s64 xlen = 0; @@ -957,7 +957,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, if (ssize > sizeof (JFS_IP(ip)->i_inline)) JFS_IP(ip)->mode2 &= ~INLINEEA; - jfs_info("jfs_symlink: fast symlink added ssize:%d name:%s ", + jfs_info("jfs_symlink: fast symlink added ssize:%u name:%s ", ssize, name); } /* @@ -987,7 +987,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, ip->i_size = ssize - 1; while (ssize) { /* This is kind of silly since PATH_MAX == 4K */ - int copy_size = min(ssize, PSIZE); + u32 copy_size = min_t(u32, ssize, PSIZE); mp = get_metapage(ip, xaddr, PSIZE, 1); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 04ba95b83d16..22d3ff3818f5 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -355,7 +355,6 @@ static int lockd_get(void) int error; if (nlmsvc_serv) { - svc_get(nlmsvc_serv); nlmsvc_users++; return 0; } diff --git a/fs/namei.c b/fs/namei.c index e4fe0879ae55..91171da719c5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3028,8 +3028,8 @@ static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) return p; } - inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); - inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + lock_two_inodes(p1->d_inode, p2->d_inode, + I_MUTEX_PARENT, I_MUTEX_PARENT2); return NULL; } @@ -3703,7 +3703,7 @@ static int vfs_tmpfile(struct mnt_idmap *idmap, } /** - * vfs_tmpfile_open - open a tmpfile for kernel internal use + * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile @@ -3714,24 +3714,26 @@ static int vfs_tmpfile(struct mnt_idmap *idmap, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ -struct file *vfs_tmpfile_open(struct mnt_idmap *idmap, - const struct path *parentpath, - umode_t mode, int open_flag, const struct cred *cred) +struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, + const struct path *parentpath, + umode_t mode, int open_flag, + const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); - if (!IS_ERR(file)) { - error = vfs_tmpfile(idmap, parentpath, file, mode); - if (error) { - fput(file); - file = ERR_PTR(error); - } + if (IS_ERR(file)) + return file; + + error = vfs_tmpfile(idmap, parentpath, file, mode); + if (error) { + fput(file); + file = ERR_PTR(error); } return file; } -EXPORT_SYMBOL(vfs_tmpfile_open); +EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, @@ -4731,7 +4733,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we have to lock _four_ objects - parents and victim (if it exists), - * and source (if it is not a directory). + * and source. * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change @@ -4815,10 +4817,16 @@ int vfs_rename(struct renamedata *rd) take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); - if (!is_dir || (flags & RENAME_EXCHANGE)) - lock_two_nondirectories(source, target); - else if (target) - inode_lock(target); + /* + * Lock all moved children. Moved directories may need to change parent + * pointer so they need the lock to prevent against concurrent + * directory changes moving parent pointer. For regular files we've + * historically always done this. The lockdep locking subclasses are + * somewhat arbitrary but RENAME_EXCHANGE in particular can swap + * regular files and directories so it's difficult to tell which + * subclasses to use. + */ + lock_two_inodes(source, target, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) @@ -4866,9 +4874,9 @@ int vfs_rename(struct renamedata *rd) d_exchange(old_dentry, new_dentry); } out: - if (!is_dir || (flags & RENAME_EXCHANGE)) - unlock_two_nondirectories(source, target); - else if (target) + if (source) + inode_unlock(source); + if (target) inode_unlock(target); dput(new_dentry); if (!error) { diff --git a/fs/namespace.c b/fs/namespace.c index 54847db5b819..e157efc54023 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -309,9 +309,16 @@ static unsigned int mnt_get_writers(struct mount *mnt) static int mnt_is_readonly(struct vfsmount *mnt) { - if (mnt->mnt_sb->s_readonly_remount) + if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; - /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ + /* + * The barrier pairs with the barrier in sb_start_ro_state_change() + * making sure if we don't see s_readonly_remount set yet, we also will + * not see any superblock / mount flag changes done by remount. + * It also pairs with the barrier in sb_end_ro_state_change() + * assuring that if we see s_readonly_remount already cleared, we will + * see the values of superblock / mount flags updated by remount. + */ smp_rmb(); return __mnt_is_readonly(mnt); } @@ -364,9 +371,11 @@ int __mnt_want_write(struct vfsmount *m) } } /* - * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will - * be set to match its requirements. So we must not load that until - * MNT_WRITE_HOLD is cleared. + * The barrier pairs with the barrier sb_start_ro_state_change() making + * sure that if we see MNT_WRITE_HOLD cleared, we will also see + * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in + * mnt_is_readonly() and bail in case we are racing with remount + * read-only. */ smp_rmb(); if (mnt_is_readonly(m)) { @@ -588,10 +597,8 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err && atomic_long_read(&sb->s_remove_count)) err = -EBUSY; - if (!err) { - sb->s_readonly_remount = 1; - smp_wmb(); - } + if (!err) + sb_start_ro_state_change(sb); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; @@ -658,9 +665,25 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) return false; } -/* - * find the first mount at @dentry on vfsmount @mnt. - * call under rcu_read_lock() +/** + * __lookup_mnt - find first child mount + * @mnt: parent mount + * @dentry: mountpoint + * + * If @mnt has a child mount @c mounted @dentry find and return it. + * + * Note that the child mount @c need not be unique. There are cases + * where shadow mounts are created. For example, during mount + * propagation when a source mount @mnt whose root got overmounted by a + * mount @o after path lookup but before @namespace_sem could be + * acquired gets copied and propagated. So @mnt gets copied including + * @o. When @mnt is propagated to a destination mount @d that already + * has another mount @n mounted at the same mountpoint then the source + * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on + * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt + * on @dentry. + * + * Return: The first child of @mnt mounted @dentry or NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { @@ -910,6 +933,33 @@ void mnt_set_mountpoint(struct mount *mnt, hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } +/** + * mnt_set_mountpoint_beneath - mount a mount beneath another one + * + * @new_parent: the source mount + * @top_mnt: the mount beneath which @new_parent is mounted + * @new_mp: the new mountpoint of @top_mnt on @new_parent + * + * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and + * parent @top_mnt->mnt_parent and mount it on top of @new_parent at + * @new_mp. And mount @new_parent on the old parent and old + * mountpoint of @top_mnt. + * + * Context: This function expects namespace_lock() and lock_mount_hash() + * to have been acquired in that order. + */ +static void mnt_set_mountpoint_beneath(struct mount *new_parent, + struct mount *top_mnt, + struct mountpoint *new_mp) +{ + struct mount *old_top_parent = top_mnt->mnt_parent; + struct mountpoint *old_top_mp = top_mnt->mnt_mp; + + mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent); + mnt_change_mountpoint(new_parent, new_mp, top_mnt); +} + + static void __attach_mnt(struct mount *mnt, struct mount *parent) { hlist_add_head_rcu(&mnt->mnt_hash, @@ -917,15 +967,42 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent) list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } -/* - * vfsmount lock must be held for write +/** + * attach_mnt - mount a mount, attach to @mount_hashtable and parent's + * list of child mounts + * @parent: the parent + * @mnt: the new mount + * @mp: the new mountpoint + * @beneath: whether to mount @mnt beneath or on top of @parent + * + * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt + * to @parent's child mount list and to @mount_hashtable. + * + * If @beneath is true, remove @mnt from its current parent and + * mountpoint and mount it on @mp on @parent, and mount @parent on the + * old parent and old mountpoint of @mnt. Finally, attach @parent to + * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. + * + * Note, when __attach_mnt() is called @mnt->mnt_parent already points + * to the correct parent. + * + * Context: This function expects namespace_lock() and lock_mount_hash() + * to have been acquired in that order. */ -static void attach_mnt(struct mount *mnt, - struct mount *parent, - struct mountpoint *mp) +static void attach_mnt(struct mount *mnt, struct mount *parent, + struct mountpoint *mp, bool beneath) { - mnt_set_mountpoint(parent, mp, mnt); - __attach_mnt(mnt, parent); + if (beneath) + mnt_set_mountpoint_beneath(mnt, parent, mp); + else + mnt_set_mountpoint(parent, mp, mnt); + /* + * Note, @mnt->mnt_parent has to be used. If @mnt was mounted + * beneath @parent then @mnt will need to be attached to + * @parent's old parent, not @parent. IOW, @mnt->mnt_parent + * isn't the same mount as @parent. + */ + __attach_mnt(mnt, mnt->mnt_parent); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) @@ -937,7 +1014,7 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); - attach_mnt(mnt, parent, mp); + attach_mnt(mnt, parent, mp, false); put_mountpoint(old_mp); mnt_add_count(old_parent, -1); @@ -1767,6 +1844,19 @@ bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } +/** + * path_mounted - check whether path is mounted + * @path: path to check + * + * Determine whether @path refers to the root of a mount. + * + * Return: true if @path is the root of a mount, false if not. + */ +static inline bool path_mounted(const struct path *path) +{ + return path->mnt->mnt_root == path->dentry; +} + static void warn_mandlock(void) { pr_warn_once("=======================================================\n" @@ -1782,7 +1872,7 @@ static int can_umount(const struct path *path, int flags) if (!may_mount()) return -EPERM; - if (path->dentry != path->mnt->mnt_root) + if (!path_mounted(path)) return -EINVAL; if (!check_mnt(mnt)) return -EINVAL; @@ -1925,7 +2015,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, parent, p->mnt_mp); + attach_mnt(q, parent, p->mnt_mp, false); unlock_mount_hash(); } } @@ -2134,12 +2224,17 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) return 0; } -/* - * @source_mnt : mount tree to be attached - * @nd : place the mount tree @source_mnt is attached - * @parent_nd : if non-null, detach the source_mnt from its parent and - * store the parent mount and mountpoint dentry. - * (done when source_mnt is moved) +enum mnt_tree_flags_t { + MNT_TREE_MOVE = BIT(0), + MNT_TREE_BENEATH = BIT(1), +}; + +/** + * attach_recursive_mnt - attach a source mount tree + * @source_mnt: mount tree to be attached + * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath + * @dest_mp: the mountpoint @source_mnt will be mounted at + * @flags: modify how @source_mnt is supposed to be attached * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2196,22 +2291,28 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) * applied to each mount in the tree. * Must be called without spinlocks held, since this function can sleep * in allocations. + * + * Context: The function expects namespace_lock() to be held. + * Return: If @source_mnt was successfully attached 0 is returned. + * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *dest_mnt, - struct mountpoint *dest_mp, - bool moving) + struct mount *top_mnt, + struct mountpoint *dest_mp, + enum mnt_tree_flags_t flags) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); - struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct mnt_namespace *ns = top_mnt->mnt_ns; struct mountpoint *smp; - struct mount *child, *p; + struct mount *child, *dest_mnt, *p; struct hlist_node *n; - int err; + int err = 0; + bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH; - /* Preallocate a mountpoint in case the new mounts need - * to be tucked under other mounts. + /* + * Preallocate a mountpoint in case the new mounts need to be + * mounted beneath mounts on the same mountpoint. */ smp = get_mountpoint(source_mnt->mnt.mnt_root); if (IS_ERR(smp)) @@ -2224,29 +2325,41 @@ static int attach_recursive_mnt(struct mount *source_mnt, goto out; } + if (beneath) + dest_mnt = top_mnt->mnt_parent; + else + dest_mnt = top_mnt; + if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); - lock_mount_hash(); - if (err) - goto out_cleanup_ids; + } + lock_mount_hash(); + if (err) + goto out_cleanup_ids; + + if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); - } else { - lock_mount_hash(); } + if (moving) { + if (beneath) + dest_mp = smp; unhash_mnt(source_mnt); - attach_mnt(source_mnt, dest_mnt, dest_mp); + attach_mnt(source_mnt, top_mnt, dest_mp, beneath); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { /* move from anon - the caller will destroy */ list_del_init(&source_mnt->mnt_ns->list); } - mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + if (beneath) + mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); + else + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } @@ -2286,33 +2399,101 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } -static struct mountpoint *lock_mount(struct path *path) +/** + * do_lock_mount - lock mount and mountpoint + * @path: target path + * @beneath: whether the intention is to mount beneath @path + * + * Follow the mount stack on @path until the top mount @mnt is found. If + * the initial @path->{mnt,dentry} is a mountpoint lookup the first + * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root} + * until nothing is stacked on top of it anymore. + * + * Acquire the inode_lock() on the top mount's ->mnt_root to protect + * against concurrent removal of the new mountpoint from another mount + * namespace. + * + * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint + * @mp on @mnt->mnt_parent must be acquired. This protects against a + * concurrent unlink of @mp->mnt_dentry from another mount namespace + * where @mnt doesn't have a child mount mounted @mp. A concurrent + * removal of @mnt->mnt_root doesn't matter as nothing will be mounted + * on top of it for @beneath. + * + * In addition, @beneath needs to make sure that @mnt hasn't been + * unmounted or moved from its current mountpoint in between dropping + * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt + * being unmounted would be detected later by e.g., calling + * check_mnt(mnt) in the function it's called from. For the @beneath + * case however, it's useful to detect it directly in do_lock_mount(). + * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points + * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will + * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. + * + * Return: Either the target mountpoint on the top mount or the top + * mount's mountpoint. + */ +static struct mountpoint *do_lock_mount(struct path *path, bool beneath) { - struct vfsmount *mnt; - struct dentry *dentry = path->dentry; -retry: - inode_lock(dentry->d_inode); - if (unlikely(cant_mount(dentry))) { - inode_unlock(dentry->d_inode); - return ERR_PTR(-ENOENT); - } - namespace_lock(); - mnt = lookup_mnt(path); - if (likely(!mnt)) { - struct mountpoint *mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { + struct vfsmount *mnt = path->mnt; + struct dentry *dentry; + struct mountpoint *mp = ERR_PTR(-ENOENT); + + for (;;) { + struct mount *m; + + if (beneath) { + m = real_mount(mnt); + read_seqlock_excl(&mount_lock); + dentry = dget(m->mnt_mountpoint); + read_sequnlock_excl(&mount_lock); + } else { + dentry = path->dentry; + } + + inode_lock(dentry->d_inode); + if (unlikely(cant_mount(dentry))) { + inode_unlock(dentry->d_inode); + goto out; + } + + namespace_lock(); + + if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) { namespace_unlock(); inode_unlock(dentry->d_inode); - return mp; + goto out; } - return mp; + + mnt = lookup_mnt(path); + if (likely(!mnt)) + break; + + namespace_unlock(); + inode_unlock(dentry->d_inode); + if (beneath) + dput(dentry); + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); } - namespace_unlock(); - inode_unlock(path->dentry->d_inode); - path_put(path); - path->mnt = mnt; - dentry = path->dentry = dget(mnt->mnt_root); - goto retry; + + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + } + +out: + if (beneath) + dput(dentry); + + return mp; +} + +static inline struct mountpoint *lock_mount(struct path *path) +{ + return do_lock_mount(path, false); } static void unlock_mount(struct mountpoint *where) @@ -2336,7 +2517,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp, false); + return attach_recursive_mnt(mnt, p, mp, 0); } /* @@ -2367,7 +2548,7 @@ static int do_change_type(struct path *path, int ms_flags) int type; int err = 0; - if (path->dentry != path->mnt->mnt_root) + if (!path_mounted(path)) return -EINVAL; type = flags_to_propagation_type(ms_flags); @@ -2643,7 +2824,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) if (!check_mnt(mnt)) return -EINVAL; - if (path->dentry != mnt->mnt.mnt_root) + if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) @@ -2682,7 +2863,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (!check_mnt(mnt)) return -EINVAL; - if (path->dentry != path->mnt->mnt_root) + if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) @@ -2772,9 +2953,9 @@ static int do_set_group(struct path *from_path, struct path *to_path) err = -EINVAL; /* To and From paths should be mount roots */ - if (from_path->dentry != from_path->mnt->mnt_root) + if (!path_mounted(from_path)) goto out; - if (to_path->dentry != to_path->mnt->mnt_root) + if (!path_mounted(to_path)) goto out; /* Setting sharing groups is only allowed across same superblock */ @@ -2818,7 +2999,110 @@ out: return err; } -static int do_move_mount(struct path *old_path, struct path *new_path) +/** + * path_overmounted - check if path is overmounted + * @path: path to check + * + * Check if path is overmounted, i.e., if there's a mount on top of + * @path->mnt with @path->dentry as mountpoint. + * + * Context: This function expects namespace_lock() to be held. + * Return: If path is overmounted true is returned, false if not. + */ +static inline bool path_overmounted(const struct path *path) +{ + rcu_read_lock(); + if (unlikely(__lookup_mnt(path->mnt, path->dentry))) { + rcu_read_unlock(); + return true; + } + rcu_read_unlock(); + return false; +} + +/** + * can_move_mount_beneath - check that we can mount beneath the top mount + * @from: mount to mount beneath + * @to: mount under which to mount + * + * - Make sure that @to->dentry is actually the root of a mount under + * which we can mount another mount. + * - Make sure that nothing can be mounted beneath the caller's current + * root or the rootfs of the namespace. + * - Make sure that the caller can unmount the topmost mount ensuring + * that the caller could reveal the underlying mountpoint. + * - Ensure that nothing has been mounted on top of @from before we + * grabbed @namespace_sem to avoid creating pointless shadow mounts. + * - Prevent mounting beneath a mount if the propagation relationship + * between the source mount, parent mount, and top mount would lead to + * nonsensical mount trees. + * + * Context: This function expects namespace_lock() to be held. + * Return: On success 0, and on error a negative error code is returned. + */ +static int can_move_mount_beneath(const struct path *from, + const struct path *to, + const struct mountpoint *mp) +{ + struct mount *mnt_from = real_mount(from->mnt), + *mnt_to = real_mount(to->mnt), + *parent_mnt_to = mnt_to->mnt_parent; + + if (!mnt_has_parent(mnt_to)) + return -EINVAL; + + if (!path_mounted(to)) + return -EINVAL; + + if (IS_MNT_LOCKED(mnt_to)) + return -EINVAL; + + /* Avoid creating shadow mounts during mount propagation. */ + if (path_overmounted(from)) + return -EINVAL; + + /* + * Mounting beneath the rootfs only makes sense when the + * semantics of pivot_root(".", ".") are used. + */ + if (&mnt_to->mnt == current->fs->root.mnt) + return -EINVAL; + if (parent_mnt_to == current->nsproxy->mnt_ns->root) + return -EINVAL; + + for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent) + if (p == mnt_to) + return -EINVAL; + + /* + * If the parent mount propagates to the child mount this would + * mean mounting @mnt_from on @mnt_to->mnt_parent and then + * propagating a copy @c of @mnt_from on top of @mnt_to. This + * defeats the whole purpose of mounting beneath another mount. + */ + if (propagation_would_overmount(parent_mnt_to, mnt_to, mp)) + return -EINVAL; + + /* + * If @mnt_to->mnt_parent propagates to @mnt_from this would + * mean propagating a copy @c of @mnt_from on top of @mnt_from. + * Afterwards @mnt_from would be mounted on top of + * @mnt_to->mnt_parent and @mnt_to would be unmounted from + * @mnt->mnt_parent and remounted on @mnt_from. But since @c is + * already mounted on @mnt_from, @mnt_to would ultimately be + * remounted on top of @c. Afterwards, @mnt_from would be + * covered by a copy @c of @mnt_from and @c would be covered by + * @mnt_from itself. This defeats the whole purpose of mounting + * @mnt_from beneath @mnt_to. + */ + if (propagation_would_overmount(parent_mnt_to, mnt_from, mp)) + return -EINVAL; + + return 0; +} + +static int do_move_mount(struct path *old_path, struct path *new_path, + bool beneath) { struct mnt_namespace *ns; struct mount *p; @@ -2827,8 +3111,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path) struct mountpoint *mp, *old_mp; int err; bool attached; + enum mnt_tree_flags_t flags = 0; - mp = lock_mount(new_path); + mp = do_lock_mount(new_path, beneath); if (IS_ERR(mp)) return PTR_ERR(mp); @@ -2836,6 +3121,8 @@ static int do_move_mount(struct path *old_path, struct path *new_path) p = real_mount(new_path->mnt); parent = old->mnt_parent; attached = mnt_has_parent(old); + if (attached) + flags |= MNT_TREE_MOVE; old_mp = old->mnt_mp; ns = old->mnt_ns; @@ -2855,7 +3142,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path) if (old->mnt.mnt_flags & MNT_LOCKED) goto out; - if (old_path->dentry != old_path->mnt->mnt_root) + if (!path_mounted(old_path)) goto out; if (d_is_dir(new_path->dentry) != @@ -2866,6 +3153,17 @@ static int do_move_mount(struct path *old_path, struct path *new_path) */ if (attached && IS_MNT_SHARED(parent)) goto out; + + if (beneath) { + err = can_move_mount_beneath(old_path, new_path, mp); + if (err) + goto out; + + err = -EINVAL; + p = p->mnt_parent; + flags |= MNT_TREE_BENEATH; + } + /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. @@ -2879,8 +3177,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path) if (p == old) goto out; - err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, - attached); + err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags); if (err) goto out; @@ -2912,7 +3209,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path); + err = do_move_mount(&old_path, path, false); path_put(&old_path); return err; } @@ -2937,8 +3234,7 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, } /* Refuse the same filesystem on the same mount point */ - if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && - path->mnt->mnt_root == path->dentry) + if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path)) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) @@ -3079,13 +3375,10 @@ int finish_automount(struct vfsmount *m, const struct path *path) err = -ENOENT; goto discard_locked; } - rcu_read_lock(); - if (unlikely(__lookup_mnt(path->mnt, dentry))) { - rcu_read_unlock(); + if (path_overmounted(path)) { err = 0; goto discard_locked; } - rcu_read_unlock(); mp = get_mountpoint(dentry); if (IS_ERR(mp)) { err = PTR_ERR(mp); @@ -3777,6 +4070,10 @@ SYSCALL_DEFINE5(move_mount, if (flags & ~MOVE_MOUNT__MASK) return -EINVAL; + if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) == + (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) + return -EINVAL; + /* If someone gives a pathname, they aren't permitted to move * from an fd that requires unmount as we can't get at the flag * to clear it afterwards. @@ -3806,7 +4103,8 @@ SYSCALL_DEFINE5(move_mount, if (flags & MOVE_MOUNT_SET_GROUP) ret = do_set_group(&from_path, &to_path); else - ret = do_move_mount(&from_path, &to_path); + ret = do_move_mount(&from_path, &to_path, + (flags & MOVE_MOUNT_BENEATH)); out_to: path_put(&to_path); @@ -3917,11 +4215,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (new_mnt == root_mnt || old_mnt == root_mnt) goto out4; /* loop, on the same file system */ error = -EINVAL; - if (root.mnt->mnt_root != root.dentry) + if (!path_mounted(&root)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ - if (new.mnt->mnt_root != new.dentry) + if (!path_mounted(&new)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ @@ -3939,9 +4237,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } /* mount old root on put_old */ - attach_mnt(root_mnt, old_mnt, old_mp); + attach_mnt(root_mnt, old_mnt, old_mp, false); /* mount new_root on / */ - attach_mnt(new_mnt, root_parent, root_mp); + attach_mnt(new_mnt, root_parent, root_mp, false); mnt_add_count(root_parent, -1); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ @@ -4124,7 +4422,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) struct mount *mnt = real_mount(path->mnt); int err = 0; - if (path->dentry != mnt->mnt.mnt_root) + if (!path_mounted(path)) return -EINVAL; if (kattr->mnt_userns) { diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index f21259ead64b..4c9b87850ab1 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -80,6 +80,8 @@ enum { int nfsd_drc_slab_create(void); void nfsd_drc_slab_free(void); +int nfsd_net_reply_cache_init(struct nfsd_net *nn); +void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); int nfsd_cache_lookup(struct svc_rqst *); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index ae85257b4238..11a0eaa2f914 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -97,7 +97,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) goto out; err = -EINVAL; - if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; @@ -107,7 +107,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) dprintk("found domain %s\n", buf); err = -EINVAL; - if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; fsidtype = simple_strtoul(buf, &ep, 10); if (*ep) @@ -593,7 +593,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ char *buf; - int len; int err; struct auth_domain *dom = NULL; struct svc_export exp = {}, *expp; @@ -609,8 +608,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) /* client */ err = -EINVAL; - len = qword_get(&mesg, buf, PAGE_SIZE); - if (len <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; @@ -620,7 +618,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) /* path */ err = -EINVAL; - if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out1; err = kern_path(buf, 0, &exp.ex_path); @@ -665,7 +663,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out3; exp.ex_fsid = an_int; - while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { + while (qword_get(&mesg, buf, PAGE_SIZE) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index e6bb8eeb5bc2..fc8d5b7db9f8 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -151,8 +151,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp) { struct nfsd3_readargs *argp = rqstp->rq_argp; struct nfsd3_readres *resp = rqstp->rq_resp; - unsigned int len; - int v; dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), @@ -166,17 +164,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp) if (argp->offset + argp->count > (u64)OFFSET_MAX) argp->count = (u64)OFFSET_MAX - argp->offset; - v = 0; - len = argp->count; resp->pages = rqstp->rq_next_page; - while (len > 0) { - struct page *page = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(page); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } /* Obtain buffer pointer for payload. * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) @@ -187,7 +175,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, v, &resp->count, &resp->eof); + &resp->count, &resp->eof); return rpc_success; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 3308dd671ef0..f32128955ec8 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -828,7 +828,8 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (xdr_stream_encode_u32(xdr, resp->len) < 0) return false; - xdr_write_pages(xdr, resp->pages, 0, resp->len); + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, 0, + resp->len); if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) return false; break; @@ -859,8 +860,9 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (xdr_stream_encode_u32(xdr, resp->count) < 0) return false; - xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base, - resp->count); + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, + rqstp->rq_res.page_base, + resp->count); if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) return false; break; @@ -961,7 +963,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (!svcxdr_encode_cookieverf3(xdr, resp->verf)) return false; - xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len); + svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0, + dirlist->len); /* no more entries */ if (xdr_stream_encode_item_absent(xdr) < 0) return false; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 76db2fe29624..26b1343c8035 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2541,6 +2541,20 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, return p; } +static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr, + struct timespec64 *tv) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 3); + if (!p) + return nfserr_resource; + + p = xdr_encode_hyper(p, (s64)tv->tv_sec); + *p = cpu_to_be32(tv->tv_nsec); + return nfs_ok; +} + /* * ctime (in NFSv4, time_metadata) is not writeable, and the client * doesn't really care what resolution could theoretically be stored by @@ -2566,12 +2580,16 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode) return p; } -static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) +static __be32 +nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c) { - *p++ = cpu_to_be32(c->atomic); - p = xdr_encode_hyper(p, c->before_change); - p = xdr_encode_hyper(p, c->after_change); - return p; + if (xdr_stream_encode_bool(xdr, c->atomic) < 0) + return nfserr_resource; + if (xdr_stream_encode_u64(xdr, c->before_change) < 0) + return nfserr_resource; + if (xdr_stream_encode_u64(xdr, c->after_change) < 0) + return nfserr_resource; + return nfs_ok; } /* Encode as an array of strings the string given with components @@ -3348,11 +3366,9 @@ out_acl: p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec); - *p++ = cpu_to_be32(stat.atime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.atime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_TIME_DELTA) { p = xdr_reserve_space(xdr, 12); @@ -3361,25 +3377,19 @@ out_acl: p = encode_time_delta(p, d_inode(dentry)); } if (bmval1 & FATTR4_WORD1_TIME_METADATA) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec); - *p++ = cpu_to_be32(stat.ctime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.ctime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec); - *p++ = cpu_to_be32(stat.mtime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.mtime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_TIME_CREATE) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec); - *p++ = cpu_to_be32(stat.btime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.btime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { u64 ino = stat.ino; @@ -3689,6 +3699,30 @@ fail: } static __be32 +nfsd4_encode_verifier4(struct xdr_stream *xdr, const nfs4_verifier *verf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + memcpy(p, verf->data, sizeof(verf->data)); + return nfs_ok; +} + +static __be32 +nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(__be64)); + if (!p) + return nfserr_resource; + memcpy(p, clientid, sizeof(*clientid)); + return nfs_ok; +} + +static __be32 nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) { __be32 *p; @@ -3752,15 +3786,8 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_commit *commit = &u->commit; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, commit->co_verf.data, - NFS4_VERIFIER_SIZE); - return 0; + return nfsd4_encode_verifier4(resp->xdr, &commit->co_verf); } static __be32 @@ -3769,12 +3796,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_create *create = &u->create; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - encode_cinfo(p, &create->cr_cinfo); + nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo); + if (nfserr) + return nfserr; return nfsd4_encode_bitmap(xdr, create->cr_bmval[0], create->cr_bmval[1], create->cr_bmval[2]); } @@ -3892,13 +3917,8 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_link *link = &u->link; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &link->li_cinfo); - return 0; + return nfsd4_encode_change_info4(xdr, &link->li_cinfo); } @@ -3913,11 +3933,11 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); if (nfserr) return nfserr; - p = xdr_reserve_space(xdr, 24); - if (!p) + nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo); + if (nfserr) + return nfserr; + if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0) return nfserr_resource; - p = encode_cinfo(p, &open->op_cinfo); - *p++ = cpu_to_be32(open->op_rflags); nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1], open->op_bmval[2]); @@ -3956,7 +3976,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, p = xdr_reserve_space(xdr, 32); if (!p) return nfserr_resource; - *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(open->op_recall); /* * TODO: space_limit's in delegations @@ -4018,6 +4038,11 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, return nfsd4_encode_stateid(xdr, &od->od_stateid); } +/* + * The operation of this function assumes that this is the only + * READ operation in the COMPOUND. If there are multiple READs, + * we use nfsd4_encode_readv(). + */ static __be32 nfsd4_encode_splice_read( struct nfsd4_compoundres *resp, struct nfsd4_read *read, @@ -4028,8 +4053,12 @@ static __be32 nfsd4_encode_splice_read( int status, space_left; __be32 nfserr; - /* Make sure there will be room for padding if needed */ - if (xdr->end - xdr->p < 1) + /* + * Make sure there is room at the end of buf->head for + * svcxdr_encode_opaque_pages() to create a tail buffer + * to XDR-pad the payload. + */ + if (xdr->iov != xdr->buf->head || xdr->end - xdr->p < 1) return nfserr_resource; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, @@ -4038,6 +4067,8 @@ static __be32 nfsd4_encode_splice_read( read->rd_length = maxcount; if (nfserr) goto out_err; + svcxdr_encode_opaque_pages(read->rd_rqstp, xdr, buf->pages, + buf->page_base, maxcount); status = svc_encode_result_payload(read->rd_rqstp, buf->head[0].iov_len, maxcount); if (status) { @@ -4045,31 +4076,19 @@ static __be32 nfsd4_encode_splice_read( goto out_err; } - buf->page_len = maxcount; - buf->len += maxcount; - xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1) - / PAGE_SIZE; - - /* Use rest of head for padding and remaining ops: */ - buf->tail[0].iov_base = xdr->p; - buf->tail[0].iov_len = 0; - xdr->iov = buf->tail; - if (maxcount&3) { - int pad = 4 - (maxcount&3); - - *(xdr->p++) = 0; - - buf->tail[0].iov_base += maxcount&3; - buf->tail[0].iov_len = pad; - buf->len += pad; - } - + /* + * Prepare to encode subsequent operations. + * + * xdr_truncate_encode() is not safe to use after a successful + * splice read has been done, so the following stream + * manipulations are open-coded. + */ space_left = min_t(int, (void *)xdr->end - (void *)xdr->p, buf->buflen - buf->len); buf->buflen = buf->len + space_left; xdr->end = (__be32 *)((void *)xdr->end + space_left); - return 0; + return nfs_ok; out_err: /* @@ -4090,13 +4109,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, __be32 zero = xdr_zero; __be32 nfserr; - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount); - if (read->rd_vlen < 0) + if (xdr_reserve_space_vec(xdr, maxcount) < 0) return nfserr_resource; - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, &maxcount, - &read->rd_eof); + nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file, + read->rd_offset, &maxcount, + xdr->buf->page_len & ~PAGE_MASK, + &read->rd_eof); read->rd_length = maxcount; if (nfserr) return nfserr; @@ -4213,15 +4232,9 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, int starting_len = xdr->buf->len; __be32 *p; - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - - /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - xdr->buf->head[0].iov_len = (char *)xdr->p - - (char *)xdr->buf->head[0].iov_base; + nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf); + if (nfserr != nfs_ok) + return nfserr; /* * Number of bytes left for directory entries allowing for the @@ -4299,13 +4312,8 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_remove *remove = &u->remove; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &remove->rm_cinfo); - return 0; + return nfsd4_encode_change_info4(xdr, &remove->rm_cinfo); } static __be32 @@ -4314,14 +4322,11 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_rename *rename = &u->rename; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 40); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &rename->rn_sinfo); - p = encode_cinfo(p, &rename->rn_tinfo); - return 0; + nfserr = nfsd4_encode_change_info4(xdr, &rename->rn_sinfo); + if (nfserr) + return nfserr; + return nfsd4_encode_change_info4(xdr, &rename->rn_tinfo); } static __be32 @@ -4448,23 +4453,25 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_setclientid *scd = &u->setclientid; struct xdr_stream *xdr = resp->xdr; - __be32 *p; if (!nfserr) { - p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8); - p = xdr_encode_opaque_fixed(p, &scd->se_confirm, - NFS4_VERIFIER_SIZE); - } - else if (nfserr == nfserr_clid_inuse) { - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); + nfserr = nfsd4_encode_clientid4(xdr, &scd->se_clientid); + if (nfserr != nfs_ok) + goto out; + nfserr = nfsd4_encode_verifier4(xdr, &scd->se_confirm); + } else if (nfserr == nfserr_clid_inuse) { + /* empty network id */ + if (xdr_stream_encode_u32(xdr, 0) < 0) { + nfserr = nfserr_resource; + goto out; + } + /* empty universal address */ + if (xdr_stream_encode_u32(xdr, 0) < 0) { + nfserr = nfserr_resource; + goto out; + } } +out: return nfserr; } @@ -4473,17 +4480,12 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_write *write = &u->write; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 16); - if (!p) + if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0) return nfserr_resource; - *p++ = cpu_to_be32(write->wr_bytes_written); - *p++ = cpu_to_be32(write->wr_how_written); - p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, - NFS4_VERIFIER_SIZE); - return 0; + if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0) + return nfserr_resource; + return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier); } static __be32 @@ -4505,20 +4507,15 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, server_scope = nn->nfsd_name; server_scope_sz = strlen(nn->nfsd_name); - p = xdr_reserve_space(xdr, - 8 /* eir_clientid */ + - 4 /* eir_sequenceid */ + - 4 /* eir_flags */ + - 4 /* spr_how */); - if (!p) + if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok) + return nfserr_resource; + if (xdr_stream_encode_u32(xdr, exid->seqid) < 0) + return nfserr_resource; + if (xdr_stream_encode_u32(xdr, exid->flags) < 0) return nfserr_resource; - p = xdr_encode_opaque_fixed(p, &exid->clientid, 8); - *p++ = cpu_to_be32(exid->seqid); - *p++ = cpu_to_be32(exid->flags); - - *p++ = cpu_to_be32(exid->spa_how); - + if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0) + return nfserr_resource; switch (exid->spa_how) { case SP4_NONE: break; @@ -5099,15 +5096,8 @@ nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_setxattr *setxattr = &u->setxattr; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - - encode_cinfo(p, &setxattr->setxa_cinfo); - - return 0; + return nfsd4_encode_change_info4(xdr, &setxattr->setxa_cinfo); } /* @@ -5253,14 +5243,8 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_removexattr *removexattr = &u->removexattr; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - - p = encode_cinfo(p, &removexattr->rmxa_cinfo); - return 0; + return nfsd4_encode_change_info4(xdr, &removexattr->rmxa_cinfo); } typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u); @@ -5460,6 +5444,12 @@ status: release: if (opdesc && opdesc->op_release) opdesc->op_release(&op->u); + + /* + * Account for pages consumed while encoding this operation. + * The xdr_stream primitives don't manage rq_next_page. + */ + rqstp->rq_next_page = xdr->page_ptr + 1; } /* @@ -5528,9 +5518,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr) p = resp->statusp; *p++ = resp->cstate.status; - - rqstp->rq_next_page = xdr->page_ptr + 1; - *p++ = htonl(resp->taglen); memcpy(p, resp->tag, resp->taglen); p += XDR_QUADLEN(resp->taglen); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 041faa13b852..a8eda1c85829 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -148,12 +148,23 @@ void nfsd_drc_slab_free(void) kmem_cache_destroy(drc_slab); } -static int nfsd_reply_cache_stats_init(struct nfsd_net *nn) +/** + * nfsd_net_reply_cache_init - per net namespace reply cache set-up + * @nn: nfsd_net being initialized + * + * Returns zero on succes; otherwise a negative errno is returned. + */ +int nfsd_net_reply_cache_init(struct nfsd_net *nn) { return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); } -static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn) +/** + * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down + * @nn: nfsd_net being freed + * + */ +void nfsd_net_reply_cache_destroy(struct nfsd_net *nn) { nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); } @@ -169,17 +180,13 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) hashsize = nfsd_hashsize(nn->max_drc_entries); nn->maskbits = ilog2(hashsize); - status = nfsd_reply_cache_stats_init(nn); - if (status) - goto out_nomem; - nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; nn->nfsd_reply_cache_shrinker.seeks = 1; status = register_shrinker(&nn->nfsd_reply_cache_shrinker, "nfsd-reply:%s", nn->nfsd_name); if (status) - goto out_stats_destroy; + return status; nn->drc_hashtbl = kvzalloc(array_size(hashsize, sizeof(*nn->drc_hashtbl)), GFP_KERNEL); @@ -195,9 +202,6 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) return 0; out_shrinker: unregister_shrinker(&nn->nfsd_reply_cache_shrinker); -out_stats_destroy: - nfsd_reply_cache_stats_destroy(nn); -out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); return -ENOMEM; } @@ -217,7 +221,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) rp, nn); } } - nfsd_reply_cache_stats_destroy(nn); kvfree(nn->drc_hashtbl); nn->drc_hashtbl = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b4fd7a7062d5..1b8b1aab9a15 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -25,6 +25,7 @@ #include "netns.h" #include "pnfs.h" #include "filecache.h" +#include "trace.h" /* * We have a single directory with several nodes in it. @@ -109,12 +110,12 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu if (IS_ERR(data)) return PTR_ERR(data); - rv = write_op[ino](file, data, size); - if (rv >= 0) { - simple_transaction_set(file, rv); - rv = size; - } - return rv; + rv = write_op[ino](file, data, size); + if (rv < 0) + return rv; + + simple_transaction_set(file, rv); + return size; } static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) @@ -230,6 +231,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (rpc_pton(net, fo_path, size, sap, salen) == 0) return -EINVAL; + trace_nfsd_ctl_unlock_ip(net, buf); return nlmsvc_unlock_all_by_ip(sap); } @@ -263,7 +265,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) fo_path = buf; if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - + trace_nfsd_ctl_unlock_fs(netns(file), fo_path); error = kern_path(fo_path, 0, &path); if (error) return error; @@ -324,7 +326,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) len = qword_get(&mesg, dname, size); if (len <= 0) return -EINVAL; - + path = dname+len+1; len = qword_get(&mesg, path, size); if (len <= 0) @@ -338,15 +340,17 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) return -EINVAL; maxsize = min(maxsize, NFS3_FHSIZE); - if (qword_get(&mesg, mesg, size)>0) + if (qword_get(&mesg, mesg, size) > 0) return -EINVAL; + trace_nfsd_ctl_filehandle(netns(file), dname, path, maxsize); + /* we have all the words, they are in buf.. */ dom = unix_domain_find(dname); if (!dom) return -ENOMEM; - len = exp_rootfh(netns(file), dom, path, &fh, maxsize); + len = exp_rootfh(netns(file), dom, path, &fh, maxsize); auth_domain_put(dom); if (len) return len; @@ -399,6 +403,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return rv; if (newthreads < 0) return -EINVAL; + trace_nfsd_ctl_threads(net, newthreads); rv = nfsd_svc(newthreads, net, file->f_cred); if (rv < 0) return rv; @@ -418,8 +423,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: C string containing whitespace- - * separated unsigned integer values + * buf: C string containing whitespace- + * separated unsigned integer values * representing the number of NFSD * threads to start in each pool * size: non-zero length of C string in @buf @@ -471,6 +476,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) rv = -EINVAL; if (nthreads[i] < 0) goto out_free; + trace_nfsd_ctl_pool_threads(net, i, nthreads[i]); } rv = nfsd_set_nrthreads(i, nthreads, net); if (rv) @@ -526,7 +532,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) char *sep; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); - if (size>0) { + if (size > 0) { if (nn->nfsd_serv) /* Cannot change versions without updating * nn->nfsd_serv->sv_xdrsize, and reallocing @@ -536,6 +542,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) if (buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; + trace_nfsd_ctl_version(netns(file), buf); vers = mesg; len = qword_get(&mesg, vers, size); @@ -637,11 +644,11 @@ out: * OR * * Input: - * buf: C string containing whitespace- - * separated positive or negative - * integer values representing NFS - * protocol versions to enable ("+n") - * or disable ("-n") + * buf: C string containing whitespace- + * separated positive or negative + * integer values representing NFS + * protocol versions to enable ("+n") + * or disable ("-n") * size: non-zero length of C string in @buf * Output: * On success: status of zero or more protocol versions has @@ -689,6 +696,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred err = get_int(&mesg, &fd); if (err != 0 || fd < 0) return -EINVAL; + trace_nfsd_ctl_ports_addfd(net, fd); err = nfsd_create_serv(net); if (err != 0) @@ -705,7 +713,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred } /* - * A transport listener is added by writing it's transport name and + * A transport listener is added by writing its transport name and * a port number. */ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred) @@ -720,6 +728,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr if (port < 1 || port > USHRT_MAX) return -EINVAL; + trace_nfsd_ctl_ports_addxprt(net, transport, port); err = nfsd_create_serv(net); if (err != 0) @@ -832,9 +841,9 @@ int nfsd_max_blksize; * OR * * Input: - * buf: C string containing an unsigned - * integer value representing the new - * NFS blksize + * buf: C string containing an unsigned + * integer value representing the new + * NFS blksize * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string @@ -853,6 +862,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) int rv = get_int(&mesg, &bsize); if (rv) return rv; + trace_nfsd_ctl_maxblksize(netns(file), bsize); + /* force bsize into allowed range and * required alignment. */ @@ -881,9 +892,9 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: C string containing an unsigned - * integer value representing the new - * number of max connections + * buf: C string containing an unsigned + * integer value representing the new + * number of max connections * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string @@ -903,6 +914,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size) if (rv) return rv; + trace_nfsd_ctl_maxconn(netns(file), maxconn); nn->max_connections = maxconn; } @@ -913,6 +925,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size) static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time64_t *time, struct nfsd_net *nn) { + struct dentry *dentry = file_dentry(file); char *mesg = buf; int rv, i; @@ -922,6 +935,9 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, rv = get_int(&mesg, &i); if (rv) return rv; + trace_nfsd_ctl_time(netns(file), dentry->d_name.name, + dentry->d_name.len, i); + /* * Some sanity checking. We don't have a reason for * these particular numbers, but problems with the @@ -1014,6 +1030,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, len = qword_get(&mesg, recdir, size); if (len <= 0) return -EINVAL; + trace_nfsd_ctl_recoverydir(netns(file), recdir); status = nfs4_reset_recoverydir(recdir); if (status) @@ -1065,7 +1082,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: any value + * buf: any value * size: non-zero length of C string in @buf * Output: * passed-in buffer filled with "Y" or "N" with a newline @@ -1087,7 +1104,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case '1': if (!nn->nfsd_serv) return -EBUSY; - nfsd4_end_grace(nn); + trace_nfsd_end_grace(netns(file)); break; default: return -EINVAL; @@ -1192,8 +1209,8 @@ static int __nfsd_symlink(struct inode *dir, struct dentry *dentry, * @content is assumed to be a NUL-terminated string that lives * longer than the symlink itself. */ -static void nfsd_symlink(struct dentry *parent, const char *name, - const char *content) +static void _nfsd_symlink(struct dentry *parent, const char *name, + const char *content) { struct inode *dir = parent->d_inode; struct dentry *dentry; @@ -1210,8 +1227,8 @@ out: inode_unlock(dir); } #else -static inline void nfsd_symlink(struct dentry *parent, const char *name, - const char *content) +static inline void _nfsd_symlink(struct dentry *parent, const char *name, + const char *content) { } @@ -1389,8 +1406,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) ret = simple_fill_super(sb, 0x6e667364, nfsd_files); if (ret) return ret; - nfsd_symlink(sb->s_root, "supported_krb5_enctypes", - "/proc/net/rpc/gss_krb5_enctypes"); + _nfsd_symlink(sb->s_root, "supported_krb5_enctypes", + "/proc/net/rpc/gss_krb5_enctypes"); dentry = nfsd_mkdir(sb->s_root, NULL, "clients"); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -1477,7 +1494,17 @@ static int create_proc_exports_entry(void) unsigned int nfsd_net_id; -static __net_init int nfsd_init_net(struct net *net) +/** + * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace + * @net: a freshly-created network namespace + * + * This information stays around as long as the network namespace is + * alive whether or not there is an NFSD instance running in the + * namespace. + * + * Returns zero on success, or a negative errno otherwise. + */ +static __net_init int nfsd_net_init(struct net *net) { int retval; struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -1488,6 +1515,9 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; + retval = nfsd_net_reply_cache_init(nn); + if (retval) + goto out_repcache_error; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; nfsd4_init_leases_net(nn); @@ -1496,22 +1526,32 @@ static __net_init int nfsd_init_net(struct net *net) return 0; +out_repcache_error: + nfsd_idmap_shutdown(net); out_idmap_error: nfsd_export_shutdown(net); out_export_error: return retval; } -static __net_exit void nfsd_exit_net(struct net *net) +/** + * nfsd_net_exit - Release the nfsd_net portion of a net namespace + * @net: a network namespace that is about to be destroyed + * + */ +static __net_exit void nfsd_net_exit(struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfsd_net_reply_cache_destroy(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); - nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); + nfsd_netns_free_versions(nn); } static struct pernet_operations nfsd_net_ops = { - .init = nfsd_init_net, - .exit = nfsd_exit_net, + .init = nfsd_net_init, + .exit = nfsd_net_exit, .id = &nfsd_net_id, .size = sizeof(struct nfsd_net), }; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ccd8485fee04..e8e13ae72e3c 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -623,16 +623,9 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; - if (v4 && IS_I_VERSION(inode)) { - stat.change_cookie = inode_query_iversion(inode); - stat.result_mask |= STATX_CHANGE_COOKIE; - } - } + if (err) + return; + if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -660,15 +653,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - if (v4 && IS_I_VERSION(inode)) { - fhp->fh_post_attr.change_cookie = inode_query_iversion(inode); - fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE; - } - } else - fhp->fh_post_saved = true; + if (err) + return; + + fhp->fh_post_saved = true; if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index c37195572fd0..a7315928a760 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -176,9 +176,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) { struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; - unsigned int len; u32 eof; - int v; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -187,17 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); - v = 0; - len = argp->count; resp->pages = rqstp->rq_next_page; - while (len > 0) { - struct page *page = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(page); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } /* Obtain buffer pointer for payload. 19 is 1 word for * status, 17 words for fattr, and 1 word for the byte count. @@ -207,7 +195,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) resp->count = argp->count; fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, v, &resp->count, &eof); + &resp->count, &eof); if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9c7b1ef5be40..2154fa63c5f2 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -402,6 +402,11 @@ void nfsd_reset_write_verifier(struct nfsd_net *nn) write_sequnlock(&nn->writeverf_lock); } +/* + * Crank up a set of per-namespace resources for a new NFSD instance, + * including lockd, a duplicate reply cache, an open file cache + * instance, and a cache of NFSv4 state objects. + */ static int nfsd_startup_net(struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index caf6355b18fa..5777f40c7353 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -468,7 +468,8 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr) case nfs_ok: if (xdr_stream_encode_u32(xdr, resp->len) < 0) return false; - xdr_write_pages(xdr, &resp->page, 0, resp->len); + svcxdr_encode_opaque_pages(rqstp, xdr, &resp->page, 0, + resp->len); if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) return false; break; @@ -491,8 +492,9 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (xdr_stream_encode_u32(xdr, resp->count) < 0) return false; - xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base, - resp->count); + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, + rqstp->rq_res.page_base, + resp->count); if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) return false; break; @@ -511,7 +513,8 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; switch (resp->status) { case nfs_ok: - xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len); + svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0, + dirlist->len); /* no more entries */ if (xdr_stream_encode_item_absent(xdr) < 0) return false; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 72a906a053dc..2af74983f146 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1581,6 +1581,265 @@ TRACE_EVENT(nfsd_cb_recall_any_done, ) ); +TRACE_EVENT(nfsd_ctl_unlock_ip, + TP_PROTO( + const struct net *net, + const char *address + ), + TP_ARGS(net, address), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(address, address) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(address, address); + ), + TP_printk("address=%s", + __get_str(address) + ) +); + +TRACE_EVENT(nfsd_ctl_unlock_fs, + TP_PROTO( + const struct net *net, + const char *path + ), + TP_ARGS(net, path), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(path, path) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(path, path); + ), + TP_printk("path=%s", + __get_str(path) + ) +); + +TRACE_EVENT(nfsd_ctl_filehandle, + TP_PROTO( + const struct net *net, + const char *domain, + const char *path, + int maxsize + ), + TP_ARGS(net, domain, path, maxsize), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, maxsize) + __string(domain, domain) + __string(path, path) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->maxsize = maxsize; + __assign_str(domain, domain); + __assign_str(path, path); + ), + TP_printk("domain=%s path=%s maxsize=%d", + __get_str(domain), __get_str(path), __entry->maxsize + ) +); + +TRACE_EVENT(nfsd_ctl_threads, + TP_PROTO( + const struct net *net, + int newthreads + ), + TP_ARGS(net, newthreads), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, newthreads) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->newthreads = newthreads; + ), + TP_printk("newthreads=%d", + __entry->newthreads + ) +); + +TRACE_EVENT(nfsd_ctl_pool_threads, + TP_PROTO( + const struct net *net, + int pool, + int nrthreads + ), + TP_ARGS(net, pool, nrthreads), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, pool) + __field(int, nrthreads) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->pool = pool; + __entry->nrthreads = nrthreads; + ), + TP_printk("pool=%d nrthreads=%d", + __entry->pool, __entry->nrthreads + ) +); + +TRACE_EVENT(nfsd_ctl_version, + TP_PROTO( + const struct net *net, + const char *mesg + ), + TP_ARGS(net, mesg), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(mesg, mesg) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(mesg, mesg); + ), + TP_printk("%s", + __get_str(mesg) + ) +); + +TRACE_EVENT(nfsd_ctl_ports_addfd, + TP_PROTO( + const struct net *net, + int fd + ), + TP_ARGS(net, fd), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, fd) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->fd = fd; + ), + TP_printk("fd=%d", + __entry->fd + ) +); + +TRACE_EVENT(nfsd_ctl_ports_addxprt, + TP_PROTO( + const struct net *net, + const char *transport, + int port + ), + TP_ARGS(net, transport, port), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, port) + __string(transport, transport) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->port = port; + __assign_str(transport, transport); + ), + TP_printk("transport=%s port=%d", + __get_str(transport), __entry->port + ) +); + +TRACE_EVENT(nfsd_ctl_maxblksize, + TP_PROTO( + const struct net *net, + int bsize + ), + TP_ARGS(net, bsize), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, bsize) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->bsize = bsize; + ), + TP_printk("bsize=%d", + __entry->bsize + ) +); + +TRACE_EVENT(nfsd_ctl_maxconn, + TP_PROTO( + const struct net *net, + int maxconn + ), + TP_ARGS(net, maxconn), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, maxconn) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->maxconn = maxconn; + ), + TP_printk("maxconn=%d", + __entry->maxconn + ) +); + +TRACE_EVENT(nfsd_ctl_time, + TP_PROTO( + const struct net *net, + const char *name, + size_t namelen, + int time + ), + TP_ARGS(net, name, namelen, time), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, time) + __string_len(name, name, namelen) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->time = time; + __assign_str_len(name, name, namelen); + ), + TP_printk("file=%s time=%d\n", + __get_str(name), __entry->time + ) +); + +TRACE_EVENT(nfsd_ctl_recoverydir, + TP_PROTO( + const struct net *net, + const char *recdir + ), + TP_ARGS(net, recdir), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(recdir, recdir) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(recdir, recdir); + ), + TP_printk("recdir=%s", + __get_str(recdir) + ) +); + +TRACE_EVENT(nfsd_end_grace, + TP_PROTO( + const struct net *net + ), + TP_ARGS(net), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + ), + TP_printk("nn=%d", __entry->netns_ino + ) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index db67f8e19344..59b7d60ae33e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -388,7 +388,9 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) iap->ia_mode &= ~S_ISGID; } else { /* set ATTR_KILL_* bits and let VFS handle it */ - iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); + iap->ia_valid |= ATTR_KILL_SUID; + iap->ia_valid |= + setattr_should_drop_sgid(&nop_mnt_idmap, inode); } } } @@ -1001,6 +1003,18 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } } +/** + * nfsd_splice_read - Perform a VFS read using a splice pipe + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @file: opened struct file of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @eof: OUT: set non-zero if operation reached the end of the file + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. + */ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, u32 *eof) @@ -1014,22 +1028,50 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); - rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } -__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, - struct kvec *vec, int vlen, unsigned long *count, - u32 *eof) +/** + * nfsd_iter_read - Perform a VFS read using an iterator + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @file: opened struct file of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @base: offset in first page of read buffer + * @eof: OUT: set non-zero if operation reached the end of the file + * + * Some filesystems or situations cannot use nfsd_splice_read. This + * function is the slightly less-performant fallback for those cases. + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. + */ +__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, unsigned long *count, + unsigned int base, u32 *eof) { + unsigned long v, total; struct iov_iter iter; loff_t ppos = offset; + struct page *page; ssize_t host_err; + v = 0; + total = *count; + while (total) { + page = *(rqstp->rq_next_page++); + rqstp->rq_vec[v].iov_base = page_address(page) + base; + rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base); + total -= rqstp->rq_vec[v].iov_len; + ++v; + base = 0; + } + WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec)); + trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count); + iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count); host_err = vfs_iter_read(file, &iter, &ppos, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1159,14 +1201,24 @@ out_nfserr: return nfserr; } -/* - * Read data from a file. count must contain the requested read count - * on entry. On return, *count contains the number of bytes actually read. +/** + * nfsd_read - Read data from a file + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @eof: OUT: set non-zero if operation reached the end of the file + * + * The caller must verify that there is enough space in @rqstp.rq_res + * to perform this operation. + * * N.B. After this call fhp needs an fh_put + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. */ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count, - u32 *eof) + loff_t offset, unsigned long *count, u32 *eof) { struct nfsd_file *nf; struct file *file; @@ -1181,12 +1233,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else - err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof); + err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof); nfsd_file_put(nf); - trace_nfsd_read_done(rqstp, fhp, offset, *count); - return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 43fb57a301d3..a6890ea7b765 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -110,13 +110,12 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, u32 *eof); -__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, +__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - struct kvec *vec, int vlen, - unsigned long *count, + unsigned long *count, unsigned int base, u32 *eof); -__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, struct kvec *, int, unsigned long *, +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, unsigned long *count, u32 *eof); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 5cf30827f244..b4e54d079b7d 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -370,7 +370,15 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) struct folio *folio = fbatch.folios[i]; folio_lock(folio); - nilfs_clear_dirty_page(&folio->page, silent); + + /* + * This folio may have been removed from the address + * space by truncation or invalidation when the lock + * was acquired. Skip processing in that case. + */ + if (likely(folio->mapping == mapping)) + nilfs_clear_dirty_page(&folio->page, silent); + folio_unlock(folio); } folio_batch_release(&fbatch); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1362ccb64ec7..6e59dc19a732 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -101,6 +101,12 @@ int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) if (unlikely(!bh)) return -ENOMEM; + lock_buffer(bh); + if (!buffer_uptodate(bh)) { + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); nilfs_segbuf_add_segsum_buffer(segbuf, bh); return 0; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ac949fd7603f..c2553024bd25 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -981,10 +981,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + + lock_buffer(bh_sr); raw_sr = (struct nilfs_super_root *)bh_sr->b_data; isz = nilfs->ns_inode_size; srsz = NILFS_SR_BYTES(isz); + raw_sr->sr_sum = 0; /* Ensure initialization within this update */ raw_sr->sr_bytes = cpu_to_le16(srsz); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? @@ -998,6 +1001,8 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + NILFS_SR_SUFILE_OFFSET(isz), 1); memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); + set_buffer_uptodate(bh_sr); + unlock_buffer(bh_sr); } static void nilfs_redirty_inodes(struct list_head *head) @@ -1780,6 +1785,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { if (bd_page) end_page_writeback(bd_page); @@ -1791,6 +1797,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) b_assoc_buffers) { clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 77f1e5778d1c..9ba4933087af 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -372,10 +372,31 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) goto out; } nsbp = (void *)nsbh->b_data + offset; - memset(nsbp, 0, nilfs->ns_blocksize); + lock_buffer(nsbh); if (sb2i >= 0) { + /* + * The position of the second superblock only changes by 4KiB, + * which is larger than the maximum superblock data size + * (= 1KiB), so there is no need to use memmove() to allow + * overlap between source and destination. + */ memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + + /* + * Zero fill after copy to avoid overwriting in case of move + * within the same block. + */ + memset(nsbh->b_data, 0, offset); + memset((void *)nsbp + nilfs->ns_sbsize, 0, + nsbh->b_size - offset - nilfs->ns_sbsize); + } else { + memset(nsbh->b_data, 0, nsbh->b_size); + } + set_buffer_uptodate(nsbh); + unlock_buffer(nsbh); + + if (sb2i >= 0) { brelse(nilfs->ns_sbh[sb2i]); nilfs->ns_sbh[sb2i] = nsbh; nilfs->ns_sbp[sb2i] = nsbp; diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index a3865bc4a0c6..f79408f9127a 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -2491,7 +2491,7 @@ conv_err_out: * byte offset @ofs inside the attribute with the constant byte @val. * * This function is effectively like memset() applied to an ntfs attribute. - * Note thie function actually only operates on the page cache pages belonging + * Note this function actually only operates on the page cache pages belonging * to the ntfs attribute and it marks them dirty after doing the memset(). * Thus it relies on the vm dirty page write code paths to cause the modified * pages to be written to the mft record/disk. diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index f9cb180b6f6b..761aaa0195d6 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -161,7 +161,7 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], */ u8 *cb_end = cb_start + cb_size; /* End of cb. */ u8 *cb = cb_start; /* Current position in cb. */ - u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */ + u8 *cb_sb_start; /* Beginning of the current sb in the cb. */ u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ /* Variables for uncompressed data / destination. */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 48030899dc6e..0155f106ec34 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1955,36 +1955,38 @@ undo_alloc: "attribute.%s", es); NVolSetErrors(vol); } - a = ctx->attr; + if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { ntfs_error(vol->sb, "Failed to truncate mft data attribute " "runlist.%s", es); NVolSetErrors(vol); } - if (mp_rebuilt && !IS_ERR(ctx->mrec)) { - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + if (ctx) { + a = ctx->attr; + if (mp_rebuilt && !IS_ERR(ctx->mrec)) { + if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu( a->data.non_resident.mapping_pairs_offset), old_alen - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), + a->data.non_resident.mapping_pairs_offset), rl2, ll, -1, NULL)) { - ntfs_error(vol->sb, "Failed to restore mapping pairs " + ntfs_error(vol->sb, "Failed to restore mapping pairs " "array.%s", es); - NVolSetErrors(vol); - } - if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { - ntfs_error(vol->sb, "Failed to restore attribute " + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } else if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to restore attribute search " + "context.%s", es); NVolSetErrors(vol); } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } else if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to restore attribute search " - "context.%s", es); - NVolSetErrors(vol); - } - if (ctx) ntfs_attr_put_search_ctx(ctx); + } if (!IS_ERR(mrec)) unmap_mft_record(mft_ni); up_write(&mft_ni->runlist.lock); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 2643a08182e1..56a7d5bd33e4 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1620,7 +1620,7 @@ read_partial_attrdef_page: memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT), page_address(page), size); ntfs_unmap_page(page); - }; + } if (size == PAGE_SIZE) { size = i_size & ~PAGE_MASK; if (size) @@ -1689,7 +1689,7 @@ read_partial_upcase_page: memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT), page_address(page), size); ntfs_unmap_page(page); - }; + } if (size == PAGE_SIZE) { size = i_size & ~PAGE_MASK; if (size) diff --git a/fs/open.c b/fs/open.c index 4478adcc4f3a..fb07b2840eb4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -700,10 +700,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) return do_fchmodat(AT_FDCWD, filename, mode); } -/** - * setattr_vfsuid - check and set ia_fsuid attribute - * @kuid: new inode owner - * +/* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * @@ -718,10 +715,7 @@ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) return true; } -/** - * setattr_vfsgid - check and set ia_fsgid attribute - * @kgid: new inode owner - * +/* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * @@ -989,7 +983,6 @@ cleanup_file: * @file: file pointer * @dentry: pointer to dentry * @open: open callback - * @opened: state of open * * This can be used to finish opening a file passed to i_op->atomic_open(). * @@ -1043,7 +1036,6 @@ EXPORT_SYMBOL(file_path); * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized - * @cred: credentials to use */ int vfs_open(const struct path *path, struct file *file) { @@ -1116,23 +1108,77 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, } EXPORT_SYMBOL(dentry_create); -struct file *open_with_fake_path(const struct path *path, int flags, +/** + * kernel_file_open - open a file for kernel internal use + * @path: path of the file to open + * @flags: open flags + * @inode: the inode + * @cred: credentials for open + * + * Open a file for use by in-kernel consumers. The file is not accounted + * against nr_files and must not be installed into the file descriptor + * table. + * + * Return: Opened file on success, an error pointer on failure. + */ +struct file *kernel_file_open(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { - struct file *f = alloc_empty_file_noaccount(flags, cred); - if (!IS_ERR(f)) { - int error; + struct file *f; + int error; - f->f_path = *path; - error = do_dentry_open(f, inode, NULL); - if (error) { - fput(f); - f = ERR_PTR(error); - } + f = alloc_empty_file_noaccount(flags, cred); + if (IS_ERR(f)) + return f; + + f->f_path = *path; + error = do_dentry_open(f, inode, NULL); + if (error) { + fput(f); + f = ERR_PTR(error); } return f; } -EXPORT_SYMBOL(open_with_fake_path); +EXPORT_SYMBOL_GPL(kernel_file_open); + +/** + * backing_file_open - open a backing file for kernel internal use + * @path: path of the file to open + * @flags: open flags + * @path: path of the backing file + * @cred: credentials for open + * + * Open a backing file for a stackable filesystem (e.g., overlayfs). + * @path may be on the stackable filesystem and backing inode on the + * underlying filesystem. In this case, we want to be able to return + * the @real_path of the backing inode. This is done by embedding the + * returned file into a container structure that also stores the path of + * the backing inode on the underlying filesystem, which can be + * retrieved using backing_file_real_path(). + */ +struct file *backing_file_open(const struct path *path, int flags, + const struct path *real_path, + const struct cred *cred) +{ + struct file *f; + int error; + + f = alloc_empty_backing_file(flags, cred); + if (IS_ERR(f)) + return f; + + f->f_path = *path; + path_get(real_path); + *backing_file_real_path(f) = *real_path; + error = do_dentry_open(f, d_inode(real_path->dentry), NULL); + if (error) { + fput(f); + f = ERR_PTR(error); + } + + return f; +} +EXPORT_SYMBOL_GPL(backing_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) @@ -1156,7 +1202,7 @@ inline struct open_how build_open_how(int flags, umode_t mode) inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; - u64 strip = FMODE_NONOTIFY | O_CLOEXEC; + u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 7c04f033aadd..dbbb156c2d67 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -34,8 +34,8 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) return 'm'; } -/* No atime modification nor notify on underlying */ -#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) +/* No atime modification on underlying */ +#define OVL_OPEN_FLAGS (O_NOATIME) static struct file *ovl_open_realfile(const struct file *file, const struct path *realpath) @@ -61,8 +61,8 @@ static struct file *ovl_open_realfile(const struct file *file, if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; - realfile = open_with_fake_path(&file->f_path, flags, realinode, - current_cred()); + realfile = backing_file_open(&file->f_path, flags, realpath, + current_cred()); } revert_creds(old_cred); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4d0b278f5630..23686e8a06c4 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -329,8 +329,9 @@ static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; - struct file *file = vfs_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode, - O_LARGEFILE | O_WRONLY, current_cred()); + struct file *file = kernel_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, + mode, O_LARGEFILE | O_WRONLY, + current_cred()); int err = PTR_ERR_OR_ZERO(file); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); diff --git a/fs/pnode.c b/fs/pnode.c index 3cede8b18c8b..e4d0340393d5 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -216,7 +216,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct hlist_head *list; -static inline bool peers(struct mount *m1, struct mount *m2) +static inline bool peers(const struct mount *m1, const struct mount *m2) { return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } @@ -354,6 +354,46 @@ static inline int do_refcount_check(struct mount *mnt, int count) return mnt_get_count(mnt) > count; } +/** + * propagation_would_overmount - check whether propagation from @from + * would overmount @to + * @from: shared mount + * @to: mount to check + * @mp: future mountpoint of @to on @from + * + * If @from propagates mounts to @to, @from and @to must either be peers + * or one of the masters in the hierarchy of masters of @to must be a + * peer of @from. + * + * If the root of the @to mount is equal to the future mountpoint @mp of + * the @to mount on @from then @to will be overmounted by whatever is + * propagated to it. + * + * Context: This function expects namespace_lock() to be held and that + * @mp is stable. + * Return: If @from overmounts @to, true is returned, false if not. + */ +bool propagation_would_overmount(const struct mount *from, + const struct mount *to, + const struct mountpoint *mp) +{ + if (!IS_MNT_SHARED(from)) + return false; + + if (IS_MNT_NEW(to)) + return false; + + if (to->mnt.mnt_root != mp->m_dentry) + return false; + + for (const struct mount *m = to; m; m = m->mnt_master) { + if (peers(from, m)) + return true; + } + + return false; +} + /* * check if the mount 'mnt' can be unmounted successfully. * @mnt: the mount to be checked for unmount diff --git a/fs/pnode.h b/fs/pnode.h index 988f1aa9b02a..0b02a6393891 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -53,4 +53,7 @@ struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); int count_mounts(struct mnt_namespace *ns, struct mount *mnt); +bool propagation_would_overmount(const struct mount *from, + const struct mount *to, + const struct mountpoint *mp); #endif /* _LINUX_PNODE_H */ diff --git a/fs/readdir.c b/fs/readdir.c index 9c53edb60c03..b264ce60114d 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -131,7 +131,7 @@ struct old_linux_dirent { unsigned long d_ino; unsigned long d_offset; unsigned short d_namlen; - char d_name[1]; + char d_name[]; }; struct readdir_callback { @@ -208,7 +208,7 @@ struct linux_dirent { unsigned long d_ino; unsigned long d_off; unsigned short d_reclen; - char d_name[1]; + char d_name[]; }; struct getdents_callback { @@ -388,7 +388,7 @@ struct compat_old_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_offset; unsigned short d_namlen; - char d_name[1]; + char d_name[]; }; struct compat_readdir_callback { @@ -460,7 +460,7 @@ struct compat_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_off; unsigned short d_reclen; - char d_name[1]; + char d_name[]; }; struct compat_getdents_callback { diff --git a/fs/remap_range.c b/fs/remap_range.c index 1331a890f2f2..87ae4f0dc3aa 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -15,6 +15,7 @@ #include <linux/mount.h> #include <linux/fs.h> #include <linux/dax.h> +#include <linux/overflow.h> #include "internal.h" #include <linux/uaccess.h> @@ -101,10 +102,12 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { + loff_t tmp; + if (unlikely(pos < 0 || len < 0)) return -EINVAL; - if (unlikely((loff_t) (pos + len) < 0)) + if (unlikely(check_add_overflow(pos, len, &tmp))) return -EINVAL; return security_file_permission(file, write ? MAY_WRITE : MAY_READ); diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index f9b2e0f19b03..ced7a9e916f0 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -185,24 +185,31 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, goto send; } - if (conn->ops->check_user_session) { - rc = conn->ops->check_user_session(work); - if (rc < 0) { - command = conn->ops->get_cmd_val(work); - conn->ops->set_rsp_status(work, - STATUS_USER_SESSION_DELETED); - goto send; - } else if (rc > 0) { - rc = conn->ops->get_ksmbd_tcon(work); + do { + if (conn->ops->check_user_session) { + rc = conn->ops->check_user_session(work); if (rc < 0) { - conn->ops->set_rsp_status(work, - STATUS_NETWORK_NAME_DELETED); + if (rc == -EINVAL) + conn->ops->set_rsp_status(work, + STATUS_INVALID_PARAMETER); + else + conn->ops->set_rsp_status(work, + STATUS_USER_SESSION_DELETED); goto send; + } else if (rc > 0) { + rc = conn->ops->get_ksmbd_tcon(work); + if (rc < 0) { + if (rc == -EINVAL) + conn->ops->set_rsp_status(work, + STATUS_INVALID_PARAMETER); + else + conn->ops->set_rsp_status(work, + STATUS_NETWORK_NAME_DELETED); + goto send; + } } } - } - do { rc = __process_request(work, conn, &command); if (rc == SERVER_HANDLER_ABORT) break; diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index 0ffe663b7590..33b7e6c4ceff 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -351,9 +351,16 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) int command; __u32 clc_len; /* calculated length */ __u32 len = get_rfc1002_len(work->request_buf); + __u32 req_struct_size, next_cmd = le32_to_cpu(hdr->NextCommand); - if (le32_to_cpu(hdr->NextCommand) > 0) - len = le32_to_cpu(hdr->NextCommand); + if ((u64)work->next_smb2_rcv_hdr_off + next_cmd > len) { + pr_err("next command(%u) offset exceeds smb msg size\n", + next_cmd); + return 1; + } + + if (next_cmd > 0) + len = next_cmd; else if (work->next_smb2_rcv_hdr_off) len -= work->next_smb2_rcv_hdr_off; @@ -373,17 +380,9 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } if (smb2_req_struct_sizes[command] != pdu->StructureSize2) { - if (command != SMB2_OPLOCK_BREAK_HE && - (hdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) { - /* error packets have 9 byte structure size */ - ksmbd_debug(SMB, - "Illegal request size %u for command %d\n", - le16_to_cpu(pdu->StructureSize2), command); - return 1; - } else if (command == SMB2_OPLOCK_BREAK_HE && - hdr->Status == 0 && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { + if (command == SMB2_OPLOCK_BREAK_HE && + le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && + le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { /* special case for SMB2.1 lease break message */ ksmbd_debug(SMB, "Illegal request size %d for oplock break\n", @@ -392,6 +391,14 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } } + req_struct_size = le16_to_cpu(pdu->StructureSize2) + + __SMB2_HEADER_STRUCTURE_SIZE; + if (command == SMB2_LOCK_HE) + req_struct_size -= sizeof(struct smb2_lock_element); + + if (req_struct_size > len + 1) + return 1; + if (smb2_calc_size(hdr, &clc_len)) return 1; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 25c0ba04c59d..da1787c68ba0 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -91,7 +91,6 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) unsigned int cmd = le16_to_cpu(req_hdr->Command); int tree_id; - work->tcon = NULL; if (cmd == SMB2_TREE_CONNECT_HE || cmd == SMB2_CANCEL_HE || cmd == SMB2_LOGOFF_HE) { @@ -105,10 +104,28 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) } tree_id = le32_to_cpu(req_hdr->Id.SyncId.TreeId); + + /* + * If request is not the first in Compound request, + * Just validate tree id in header with work->tcon->id. + */ + if (work->next_smb2_rcv_hdr_off) { + if (!work->tcon) { + pr_err("The first operation in the compound does not have tcon\n"); + return -EINVAL; + } + if (work->tcon->id != tree_id) { + pr_err("tree id(%u) is different with id(%u) in first operation\n", + tree_id, work->tcon->id); + return -EINVAL; + } + return 1; + } + work->tcon = ksmbd_tree_conn_lookup(work->sess, tree_id); if (!work->tcon) { pr_err("Invalid tid %d\n", tree_id); - return -EINVAL; + return -ENOENT; } return 1; @@ -547,7 +564,6 @@ int smb2_check_user_session(struct ksmbd_work *work) unsigned int cmd = conn->ops->get_cmd_val(work); unsigned long long sess_id; - work->sess = NULL; /* * SMB2_ECHO, SMB2_NEGOTIATE, SMB2_SESSION_SETUP command do not * require a session id, so no need to validate user session's for @@ -558,15 +574,33 @@ int smb2_check_user_session(struct ksmbd_work *work) return 0; if (!ksmbd_conn_good(conn)) - return -EINVAL; + return -EIO; sess_id = le64_to_cpu(req_hdr->SessionId); + + /* + * If request is not the first in Compound request, + * Just validate session id in header with work->sess->id. + */ + if (work->next_smb2_rcv_hdr_off) { + if (!work->sess) { + pr_err("The first operation in the compound does not have sess\n"); + return -EINVAL; + } + if (work->sess->id != sess_id) { + pr_err("session id(%llu) is different with the first operation(%lld)\n", + sess_id, work->sess->id); + return -EINVAL; + } + return 1; + } + /* Check for validity of user session */ work->sess = ksmbd_session_lookup_all(conn, sess_id); if (work->sess) return 1; ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id); - return -EINVAL; + return -ENOENT; } static void destroy_previous_session(struct ksmbd_conn *conn, @@ -2249,7 +2283,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* delete the EA only when it exits */ if (rc > 0) { rc = ksmbd_vfs_remove_xattr(idmap, - path->dentry, + path, attr_name); if (rc < 0) { @@ -2263,8 +2297,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* if the EA doesn't exist, just do nothing. */ rc = 0; } else { - rc = ksmbd_vfs_setxattr(idmap, - path->dentry, attr_name, value, + rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value, le16_to_cpu(eabuf->EaValueLength), 0); if (rc < 0) { ksmbd_debug(SMB, @@ -2321,8 +2354,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, return -EBADF; } - rc = ksmbd_vfs_setxattr(idmap, path->dentry, - xattr_stream_name, NULL, 0, 0); + rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0); if (rc < 0) pr_err("Failed to store XATTR stream name :%d\n", rc); return 0; @@ -2350,7 +2382,7 @@ static int smb2_remove_smb_xattrs(const struct path *path) if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) { - err = ksmbd_vfs_remove_xattr(idmap, path->dentry, + err = ksmbd_vfs_remove_xattr(idmap, path, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", @@ -2397,8 +2429,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path * da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), - path->dentry, &da); + rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da); if (rc) ksmbd_debug(SMB, "failed to store file attribute into xattr\n"); } @@ -2972,7 +3003,7 @@ int smb2_open(struct ksmbd_work *work) struct inode *inode = d_inode(path.dentry); posix_acl_rc = ksmbd_vfs_inherit_posix_acl(idmap, - path.dentry, + &path, d_inode(path.dentry->d_parent)); if (posix_acl_rc) ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc); @@ -2988,7 +3019,7 @@ int smb2_open(struct ksmbd_work *work) if (rc) { if (posix_acl_rc) ksmbd_vfs_set_init_posix_acl(idmap, - path.dentry); + &path); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { @@ -3028,7 +3059,7 @@ int smb2_open(struct ksmbd_work *work) rc = ksmbd_vfs_set_sd_xattr(conn, idmap, - path.dentry, + &path, pntsd, pntsd_size); kfree(pntsd); @@ -5464,7 +5495,7 @@ static int smb2_rename(struct ksmbd_work *work, goto out; rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp), - fp->filp->f_path.dentry, + &fp->filp->f_path, xattr_stream_name, NULL, 0, 0); if (rc < 0) { @@ -5629,8 +5660,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, - filp->f_path.dentry, &da); + rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da); if (rc) ksmbd_debug(SMB, "failed to restore file attribute in EA\n"); @@ -7485,7 +7515,7 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, da.attr = le32_to_cpu(fp->f_ci->m_fattr); ret = ksmbd_vfs_set_dos_attrib_xattr(idmap, - fp->filp->f_path.dentry, &da); + &fp->filp->f_path, &da); if (ret) fp->f_ci->m_fattr = old_fattr; } diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 0a5862a61c77..ad919a4239d0 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1162,8 +1162,7 @@ pass: pntsd_size += sizeof(struct smb_acl) + nt_size; } - ksmbd_vfs_set_sd_xattr(conn, idmap, - path->dentry, pntsd, pntsd_size); + ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size); kfree(pntsd); } @@ -1383,7 +1382,7 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, newattrs.ia_valid |= ATTR_MODE; newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777); - ksmbd_vfs_remove_acl_xattrs(idmap, path->dentry); + ksmbd_vfs_remove_acl_xattrs(idmap, path); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { rc = set_posix_acl(idmap, path->dentry, @@ -1414,9 +1413,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { /* Update WinACL in xattr */ - ksmbd_vfs_remove_sd_xattrs(idmap, path->dentry); - ksmbd_vfs_set_sd_xattr(conn, idmap, - path->dentry, pntsd, ntsd_len); + ksmbd_vfs_remove_sd_xattrs(idmap, path); + ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len); } out: diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index f9fb778247e7..81489fdedd8e 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -170,6 +170,10 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) return err; } + err = mnt_want_write(path.mnt); + if (err) + goto out_err; + mode |= S_IFREG; err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), dentry, mode, true); @@ -179,6 +183,9 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } else { pr_err("File(%s): creation failed (err:%d)\n", name, err); } + mnt_drop_write(path.mnt); + +out_err: done_path_create(&path, dentry); return err; } @@ -209,30 +216,35 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) return err; } + err = mnt_want_write(path.mnt); + if (err) + goto out_err2; + idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); - if (err) { - goto out; - } else if (d_unhashed(dentry)) { + if (!err && d_unhashed(dentry)) { struct dentry *d; d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, dentry->d_name.len); if (IS_ERR(d)) { err = PTR_ERR(d); - goto out; + goto out_err1; } if (unlikely(d_is_negative(d))) { dput(d); err = -ENOENT; - goto out; + goto out_err1; } ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d)); dput(d); } -out: + +out_err1: + mnt_drop_write(path.mnt); +out_err2: done_path_create(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); @@ -443,7 +455,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, memcpy(&stream_buf[*pos], buf, count); err = ksmbd_vfs_setxattr(idmap, - fp->filp->f_path.dentry, + &fp->filp->f_path, fp->stream.name, (void *)stream_buf, size, @@ -589,6 +601,10 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) goto out_err; } + err = mnt_want_write(path->mnt); + if (err) + goto out_err; + idmap = mnt_idmap(path->mnt); if (S_ISDIR(d_inode(path->dentry)->i_mode)) { err = vfs_rmdir(idmap, d_inode(parent), path->dentry); @@ -599,6 +615,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) if (err) ksmbd_debug(VFS, "unlink failed, err %d\n", err); } + mnt_drop_write(path->mnt); out_err: ksmbd_revert_fsids(work); @@ -644,11 +661,16 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, goto out3; } + err = mnt_want_write(newpath.mnt); + if (err) + goto out3; + err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt), d_inode(newpath.dentry), dentry, NULL); if (err) ksmbd_debug(VFS, "vfs_link failed err %d\n", err); + mnt_drop_write(newpath.mnt); out3: done_path_create(&newpath, dentry); @@ -694,6 +716,10 @@ retry: goto out2; } + err = mnt_want_write(old_path->mnt); + if (err) + goto out2; + trap = lock_rename_child(old_child, new_path.dentry); old_parent = dget(old_child->d_parent); @@ -757,6 +783,7 @@ out4: out3: dput(old_parent); unlock_rename(old_parent, new_path.dentry); + mnt_drop_write(old_path->mnt); out2: path_put(&new_path); @@ -897,19 +924,24 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, * Return: 0 on success, otherwise error */ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *attr_name, + const struct path *path, const char *attr_name, void *attr_value, size_t attr_size, int flags) { int err; + err = mnt_want_write(path->mnt); + if (err) + return err; + err = vfs_setxattr(idmap, - dentry, + path->dentry, attr_name, attr_value, attr_size, flags); if (err) ksmbd_debug(VFS, "setxattr failed, err %d\n", err); + mnt_drop_write(path->mnt); return err; } @@ -1013,9 +1045,18 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, } int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, char *attr_name) + const struct path *path, char *attr_name) { - return vfs_removexattr(idmap, dentry, attr_name); + int err; + + err = mnt_want_write(path->mnt); + if (err) + return err; + + err = vfs_removexattr(idmap, path->dentry, attr_name); + mnt_drop_write(path->mnt); + + return err; } int ksmbd_vfs_unlink(struct file *filp) @@ -1024,6 +1065,10 @@ int ksmbd_vfs_unlink(struct file *filp) struct dentry *dir, *dentry = filp->f_path.dentry; struct mnt_idmap *idmap = file_mnt_idmap(filp); + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + dir = dget_parent(dentry); err = ksmbd_vfs_lock_parent(dir, dentry); if (err) @@ -1041,6 +1086,7 @@ int ksmbd_vfs_unlink(struct file *filp) ksmbd_debug(VFS, "failed to delete, err %d\n", err); out: dput(dir); + mnt_drop_write(filp->f_path.mnt); return err; } @@ -1244,13 +1290,13 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, } int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry) + const struct path *path) { char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; - xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (xattr_list_len < 0) { goto out; } else if (!xattr_list_len) { @@ -1258,6 +1304,10 @@ int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, goto out; } + err = mnt_want_write(path->mnt); + if (err) + goto out; + for (name = xattr_list; name - xattr_list < xattr_list_len; name += strlen(name) + 1) { ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); @@ -1266,25 +1316,26 @@ int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = vfs_remove_acl(idmap, dentry, name); + err = vfs_remove_acl(idmap, path->dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); } } + mnt_drop_write(path->mnt); + out: kvfree(xattr_list); return err; } -int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry) +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path) { char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; - xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (xattr_list_len < 0) { goto out; } else if (!xattr_list_len) { @@ -1297,7 +1348,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) { - err = ksmbd_vfs_remove_xattr(idmap, dentry, name); + err = ksmbd_vfs_remove_xattr(idmap, path, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); } @@ -1374,13 +1425,14 @@ out: int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct smb_ntsd *pntsd, int len) { int rc; struct ndr sd_ndr = {0}, acl_ndr = {0}; struct xattr_ntacl acl = {0}; struct xattr_smb_acl *smb_acl, *def_smb_acl = NULL; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); acl.version = 4; @@ -1432,7 +1484,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, goto out; } - rc = ksmbd_vfs_setxattr(idmap, dentry, + rc = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_SD, sd_ndr.data, sd_ndr.offset, 0); if (rc < 0) @@ -1522,7 +1574,7 @@ free_n_data: } int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct xattr_dos_attrib *da) { struct ndr n; @@ -1532,7 +1584,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, if (err) return err; - err = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE, + err = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_DOS_ATTRIBUTE, (void *)n.data, n.offset, 0); if (err) ksmbd_debug(SMB, "failed to store dos attribute in xattr\n"); @@ -1769,10 +1821,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry) + struct path *path) { struct posix_acl_state acl_state; struct posix_acl *acls; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); int rc; @@ -1802,6 +1855,11 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, return -ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); + + rc = mnt_want_write(path->mnt); + if (rc) + goto out_err; + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", @@ -1813,16 +1871,20 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); } + mnt_drop_write(path->mnt); + +out_err: free_acl_state(&acl_state); posix_acl_release(acls); return rc; } int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *parent_inode) + struct path *path, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); int rc, i; @@ -1841,6 +1903,10 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, } } + rc = mnt_want_write(path->mnt); + if (rc) + goto out_err; + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", @@ -1852,6 +1918,9 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); } + mnt_drop_write(path->mnt); + +out_err: posix_acl_release(acls); return rc; } diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index a4ae89f3230d..8c0931d4d531 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -108,12 +108,12 @@ ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len); int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *attr_name, + const struct path *path, const char *attr_name, void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, char *attr_name); + const struct path *path, char *attr_name); int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, bool caseless); @@ -139,26 +139,25 @@ void ksmbd_vfs_posix_lock_wait(struct file_lock *flock); int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout); void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock); int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry); -int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry); + const struct path *path); +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path); int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct smb_ntsd *pntsd, int len); int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd); int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct xattr_dos_attrib *da); int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry); + struct path *path); int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry, + struct path *path, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 2d0138e72d78..f41f8d6108ce 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -252,7 +252,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) if (ksmbd_stream_fd(fp) && (ci->m_flags & S_DEL_ON_CLS_STREAM)) { ci->m_flags &= ~S_DEL_ON_CLS_STREAM; err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp), - filp->f_path.dentry, + &filp->f_path, fp->stream.name); if (err) pr_err("remove xattr failed : %s\n", diff --git a/fs/super.c b/fs/super.c index 34afe411cf2b..48c29954d487 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, @@ -595,7 +595,7 @@ retry: fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; - strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); @@ -674,7 +674,7 @@ retry: return ERR_PTR(err); } s->s_type = type; - strlcpy(s->s_id, type->name, sizeof(s->s_id)); + strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); @@ -903,6 +903,7 @@ int reconfigure_super(struct fs_context *fc) struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; + bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) @@ -920,7 +921,7 @@ int reconfigure_super(struct fs_context *fc) bdev_read_only(sb->s_bdev)) return -EACCES; #endif - + remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } @@ -943,13 +944,18 @@ int reconfigure_super(struct fs_context *fc) */ if (remount_ro) { if (force) { - sb->s_readonly_remount = 1; - smp_wmb(); + sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } + } else if (remount_rw) { + /* + * Protect filesystem's reconfigure code from writes from + * userspace until reconfigure finishes. + */ + sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { @@ -965,9 +971,7 @@ int reconfigure_super(struct fs_context *fc) WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); - /* Needs to be ordered wrt mnt_is_readonly() */ - smp_wmb(); - sb->s_readonly_remount = 0; + sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the @@ -982,7 +986,7 @@ int reconfigure_super(struct fs_context *fc) return 0; cancel_readonly: - sb->s_readonly_remount = 0; + sb_end_ro_state_change(sb); return retval; } diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index cdb3d632c63d..0140010aa0c3 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -52,7 +52,7 @@ static int sysv_handle_dirsync(struct inode *dir) } /* - * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the + * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the * rules documented in mm/highmem.rst. * * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page() @@ -103,11 +103,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN)) { - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); return 0; } } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } return 0; } @@ -131,7 +131,7 @@ static inline int namecompare(int len, int maxlen, * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * - * On Success put_and_unmap_page() should be called on *res_page. + * On Success unmap_and_put_page() should be called on *res_page. * * sysv_find_entry() acts as a call to dir_get_page() and must be treated * accordingly for nesting purposes. @@ -166,7 +166,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ name, de->name)) goto found; } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } if (++n >= npages) @@ -209,7 +209,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) goto out_page; de++; } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } BUG(); return -EINVAL; @@ -228,7 +228,7 @@ got_it: mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); out_page: - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); return err; out_unlock: unlock_page(page); @@ -321,12 +321,12 @@ int sysv_empty_dir(struct inode * inode) if (de->name[1] != '.' || de->name[2]) goto not_empty; } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } return 1; not_empty: - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); return 0; } @@ -352,7 +352,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, } /* - * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the + * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the * rules documented in mm/highmem.rst. * * sysv_dotdot() acts as a call to dir_get_page() and must be treated @@ -376,7 +376,7 @@ ino_t sysv_inode_by_name(struct dentry *dentry) if (de) { res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - put_and_unmap_page(page, de); + unmap_and_put_page(page, de); } return res; } diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index b22764fe669c..58d7f43a1371 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -145,6 +145,10 @@ static int alloc_branch(struct inode *inode, */ parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key); bh = sb_getblk(inode->i_sb, parent); + if (!bh) { + sysv_free_block(inode->i_sb, branch[n].key); + break; + } lock_buffer(bh); memset(bh->b_data, 0, blocksize); branch[n].bh = bh; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 2b2dba4c4f56..fcf163fea3ad 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -164,7 +164,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry) inode->i_ctime = dir->i_ctime; inode_dec_link_count(inode); } - put_and_unmap_page(page, de); + unmap_and_put_page(page, de); return err; } @@ -227,7 +227,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!new_de) goto out_dir; err = sysv_set_link(new_de, new_page, old_inode); - put_and_unmap_page(new_page, new_de); + unmap_and_put_page(new_page, new_de); if (err) goto out_dir; new_inode->i_ctime = current_time(new_inode); @@ -256,9 +256,9 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, out_dir: if (dir_de) - put_and_unmap_page(dir_page, dir_de); + unmap_and_put_page(dir_page, dir_de); out_old: - put_and_unmap_page(old_page, old_de); + unmap_and_put_page(old_page, old_de); out: return err; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index fd20423d3ed2..fd29a66e7241 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -793,11 +793,6 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!empty_dir(new_inode)) goto out_oiter; } - /* - * We need to protect against old_inode getting converted from - * ICB to normal directory. - */ - inode_lock_nested(old_inode, I_MUTEX_NONDIR2); retval = udf_fiiter_find_entry(old_inode, &dotdot_name, &diriter); if (retval == -ENOENT) { @@ -806,10 +801,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, old_inode->i_ino); retval = -EFSCORRUPTED; } - if (retval) { - inode_unlock(old_inode); + if (retval) goto out_oiter; - } has_diriter = true; tloc = lelb_to_cpu(diriter.fi.icb.extLocation); if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != @@ -889,7 +882,6 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, udf_dir_entry_len(&diriter.fi)); udf_fiiter_write_fi(&diriter, NULL); udf_fiiter_release(&diriter); - inode_unlock(old_inode); inode_dec_link_count(old_dir); if (new_inode) @@ -901,10 +893,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, } return 0; out_oiter: - if (has_diriter) { + if (has_diriter) udf_fiiter_release(&diriter); - inode_unlock(old_inode); - } udf_fiiter_release(&oiter); return retval; diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index a7ffd718f171..e1036e535352 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -39,14 +39,14 @@ config FS_VERITY_BUILTIN_SIGNATURES depends on FS_VERITY select SYSTEM_DATA_VERIFICATION help - Support verifying signatures of verity files against the X.509 - certificates that have been loaded into the ".fs-verity" - kernel keyring. + This option adds support for in-kernel verification of + fs-verity builtin signatures. - This is meant as a relatively simple mechanism that can be - used to provide an authenticity guarantee for verity files, as - an alternative to IMA appraisal. Userspace programs still - need to check that the verity bit is set in order to get an - authenticity guarantee. + Please take great care before using this feature. It is not + the only way to do signatures with fs-verity, and the + alternatives (such as userspace signature verification, and + IMA appraisal) can be much better. For details about the + limitations of this feature, see + Documentation/filesystems/fsverity.rst. If unsure, say N. diff --git a/fs/verity/enable.c b/fs/verity/enable.c index fc4c50e5219d..c284f46d1b53 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -7,6 +7,7 @@ #include "fsverity_private.h" +#include <crypto/hash.h> #include <linux/mount.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> @@ -20,7 +21,7 @@ struct block_buffer { /* Hash a block, writing the result to the next level's pending block buffer. */ static int hash_one_block(struct inode *inode, const struct merkle_tree_params *params, - struct ahash_request *req, struct block_buffer *cur) + struct block_buffer *cur) { struct block_buffer *next = cur + 1; int err; @@ -36,8 +37,7 @@ static int hash_one_block(struct inode *inode, /* Zero-pad the block if it's shorter than the block size. */ memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); - err = fsverity_hash_block(params, inode, req, virt_to_page(cur->data), - offset_in_page(cur->data), + err = fsverity_hash_block(params, inode, cur->data, &next->data[next->filled]); if (err) return err; @@ -76,7 +76,6 @@ static int build_merkle_tree(struct file *filp, struct inode *inode = file_inode(filp); const u64 data_size = inode->i_size; const int num_levels = params->num_levels; - struct ahash_request *req; struct block_buffer _buffers[1 + FS_VERITY_MAX_LEVELS + 1] = {}; struct block_buffer *buffers = &_buffers[1]; unsigned long level_offset[FS_VERITY_MAX_LEVELS]; @@ -90,9 +89,6 @@ static int build_merkle_tree(struct file *filp, return 0; } - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(params->hash_alg, GFP_KERNEL); - /* * Allocate the block buffers. Buffer "-1" is for data blocks. * Buffers 0 <= level < num_levels are for the actual tree levels. @@ -130,7 +126,7 @@ static int build_merkle_tree(struct file *filp, fsverity_err(inode, "Short read of file data"); goto out; } - err = hash_one_block(inode, params, req, &buffers[-1]); + err = hash_one_block(inode, params, &buffers[-1]); if (err) goto out; for (level = 0; level < num_levels; level++) { @@ -141,8 +137,7 @@ static int build_merkle_tree(struct file *filp, } /* Next block at @level is full */ - err = hash_one_block(inode, params, req, - &buffers[level]); + err = hash_one_block(inode, params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -162,8 +157,7 @@ static int build_merkle_tree(struct file *filp, /* Finish all nonempty pending tree blocks. */ for (level = 0; level < num_levels; level++) { if (buffers[level].filled != 0) { - err = hash_one_block(inode, params, req, - &buffers[level]); + err = hash_one_block(inode, params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -183,7 +177,6 @@ static int build_merkle_tree(struct file *filp, out: for (level = -1; level < num_levels; level++) kfree(buffers[level].data); - fsverity_free_hash_request(params->hash_alg, req); return err; } @@ -215,7 +208,7 @@ static int enable_verity(struct file *filp, } desc->salt_size = arg->salt_size; - /* Get the signature if the user provided one */ + /* Get the builtin signature if the user provided one */ if (arg->sig_size && copy_from_user(desc->signature, u64_to_user_ptr(arg->sig_ptr), arg->sig_size)) { diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index d34dcc033d72..49bf3a1eb2a0 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -11,9 +11,6 @@ #define pr_fmt(fmt) "fs-verity: " fmt #include <linux/fsverity.h> -#include <linux/mempool.h> - -struct ahash_request; /* * Implementation limit: maximum depth of the Merkle tree. For now 8 is plenty; @@ -23,11 +20,10 @@ struct ahash_request; /* A hash algorithm supported by fs-verity */ struct fsverity_hash_alg { - struct crypto_ahash *tfm; /* hash tfm, allocated on demand */ + struct crypto_shash *tfm; /* hash tfm, allocated on demand */ const char *name; /* crypto API name, e.g. sha256 */ unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */ - mempool_t req_pool; /* mempool with a preallocated hash request */ /* * The HASH_ALGO_* constant for this algorithm. This is different from * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme. @@ -37,7 +33,7 @@ struct fsverity_hash_alg { /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ struct merkle_tree_params { - struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ + const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ const u8 *hashstate; /* initial hash state or NULL */ unsigned int digest_size; /* same as hash_alg->digest_size */ unsigned int block_size; /* size of data and tree blocks */ @@ -83,18 +79,13 @@ struct fsverity_info { extern struct fsverity_hash_alg fsverity_hash_algs[]; -struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, - unsigned int num); -struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg, - gfp_t gfp_flags); -void fsverity_free_hash_request(struct fsverity_hash_alg *alg, - struct ahash_request *req); -const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, +const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num); +const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size); int fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, struct ahash_request *req, - struct page *page, unsigned int offset, u8 *out); -int fsverity_hash_buffer(struct fsverity_hash_alg *alg, + const struct inode *inode, const void *data, u8 *out); +int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index ea00dbedf756..c598d2035476 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -8,7 +8,6 @@ #include "fsverity_private.h" #include <crypto/hash.h> -#include <linux/scatterlist.h> /* The hash algorithms supported by fs-verity */ struct fsverity_hash_alg fsverity_hash_algs[] = { @@ -40,11 +39,11 @@ static DEFINE_MUTEX(fsverity_hash_alg_init_mutex); * * Return: pointer to the hash alg on success, else an ERR_PTR() */ -struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, - unsigned int num) +const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num) { struct fsverity_hash_alg *alg; - struct crypto_ahash *tfm; + struct crypto_shash *tfm; int err; if (num >= ARRAY_SIZE(fsverity_hash_algs) || @@ -63,11 +62,7 @@ struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, if (alg->tfm != NULL) goto out_unlock; - /* - * Using the shash API would make things a bit simpler, but the ahash - * API is preferable as it allows the use of crypto accelerators. - */ - tfm = crypto_alloc_ahash(alg->name, 0, 0); + tfm = crypto_alloc_shash(alg->name, 0, 0); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) { fsverity_warn(inode, @@ -84,26 +79,20 @@ struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, } err = -EINVAL; - if (WARN_ON_ONCE(alg->digest_size != crypto_ahash_digestsize(tfm))) + if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm))) goto err_free_tfm; - if (WARN_ON_ONCE(alg->block_size != crypto_ahash_blocksize(tfm))) - goto err_free_tfm; - - err = mempool_init_kmalloc_pool(&alg->req_pool, 1, - sizeof(struct ahash_request) + - crypto_ahash_reqsize(tfm)); - if (err) + if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm))) goto err_free_tfm; pr_info("%s using implementation \"%s\"\n", - alg->name, crypto_ahash_driver_name(tfm)); + alg->name, crypto_shash_driver_name(tfm)); /* pairs with smp_load_acquire() above */ smp_store_release(&alg->tfm, tfm); goto out_unlock; err_free_tfm: - crypto_free_ahash(tfm); + crypto_free_shash(tfm); alg = ERR_PTR(err); out_unlock: mutex_unlock(&fsverity_hash_alg_init_mutex); @@ -111,42 +100,6 @@ out_unlock: } /** - * fsverity_alloc_hash_request() - allocate a hash request object - * @alg: the hash algorithm for which to allocate the request - * @gfp_flags: memory allocation flags - * - * This is mempool-backed, so this never fails if __GFP_DIRECT_RECLAIM is set in - * @gfp_flags. However, in that case this might need to wait for all - * previously-allocated requests to be freed. So to avoid deadlocks, callers - * must never need multiple requests at a time to make forward progress. - * - * Return: the request object on success; NULL on failure (but see above) - */ -struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg, - gfp_t gfp_flags) -{ - struct ahash_request *req = mempool_alloc(&alg->req_pool, gfp_flags); - - if (req) - ahash_request_set_tfm(req, alg->tfm); - return req; -} - -/** - * fsverity_free_hash_request() - free a hash request object - * @alg: the hash algorithm - * @req: the hash request object to free - */ -void fsverity_free_hash_request(struct fsverity_hash_alg *alg, - struct ahash_request *req) -{ - if (req) { - ahash_request_zero(req); - mempool_free(req, &alg->req_pool); - } -} - -/** * fsverity_prepare_hash_state() - precompute the initial hash state * @alg: hash algorithm * @salt: a salt which is to be prepended to all data to be hashed @@ -155,27 +108,24 @@ void fsverity_free_hash_request(struct fsverity_hash_alg *alg, * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed * initial hash state on success or an ERR_PTR() on failure. */ -const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, +const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size) { u8 *hashstate = NULL; - struct ahash_request *req = NULL; + SHASH_DESC_ON_STACK(desc, alg->tfm); u8 *padded_salt = NULL; size_t padded_salt_size; - struct scatterlist sg; - DECLARE_CRYPTO_WAIT(wait); int err; + desc->tfm = alg->tfm; + if (salt_size == 0) return NULL; - hashstate = kmalloc(crypto_ahash_statesize(alg->tfm), GFP_KERNEL); + hashstate = kmalloc(crypto_shash_statesize(alg->tfm), GFP_KERNEL); if (!hashstate) return ERR_PTR(-ENOMEM); - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(alg, GFP_KERNEL); - /* * Zero-pad the salt to the next multiple of the input size of the hash * algorithm's compression function, e.g. 64 bytes for SHA-256 or 128 @@ -190,26 +140,18 @@ const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, goto err_free; } memcpy(padded_salt, salt, salt_size); - - sg_init_one(&sg, padded_salt, padded_salt_size); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, NULL, padded_salt_size); - - err = crypto_wait_req(crypto_ahash_init(req), &wait); + err = crypto_shash_init(desc); if (err) goto err_free; - err = crypto_wait_req(crypto_ahash_update(req), &wait); + err = crypto_shash_update(desc, padded_salt, padded_salt_size); if (err) goto err_free; - err = crypto_ahash_export(req, hashstate); + err = crypto_shash_export(desc, hashstate); if (err) goto err_free; out: - fsverity_free_hash_request(alg, req); kfree(padded_salt); return hashstate; @@ -223,9 +165,7 @@ err_free: * fsverity_hash_block() - hash a single data or hash block * @params: the Merkle tree's parameters * @inode: inode for which the hashing is being done - * @req: preallocated hash request - * @page: the page containing the block to hash - * @offset: the offset of the block within @page + * @data: virtual address of a buffer containing the block to hash * @out: output digest, size 'params->digest_size' bytes * * Hash a single data or hash block. The hash is salted if a salt is specified @@ -234,33 +174,24 @@ err_free: * Return: 0 on success, -errno on failure */ int fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, struct ahash_request *req, - struct page *page, unsigned int offset, u8 *out) + const struct inode *inode, const void *data, u8 *out) { - struct scatterlist sg; - DECLARE_CRYPTO_WAIT(wait); + SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm); int err; - sg_init_table(&sg, 1); - sg_set_page(&sg, page, params->block_size, offset); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, out, params->block_size); + desc->tfm = params->hash_alg->tfm; if (params->hashstate) { - err = crypto_ahash_import(req, params->hashstate); + err = crypto_shash_import(desc, params->hashstate); if (err) { fsverity_err(inode, "Error %d importing hash state", err); return err; } - err = crypto_ahash_finup(req); + err = crypto_shash_finup(desc, data, params->block_size, out); } else { - err = crypto_ahash_digest(req); + err = crypto_shash_digest(desc, data, params->block_size, out); } - - err = crypto_wait_req(err, &wait); if (err) fsverity_err(inode, "Error %d computing block hash", err); return err; @@ -273,32 +204,12 @@ int fsverity_hash_block(const struct merkle_tree_params *params, * @size: size of data to hash, in bytes * @out: output digest, size 'alg->digest_size' bytes * - * Hash some data which is located in physically contiguous memory (i.e. memory - * allocated by kmalloc(), not by vmalloc()). No salt is used. - * * Return: 0 on success, -errno on failure */ -int fsverity_hash_buffer(struct fsverity_hash_alg *alg, +int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out) { - struct ahash_request *req; - struct scatterlist sg; - DECLARE_CRYPTO_WAIT(wait); - int err; - - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(alg, GFP_KERNEL); - - sg_init_one(&sg, data, size); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, out, size); - - err = crypto_wait_req(crypto_ahash_digest(req), &wait); - - fsverity_free_hash_request(alg, req); - return err; + return crypto_shash_tfm_digest(alg->tfm, data, size, out); } void __init fsverity_check_hash_algs(void) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 5c79ea1b2468..eec5956141da 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -61,27 +61,42 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); /** * fsverity_get_digest() - get a verity file's digest * @inode: inode to get digest of - * @digest: (out) pointer to the digest - * @alg: (out) pointer to the hash algorithm enumeration + * @raw_digest: (out) the raw file digest + * @alg: (out) the digest's algorithm, as a FS_VERITY_HASH_ALG_* value + * @halg: (out) the digest's algorithm, as a HASH_ALGO_* value * - * Return the file hash algorithm and digest of an fsverity protected file. - * Assumption: before calling this, the file must have been opened. + * Retrieves the fsverity digest of the given file. The file must have been + * opened at least once since the inode was last loaded into the inode cache; + * otherwise this function will not recognize when fsverity is enabled. * - * Return: 0 on success, -errno on failure + * The file's fsverity digest consists of @raw_digest in combination with either + * @alg or @halg. (The caller can choose which one of @alg or @halg to use.) + * + * IMPORTANT: Callers *must* make use of one of the two algorithm IDs, since + * @raw_digest is meaningless without knowing which algorithm it uses! fsverity + * provides no security guarantee for users who ignore the algorithm ID, even if + * they use the digest size (since algorithms can share the same digest size). + * + * Return: The size of the raw digest in bytes, or 0 if the file doesn't have + * fsverity enabled. */ int fsverity_get_digest(struct inode *inode, - u8 digest[FS_VERITY_MAX_DIGEST_SIZE], - enum hash_algo *alg) + u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], + u8 *alg, enum hash_algo *halg) { const struct fsverity_info *vi; const struct fsverity_hash_alg *hash_alg; vi = fsverity_get_info(inode); if (!vi) - return -ENODATA; /* not a verity file */ + return 0; /* not a verity file */ hash_alg = vi->tree_params.hash_alg; - memcpy(digest, vi->file_digest, hash_alg->digest_size); - *alg = hash_alg->algo_id; - return 0; + memcpy(raw_digest, vi->file_digest, hash_alg->digest_size); + if (alg) + *alg = hash_alg - fsverity_hash_algs; + if (halg) + *halg = hash_alg->algo_id; + return hash_alg->digest_size; } +EXPORT_SYMBOL_GPL(fsverity_get_digest); diff --git a/fs/verity/open.c b/fs/verity/open.c index 52048b7630dc..1db5106a9c38 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -32,7 +32,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, unsigned int log_blocksize, const u8 *salt, size_t salt_size) { - struct fsverity_hash_alg *hash_alg; + const struct fsverity_hash_alg *hash_alg; int err; u64 blocks; u64 blocks_in_level[FS_VERITY_MAX_LEVELS]; @@ -156,9 +156,9 @@ out_err: /* * Compute the file digest by hashing the fsverity_descriptor excluding the - * signature and with the sig_size field set to 0. + * builtin signature and with the sig_size field set to 0. */ -static int compute_file_digest(struct fsverity_hash_alg *hash_alg, +static int compute_file_digest(const struct fsverity_hash_alg *hash_alg, struct fsverity_descriptor *desc, u8 *file_digest) { @@ -174,7 +174,7 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg, /* * Create a new fsverity_info from the given fsverity_descriptor (with optional - * appended signature), and check the signature if present. The + * appended builtin signature), and check the signature if present. The * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, @@ -319,8 +319,8 @@ static bool validate_fsverity_descriptor(struct inode *inode, } /* - * Read the inode's fsverity_descriptor (with optional appended signature) from - * the filesystem, and do basic validation of it. + * Read the inode's fsverity_descriptor (with optional appended builtin + * signature) from the filesystem, and do basic validation of it. */ int fsverity_get_descriptor(struct inode *inode, struct fsverity_descriptor **desc_ret) diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 2aefc5565152..f58432772d9e 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -105,7 +105,7 @@ static int fsverity_read_descriptor(struct inode *inode, if (res) return res; - /* don't include the signature */ + /* don't include the builtin signature */ desc_size = offsetof(struct fsverity_descriptor, signature); desc->sig_size = 0; @@ -131,7 +131,7 @@ static int fsverity_read_signature(struct inode *inode, } /* - * Include only the signature. Note that fsverity_get_descriptor() + * Include only the builtin signature. fsverity_get_descriptor() * already verified that sig_size is in-bounds. */ res = fsverity_read_buffer(buf, offset, length, desc->signature, diff --git a/fs/verity/signature.c b/fs/verity/signature.c index b8c51ad40d3a..72034bc71c9d 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -5,6 +5,14 @@ * Copyright 2019 Google LLC */ +/* + * This file implements verification of fs-verity builtin signatures. Please + * take great care before using this feature. It is not the only way to do + * signatures with fs-verity, and the alternatives (such as userspace signature + * verification, and IMA appraisal) can be much better. For details about the + * limitations of this feature, see Documentation/filesystems/fsverity.rst. + */ + #include "fsverity_private.h" #include <linux/cred.h> diff --git a/fs/verity/verify.c b/fs/verity/verify.c index e2508222750b..433cef51f5f6 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -12,38 +12,6 @@ static struct workqueue_struct *fsverity_read_workqueue; -static inline int cmp_hashes(const struct fsverity_info *vi, - const u8 *want_hash, const u8 *real_hash, - u64 data_pos, int level) -{ - const unsigned int hsize = vi->tree_params.digest_size; - - if (memcmp(want_hash, real_hash, hsize) == 0) - return 0; - - fsverity_err(vi->inode, - "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", - data_pos, level, - vi->tree_params.hash_alg->name, hsize, want_hash, - vi->tree_params.hash_alg->name, hsize, real_hash); - return -EBADMSG; -} - -static bool data_is_zeroed(struct inode *inode, struct page *page, - unsigned int len, unsigned int offset) -{ - void *virt = kmap_local_page(page); - - if (memchr_inv(virt + offset, 0, len)) { - kunmap_local(virt); - fsverity_err(inode, - "FILE CORRUPTED! Data past EOF is not zeroed"); - return false; - } - kunmap_local(virt); - return true; -} - /* * Returns true if the hash block with index @hblock_idx in the tree, located in * @hpage, has already been verified. @@ -122,9 +90,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, */ static bool verify_data_block(struct inode *inode, struct fsverity_info *vi, - struct ahash_request *req, struct page *data_page, - u64 data_pos, unsigned int dblock_offset_in_page, - unsigned long max_ra_pages) + const void *data, u64 data_pos, unsigned long max_ra_pages) { const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; @@ -136,11 +102,11 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, struct { /* Page containing the hash block */ struct page *page; + /* Mapped address of the hash block (will be within @page) */ + const void *addr; /* Index of the hash block in the tree overall */ unsigned long index; - /* Byte offset of the hash block within @page */ - unsigned int offset_in_page; - /* Byte offset of the wanted hash within @page */ + /* Byte offset of the wanted hash relative to @addr */ unsigned int hoffset; } hblocks[FS_VERITY_MAX_LEVELS]; /* @@ -148,7 +114,9 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, * index of that block's hash within the current level. */ u64 hidx = data_pos >> params->log_blocksize; - int err; + + /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */ + BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX); if (unlikely(data_pos >= inode->i_size)) { /* @@ -159,8 +127,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, * any part past EOF should be all zeroes. Therefore, we need * to verify that any data blocks fully past EOF are all zeroes. */ - return data_is_zeroed(inode, data_page, params->block_size, - dblock_offset_in_page); + if (memchr_inv(data, 0, params->block_size)) { + fsverity_err(inode, + "FILE CORRUPTED! Data past EOF is not zeroed"); + return false; + } + return true; } /* @@ -175,6 +147,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, unsigned int hblock_offset_in_page; unsigned int hoffset; struct page *hpage; + const void *haddr; /* * The index of the block in the current level; also the index @@ -192,30 +165,30 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, hblock_offset_in_page = (hblock_idx << params->log_blocksize) & ~PAGE_MASK; - /* Byte offset of the hash within the page */ - hoffset = hblock_offset_in_page + - ((hidx << params->log_digestsize) & - (params->block_size - 1)); + /* Byte offset of the hash within the block */ + hoffset = (hidx << params->log_digestsize) & + (params->block_size - 1); hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, hpage_idx, level == 0 ? min(max_ra_pages, params->tree_pages - hpage_idx) : 0); if (IS_ERR(hpage)) { - err = PTR_ERR(hpage); fsverity_err(inode, - "Error %d reading Merkle tree page %lu", - err, hpage_idx); - goto out; + "Error %ld reading Merkle tree page %lu", + PTR_ERR(hpage), hpage_idx); + goto error; } + haddr = kmap_local_page(hpage) + hblock_offset_in_page; if (is_hash_block_verified(vi, hpage, hblock_idx)) { - memcpy_from_page(_want_hash, hpage, hoffset, hsize); + memcpy(_want_hash, haddr + hoffset, hsize); want_hash = _want_hash; + kunmap_local(haddr); put_page(hpage); goto descend; } hblocks[level].page = hpage; + hblocks[level].addr = haddr; hblocks[level].index = hblock_idx; - hblocks[level].offset_in_page = hblock_offset_in_page; hblocks[level].hoffset = hoffset; hidx = next_hidx; } @@ -225,18 +198,14 @@ descend: /* Descend the tree verifying hash blocks. */ for (; level > 0; level--) { struct page *hpage = hblocks[level - 1].page; + const void *haddr = hblocks[level - 1].addr; unsigned long hblock_idx = hblocks[level - 1].index; - unsigned int hblock_offset_in_page = - hblocks[level - 1].offset_in_page; unsigned int hoffset = hblocks[level - 1].hoffset; - err = fsverity_hash_block(params, inode, req, hpage, - hblock_offset_in_page, real_hash); - if (err) - goto out; - err = cmp_hashes(vi, want_hash, real_hash, data_pos, level - 1); - if (err) - goto out; + if (fsverity_hash_block(params, inode, haddr, real_hash) != 0) + goto error; + if (memcmp(want_hash, real_hash, hsize) != 0) + goto corrupted; /* * Mark the hash block as verified. This must be atomic and * idempotent, as the same hash block might be verified by @@ -246,29 +215,39 @@ descend: set_bit(hblock_idx, vi->hash_block_verified); else SetPageChecked(hpage); - memcpy_from_page(_want_hash, hpage, hoffset, hsize); + memcpy(_want_hash, haddr + hoffset, hsize); want_hash = _want_hash; + kunmap_local(haddr); put_page(hpage); } /* Finally, verify the data block. */ - err = fsverity_hash_block(params, inode, req, data_page, - dblock_offset_in_page, real_hash); - if (err) - goto out; - err = cmp_hashes(vi, want_hash, real_hash, data_pos, -1); -out: - for (; level > 0; level--) - put_page(hblocks[level - 1].page); + if (fsverity_hash_block(params, inode, data, real_hash) != 0) + goto error; + if (memcmp(want_hash, real_hash, hsize) != 0) + goto corrupted; + return true; - return err == 0; +corrupted: + fsverity_err(inode, + "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + data_pos, level - 1, + params->hash_alg->name, hsize, want_hash, + params->hash_alg->name, hsize, real_hash); +error: + for (; level > 0; level--) { + kunmap_local(hblocks[level - 1].addr); + put_page(hblocks[level - 1].page); + } + return false; } static bool -verify_data_blocks(struct inode *inode, struct fsverity_info *vi, - struct ahash_request *req, struct folio *data_folio, - size_t len, size_t offset, unsigned long max_ra_pages) +verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, + unsigned long max_ra_pages) { + struct inode *inode = data_folio->mapping->host; + struct fsverity_info *vi = inode->i_verity_info; const unsigned int block_size = vi->tree_params.block_size; u64 pos = (u64)data_folio->index << PAGE_SHIFT; @@ -278,11 +257,14 @@ verify_data_blocks(struct inode *inode, struct fsverity_info *vi, folio_test_uptodate(data_folio))) return false; do { - struct page *data_page = - folio_page(data_folio, offset >> PAGE_SHIFT); - - if (!verify_data_block(inode, vi, req, data_page, pos + offset, - offset & ~PAGE_MASK, max_ra_pages)) + void *data; + bool valid; + + data = kmap_local_folio(data_folio, offset); + valid = verify_data_block(inode, vi, data, pos + offset, + max_ra_pages); + kunmap_local(data); + if (!valid) return false; offset += block_size; len -= block_size; @@ -304,19 +286,7 @@ verify_data_blocks(struct inode *inode, struct fsverity_info *vi, */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { - struct inode *inode = folio->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; - struct ahash_request *req; - bool valid; - - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); - - valid = verify_data_blocks(inode, vi, req, folio, len, offset, 0); - - fsverity_free_hash_request(vi->tree_params.hash_alg, req); - - return valid; + return verify_data_blocks(folio, len, offset, 0); } EXPORT_SYMBOL_GPL(fsverity_verify_blocks); @@ -337,15 +307,9 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks); */ void fsverity_verify_bio(struct bio *bio) { - struct inode *inode = bio_first_page_all(bio)->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; - struct ahash_request *req; struct folio_iter fi; unsigned long max_ra_pages = 0; - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); - if (bio->bi_opf & REQ_RAHEAD) { /* * If this bio is for data readahead, then we also do readahead @@ -360,14 +324,12 @@ void fsverity_verify_bio(struct bio *bio) } bio_for_each_folio_all(fi, bio) { - if (!verify_data_blocks(inode, vi, req, fi.folio, fi.length, - fi.offset, max_ra_pages)) { + if (!verify_data_blocks(fi.folio, fi.length, fi.offset, + max_ra_pages)) { bio->bi_status = BLK_STS_IOERR; break; } } - - fsverity_free_hash_request(vi->tree_params.hash_alg, req); } EXPORT_SYMBOL_GPL(fsverity_verify_bio); #endif /* CONFIG_BLOCK */ diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 132f01d3461f..d11b08d5e4d1 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -181,7 +181,6 @@ const struct address_space_operations zonefs_file_aops = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, - .direct_IO = noop_direct_IO, .swap_activate = zonefs_swap_activate, }; @@ -342,6 +341,77 @@ static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek_size(file, offset, whence, isize, isize); } +struct zonefs_zone_append_bio { + /* The target inode of the BIO */ + struct inode *inode; + + /* For sync writes, the target append write offset */ + u64 append_offset; + + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire zonefs_bio but relies on bio being last. + */ + struct bio bio; +}; + +static inline struct zonefs_zone_append_bio * +zonefs_zone_append_bio(struct bio *bio) +{ + return container_of(bio, struct zonefs_zone_append_bio, bio); +} + +static void zonefs_file_zone_append_dio_bio_end_io(struct bio *bio) +{ + struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio); + struct zonefs_zone *z = zonefs_inode_zone(za_bio->inode); + sector_t za_sector; + + if (bio->bi_status != BLK_STS_OK) + goto bio_end; + + /* + * If the file zone was written underneath the file system, the zone + * append operation can still succedd (if the zone is not full) but + * the write append location will not be where we expect it to be. + * Check that we wrote where we intended to, that is, at z->z_wpoffset. + */ + za_sector = z->z_sector + (za_bio->append_offset >> SECTOR_SHIFT); + if (bio->bi_iter.bi_sector != za_sector) { + zonefs_warn(za_bio->inode->i_sb, + "Invalid write sector %llu for zone at %llu\n", + bio->bi_iter.bi_sector, z->z_sector); + bio->bi_status = BLK_STS_IOERR; + } + +bio_end: + iomap_dio_bio_end_io(bio); +} + +static void zonefs_file_zone_append_dio_submit_io(const struct iomap_iter *iter, + struct bio *bio, + loff_t file_offset) +{ + struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio); + struct inode *inode = iter->inode; + struct zonefs_zone *z = zonefs_inode_zone(inode); + + /* + * Issue a zone append BIO to process sync dio writes. The append + * file offset is saved to check the zone append write location + * on completion of the BIO. + */ + za_bio->inode = inode; + za_bio->append_offset = file_offset; + + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + bio->bi_iter.bi_sector = z->z_sector; + bio->bi_end_io = zonefs_file_zone_append_dio_bio_end_io; + + submit_bio(bio); +} + static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { @@ -372,93 +442,17 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, return 0; } -static const struct iomap_dio_ops zonefs_write_dio_ops = { - .end_io = zonefs_file_write_dio_end_io, -}; +static struct bio_set zonefs_zone_append_bio_set; -static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_zone *z = zonefs_inode_zone(inode); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int max = bdev_max_zone_append_sectors(bdev); - pgoff_t start, end; - struct bio *bio; - ssize_t size = 0; - int nr_pages; - ssize_t ret; - - max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); - iov_iter_truncate(from, max); - - /* - * If the inode block size (zone write granularity) is smaller than the - * page size, we may be appending data belonging to the last page of the - * inode straddling inode->i_size, with that page already cached due to - * a buffered read or readahead. So make sure to invalidate that page. - * This will always be a no-op for the case where the block size is - * equal to the page size. - */ - start = iocb->ki_pos >> PAGE_SHIFT; - end = (iocb->ki_pos + iov_iter_count(from) - 1) >> PAGE_SHIFT; - if (invalidate_inode_pages2_range(inode->i_mapping, start, end)) - return -EBUSY; - - nr_pages = iov_iter_npages(from, BIO_MAX_VECS); - if (!nr_pages) - return 0; - - bio = bio_alloc(bdev, nr_pages, - REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); - bio->bi_iter.bi_sector = z->z_sector; - bio->bi_ioprio = iocb->ki_ioprio; - if (iocb_is_dsync(iocb)) - bio->bi_opf |= REQ_FUA; - - ret = bio_iov_iter_get_pages(bio, from); - if (unlikely(ret)) - goto out_release; - - size = bio->bi_iter.bi_size; - task_io_account_write(size); - - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(bio, iocb); - - ret = submit_bio_wait(bio); - - /* - * If the file zone was written underneath the file system, the zone - * write pointer may not be where we expect it to be, but the zone - * append write can still succeed. So check manually that we wrote where - * we intended to, that is, at zi->i_wpoffset. - */ - if (!ret) { - sector_t wpsector = - z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT); - - if (bio->bi_iter.bi_sector != wpsector) { - zonefs_warn(inode->i_sb, - "Corrupted write pointer %llu for zone at %llu\n", - bio->bi_iter.bi_sector, z->z_sector); - ret = -EIO; - } - } - - zonefs_file_write_dio_end_io(iocb, size, ret, 0); - trace_zonefs_file_dio_append(inode, size, ret); - -out_release: - bio_release_pages(bio, false); - bio_put(bio); - - if (ret >= 0) { - iocb->ki_pos += size; - return size; - } +static const struct iomap_dio_ops zonefs_zone_append_dio_ops = { + .submit_io = zonefs_file_zone_append_dio_submit_io, + .end_io = zonefs_file_write_dio_end_io, + .bio_set = &zonefs_zone_append_bio_set, +}; - return ret; -} +static const struct iomap_dio_ops zonefs_write_dio_ops = { + .end_io = zonefs_file_write_dio_end_io, +}; /* * Do not exceed the LFS limits nor the file zone size. If pos is under the @@ -539,6 +533,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) struct zonefs_inode_info *zi = ZONEFS_I(inode); struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; + const struct iomap_dio_ops *dio_ops; bool sync = is_sync_kiocb(iocb); bool append = false; ssize_t ret, count; @@ -582,20 +577,26 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) } if (append) { - ret = zonefs_file_dio_append(iocb, from); + unsigned int max = bdev_max_zone_append_sectors(sb->s_bdev); + + max = ALIGN_DOWN(max << SECTOR_SHIFT, sb->s_blocksize); + iov_iter_truncate(from, max); + + dio_ops = &zonefs_zone_append_dio_ops; } else { - /* - * iomap_dio_rw() may return ENOTBLK if there was an issue with - * page invalidation. Overwrite that error code with EBUSY to - * be consistent with zonefs_file_dio_append() return value for - * similar issues. - */ - ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, - &zonefs_write_dio_ops, 0, NULL, 0); - if (ret == -ENOTBLK) - ret = -EBUSY; + dio_ops = &zonefs_write_dio_ops; } + /* + * iomap_dio_rw() may return ENOTBLK if there was an issue with + * page invalidation. Overwrite that error code with EBUSY so that + * the user can make sense of the error. + */ + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, + dio_ops, 0, NULL, 0); + if (ret == -ENOTBLK) + ret = -EBUSY; + if (zonefs_zone_is_seq(z) && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) @@ -813,6 +814,7 @@ static int zonefs_file_open(struct inode *inode, struct file *file) { int ret; + file->f_mode |= FMODE_CAN_ODIRECT; ret = generic_file_open(inode, file); if (ret) return ret; @@ -900,3 +902,15 @@ const struct file_operations zonefs_file_operations = { .splice_write = iter_file_splice_write, .iopoll = iocb_bio_iopoll, }; + +int zonefs_file_bioset_init(void) +{ + return bioset_init(&zonefs_zone_append_bio_set, BIO_POOL_SIZE, + offsetof(struct zonefs_zone_append_bio, bio), + BIOSET_NEED_BVECS); +} + +void zonefs_file_bioset_exit(void) +{ + bioset_exit(&zonefs_zone_append_bio_set); +} diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 23b8b299c64e..56c00111966a 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1412,10 +1412,14 @@ static int __init zonefs_init(void) BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE); - ret = zonefs_init_inodecache(); + ret = zonefs_file_bioset_init(); if (ret) return ret; + ret = zonefs_init_inodecache(); + if (ret) + goto destroy_bioset; + ret = zonefs_sysfs_init(); if (ret) goto destroy_inodecache; @@ -1430,6 +1434,8 @@ sysfs_exit: zonefs_sysfs_exit(); destroy_inodecache: zonefs_destroy_inodecache(); +destroy_bioset: + zonefs_file_bioset_exit(); return ret; } @@ -1439,6 +1445,7 @@ static void __exit zonefs_exit(void) unregister_filesystem(&zonefs_type); zonefs_sysfs_exit(); zonefs_destroy_inodecache(); + zonefs_file_bioset_exit(); } MODULE_AUTHOR("Damien Le Moal"); diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 8175652241b5..f663b8ebc2cb 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -279,6 +279,8 @@ extern const struct file_operations zonefs_dir_operations; extern const struct address_space_operations zonefs_file_aops; extern const struct file_operations zonefs_file_operations; int zonefs_file_truncate(struct inode *inode, loff_t isize); +int zonefs_file_bioset_init(void); +void zonefs_file_bioset_exit(void); /* In sysfs.c */ int zonefs_sysfs_register(struct super_block *sb); diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index e6098a08c914..9ffdc0425bc2 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -761,6 +761,7 @@ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_event_status *event_status)) ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_dispatch_gpe(acpi_handle gpe_device, u32 gpe_number)) +ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_hw_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_runtime_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_wakeup_gpes(void)) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index cebdf1ca415d..da9e5629ea43 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -839,6 +839,9 @@ #ifdef CONFIG_UNWINDER_ORC #define ORC_UNWIND_TABLE \ + .orc_header : AT(ADDR(.orc_header) - LOAD_OFFSET) { \ + BOUNDED_SECTION_BY(.orc_header, _orc_header) \ + } \ . = ALIGN(4); \ .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.orc_unwind_ip, _orc_unwind_ip) \ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0f1001dca0e0..3ceb9dfa0993 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -200,6 +200,7 @@ enum cpuhp_state { /* Online section invoked on the hotplugged CPU from the hotplug thread */ CPUHP_AP_ONLINE_IDLE, + CPUHP_AP_HYPERV_ONLINE, CPUHP_AP_KVM_ONLINE, CPUHP_AP_SCHED_WAIT_EMPTY, CPUHP_AP_SMPBOOT_THREADS, diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 36a486505b08..b9d83652c097 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -9,12 +9,12 @@ #ifndef _LINUX_EVENTFD_H #define _LINUX_EVENTFD_H -#include <linux/fcntl.h> #include <linux/wait.h> #include <linux/err.h> #include <linux/percpu-defs.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <uapi/linux/eventfd.h> /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining @@ -23,10 +23,6 @@ * from eventfd, in order to leave a free define-space for * shared O_* flags. */ -#define EFD_SEMAPHORE (1 << 0) -#define EFD_CLOEXEC O_CLOEXEC -#define EFD_NONBLOCK O_NONBLOCK - #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE) @@ -40,7 +36,7 @@ struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask); +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); diff --git a/include/linux/fs.h b/include/linux/fs.h index 133f0640fb24..4ca804f2b3ed 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -171,6 +171,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports non-exclusive O_DIRECT writes from multiple threads */ #define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000) +/* File is embedded in backing_file object */ +#define FMODE_BACKING ((__force fmode_t)0x2000000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) @@ -956,29 +959,35 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) index < ra->start + ra->size); } +/* + * f_{lock,count,pos_lock} members can be highly contended and share + * the same cacheline. f_{lock,mode} are very frequently used together + * and so share the same cacheline as well. The read-mostly + * f_{path,inode,op} are kept on a separate cacheline. + */ struct file { union { struct llist_node f_llist; struct rcu_head f_rcuhead; unsigned int f_iocb_flags; }; - struct path f_path; - struct inode *f_inode; /* cached value */ - const struct file_operations *f_op; /* * Protects f_ep, f_flags. * Must not be taken from IRQ context. */ spinlock_t f_lock; - atomic_long_t f_count; - unsigned int f_flags; fmode_t f_mode; + atomic_long_t f_count; struct mutex f_pos_lock; loff_t f_pos; + unsigned int f_flags; struct fown_struct f_owner; const struct cred *f_cred; struct file_ra_state f_ra; + struct path f_path; + struct inode *f_inode; /* cached value */ + const struct file_operations *f_op; u64 f_version; #ifdef CONFIG_SECURITY @@ -1242,7 +1251,7 @@ struct super_block { */ atomic_long_t s_fsnotify_connectors; - /* Being remounted read-only */ + /* Read-only state of the superblock is being changed */ int s_readonly_remount; /* per-sb errseq_t for reporting writeback errors via syncfs */ @@ -1672,9 +1681,12 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap, WHITEOUT_DEV); } -struct file *vfs_tmpfile_open(struct mnt_idmap *idmap, - const struct path *parentpath, - umode_t mode, int open_flag, const struct cred *cred); +struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, + const struct path *parentpath, + umode_t mode, int open_flag, + const struct cred *cred); +struct file *kernel_file_open(const struct path *path, int flags, + struct inode *inode, const struct cred *cred); int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), @@ -2349,11 +2361,31 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt, return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, name, flags, mode); } -extern struct file * dentry_open(const struct path *, int, const struct cred *); -extern struct file *dentry_create(const struct path *path, int flags, - umode_t mode, const struct cred *cred); -extern struct file * open_with_fake_path(const struct path *, int, - struct inode*, const struct cred *); +struct file *dentry_open(const struct path *path, int flags, + const struct cred *creds); +struct file *dentry_create(const struct path *path, int flags, umode_t mode, + const struct cred *cred); +struct file *backing_file_open(const struct path *path, int flags, + const struct path *real_path, + const struct cred *cred); +struct path *backing_file_real_path(struct file *f); + +/* + * file_real_path - get the path corresponding to f_inode + * + * When opening a backing file for a stackable filesystem (e.g., + * overlayfs) f_path may be on the stackable filesystem and f_inode on + * the underlying filesystem. When the path associated with f_inode is + * needed, this helper should be used instead of accessing f_path + * directly. +*/ +static inline const struct path *file_real_path(struct file *f) +{ + if (unlikely(f->f_mode & FMODE_BACKING)) + return backing_file_real_path(f); + return &f->f_path; +} + static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); @@ -2669,7 +2701,7 @@ extern void evict_inodes(struct super_block *sb); void dump_mapping(const struct address_space *); /* - * Userspace may rely on the the inode number being non-zero. For example, glibc + * Userspace may rely on the inode number being non-zero. For example, glibc * simply ignores files with zero i_ino in unlink() and other places. * * As an additional complication, if userspace was compiled with diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index bb8467cd11ae..ed48e4f1e755 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -91,11 +91,13 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) static inline int fsnotify_file(struct file *file, __u32 mask) { - const struct path *path = &file->f_path; + const struct path *path; if (file->f_mode & FMODE_NONOTIFY) return 0; + /* Overlayfs internal files have fake f_path */ + path = file_real_path(file); return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index e76605d5b36e..1eb7eae580be 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -143,8 +143,8 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *arg); int fsverity_ioctl_measure(struct file *filp, void __user *arg); int fsverity_get_digest(struct inode *inode, - u8 digest[FS_VERITY_MAX_DIGEST_SIZE], - enum hash_algo *alg); + u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], + u8 *alg, enum hash_algo *halg); /* open.c */ @@ -197,10 +197,14 @@ static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg) } static inline int fsverity_get_digest(struct inode *inode, - u8 digest[FS_VERITY_MAX_DIGEST_SIZE], - enum hash_algo *alg) + u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], + u8 *alg, enum hash_algo *halg) { - return -EOPNOTSUPP; + /* + * fsverity is not enabled in the kernel configuration, so always report + * that the file doesn't have fsverity enabled (digest size 0). + */ + return 0; } /* open.c */ diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 5c6db5533be6..67b8774eed8f 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -252,6 +252,14 @@ struct gpio_irq_chip { bool initialized; /** + * @domain_is_allocated_externally: + * + * True it the irq_domain was allocated outside of gpiolib, in which + * case gpiolib won't free the irq_domain itself. + */ + bool domain_is_allocated_externally; + + /** * @init_hw: optional routine to initialize hardware before * an IRQ chip will be added. This is quite useful when * a particular driver wants to clear IRQ related registers diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 4de1dbcd3ef6..68da30625a6c 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -507,7 +507,7 @@ static inline void folio_zero_range(struct folio *folio, zero_user_segments(&folio->page, start, start + length, 0, 0); } -static inline void put_and_unmap_page(struct page *page, void *addr) +static inline void unmap_and_put_page(struct page *page, void *addr) { kunmap_local(addr); put_page(page); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d5628a7b5eaa..c8dcfdbda1f4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1845,9 +1845,9 @@ int perf_event_exit_cpu(unsigned int cpu); #define perf_event_exit_cpu NULL #endif -extern void __weak arch_perf_update_userpage(struct perf_event *event, - struct perf_event_mmap_page *userpg, - u64 now); +extern void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, + u64 now); #ifdef CONFIG_MMU extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index d2c3f16cf6b1..02e0086b10f6 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -261,18 +261,14 @@ void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; -#ifdef CONFIG_WATCH_QUEUE unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new); bool too_many_pipe_buffers_soft(unsigned long user_bufs); bool too_many_pipe_buffers_hard(unsigned long user_bufs); bool pipe_is_unprivileged_user(void); -#endif /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ -#ifdef CONFIG_WATCH_QUEUE int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots); -#endif long pipe_fcntl(struct file *, unsigned int, unsigned long arg); struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice); diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 0260f5ea98fe..253f2676d93a 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -158,6 +158,8 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); #endif /* CONFIG_PROC_PID_ARCH_STATUS */ +void arch_report_meminfo(struct seq_file *m); + #else /* CONFIG_PROC_FS */ static inline void proc_root_init(void) diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index 3c01c2bf84f5..505c908dbb81 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -196,11 +196,11 @@ enum { /* PCA9450_REG_LDO3_VOLT bits */ #define LDO3_EN_MASK 0xC0 -#define LDO3OUT_MASK 0x0F +#define LDO3OUT_MASK 0x1F /* PCA9450_REG_LDO4_VOLT bits */ #define LDO4_EN_MASK 0xC0 -#define LDO4OUT_MASK 0x0F +#define LDO4OUT_MASK 0x1F /* PCA9450_REG_LDO5_VOLT bits */ #define LDO5L_EN_MASK 0xC0 diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 762d7231e574..3b10636c51a9 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -509,6 +509,27 @@ static inline void svcxdr_init_encode(struct svc_rqst *rqstp) } /** + * svcxdr_encode_opaque_pages - Insert pages into an xdr_stream + * @xdr: xdr_stream to be updated + * @pages: array of pages to insert + * @base: starting offset of first data byte in @pages + * @len: number of data bytes in @pages to insert + * + * After the @pages are added, the tail iovec is instantiated pointing + * to end of the head buffer, and the stream is set up to encode + * subsequent items into the tail. + */ +static inline void svcxdr_encode_opaque_pages(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct page **pages, + unsigned int base, + unsigned int len) +{ + xdr_write_pages(xdr, pages, base, len); + xdr->page_ptr = rqstp->rq_next_page - 1; +} + +/** * svcxdr_set_auth_slack - * @rqstp: RPC transaction * @slack: buffer space to reserve for the transaction's security flavor diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index fbc4bd423b35..a5ee0af2a310 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -135,7 +135,6 @@ struct svc_rdma_recv_ctxt { struct ib_sge rc_recv_sge; void *rc_recv_buf; struct xdr_stream rc_stream; - bool rc_temp; u32 rc_byte_len; unsigned int rc_page_count; u32 rc_inv_rkey; @@ -155,12 +154,12 @@ struct svc_rdma_send_ctxt { struct ib_send_wr sc_send_wr; struct ib_cqe sc_cqe; - struct completion sc_done; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; void *sc_xprt_buf; + int sc_page_count; int sc_cur_sge_no; - + struct page *sc_pages[RPCSVC_MAXPAGES]; struct ib_sge sc_sges[]; }; diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 72014c9216fc..f89ec4b5ea16 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -242,8 +242,7 @@ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, struct page **pages, struct rpc_rqst *rqst); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); -extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, - size_t nbytes); +extern int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes); extern void __xdr_commit_encode(struct xdr_stream *xdr); extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); extern void xdr_truncate_decode(struct xdr_stream *xdr, size_t len); diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h index fc6bba20273b..45cd42f55d49 100644 --- a/include/linux/watch_queue.h +++ b/include/linux/watch_queue.h @@ -38,7 +38,7 @@ struct watch_filter { struct watch_queue { struct rcu_head rcu; struct watch_filter __rcu *filter; - struct pipe_inode_info *pipe; /* The pipe we're using as a buffer */ + struct pipe_inode_info *pipe; /* Pipe we use as a buffer, NULL if queue closed */ struct hlist_head watches; /* Contributory watches */ struct page **notes; /* Preallocated notifications */ unsigned long *notes_bitmap; /* Allocation bitmap for notes */ @@ -46,7 +46,6 @@ struct watch_queue { spinlock_t lock; unsigned int nr_notes; /* Number of notes */ unsigned int nr_pages; /* Number of pages in notes[] */ - bool defunct; /* T when queues closed */ }; /* diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 3992c994787f..683efe29fa69 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -68,7 +68,6 @@ enum { WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, - WORK_OFFQ_CANCELING = (1 << __WORK_OFFQ_CANCELING), /* * When a work item is off queue, its high bits point to the last @@ -79,12 +78,6 @@ enum { WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, - WORK_OFFQ_POOL_NONE = (1LU << WORK_OFFQ_POOL_BITS) - 1, - - /* convenience constants */ - WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, - WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, - WORK_STRUCT_NO_POOL = (unsigned long)WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, @@ -94,6 +87,14 @@ enum { WORKER_DESC_LEN = 24, }; +/* Convenience constants - of type 'unsigned long', not 'enum'! */ +#define WORK_OFFQ_CANCELING (1ul << __WORK_OFFQ_CANCELING) +#define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) +#define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) + +#define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1) +#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) + struct work_struct { atomic_long_t data; struct list_head entry; diff --git a/include/net/dsa.h b/include/net/dsa.h index 8903053fa5aa..ab0f0a5b0860 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -959,6 +959,14 @@ struct dsa_switch_ops { void (*port_disable)(struct dsa_switch *ds, int port); /* + * Compatibility between device trees defining multiple CPU ports and + * drivers which are not OK to use by default the numerically smallest + * CPU port of a switch for its local ports. This can return NULL, + * meaning "don't know/don't care". + */ + struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); + + /* * Port's MAC EEE settings */ int (*set_mac_eee)(struct dsa_switch *ds, int port, diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 83db182decc8..ee47d7143d99 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -472,7 +472,8 @@ struct nft_set_ops { int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); - void (*destroy)(const struct nft_set *set); + void (*destroy)(const struct nft_ctx *ctx, + const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; @@ -809,6 +810,8 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr); +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem); /** * struct nft_set_gc_batch_head - nf_tables set garbage collection batch @@ -901,6 +904,7 @@ struct nft_expr_type { enum nft_trans_phase { NFT_TRANS_PREPARE, + NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE @@ -1009,7 +1013,10 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) return (void *)&rule->data[rule->dlen]; } -void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule); +void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase); +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, @@ -1104,6 +1111,8 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_set_elem *elem); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, @@ -1140,11 +1149,17 @@ int nft_chain_validate_dependency(const struct nft_chain *chain, int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); +static inline bool nft_chain_binding(const struct nft_chain *chain) +{ + return chain->flags & NFT_CHAIN_BINDING; +} + static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } +int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_ctx *ctx); @@ -1558,6 +1573,7 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) * struct nft_trans - nf_tables object update in transaction * * @list: used internally + * @binding_list: list of objects with possible bindings * @msg_type: message type * @put_net: ctx->net needs to be put * @ctx: transaction context @@ -1565,6 +1581,7 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) */ struct nft_trans { struct list_head list; + struct list_head binding_list; int msg_type; bool put_net; struct nft_ctx ctx; @@ -1575,6 +1592,7 @@ struct nft_trans_rule { struct nft_rule *rule; struct nft_flow_rule *flow; u32 rule_id; + bool bound; }; #define nft_trans_rule(trans) \ @@ -1583,6 +1601,8 @@ struct nft_trans_rule { (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) +#define nft_trans_rule_bound(trans) \ + (((struct nft_trans_rule *)trans->data)->bound) struct nft_trans_set { struct nft_set *set; @@ -1607,15 +1627,19 @@ struct nft_trans_set { (((struct nft_trans_set *)trans->data)->gc_int) struct nft_trans_chain { + struct nft_chain *chain; bool update; char *name; struct nft_stats __percpu *stats; u8 policy; + bool bound; u32 chain_id; struct nft_base_chain *basechain; struct list_head hook_list; }; +#define nft_trans_chain(trans) \ + (((struct nft_trans_chain *)trans->data)->chain) #define nft_trans_chain_update(trans) \ (((struct nft_trans_chain *)trans->data)->update) #define nft_trans_chain_name(trans) \ @@ -1624,6 +1648,8 @@ struct nft_trans_chain { (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) +#define nft_trans_chain_bound(trans) \ + (((struct nft_trans_chain *)trans->data)->bound) #define nft_trans_chain_id(trans) \ (((struct nft_trans_chain *)trans->data)->chain_id) #define nft_trans_basechain(trans) \ @@ -1700,6 +1726,7 @@ static inline int nft_request_module(struct net *net, const char *fmt, ...) { re struct nftables_pernet { struct list_head tables; struct list_head commit_list; + struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 33ee3f5936e6..151ca95dd08d 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1054,6 +1054,7 @@ struct xfrm_offload { struct sec_path { int len; int olen; + int verified_cnt; struct xfrm_state *xvec[XFRM_MAX_DEPTH]; struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 8f461e04e5f0..f8069ef2ee0f 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -2112,6 +2112,14 @@ DEFINE_POST_CHUNK_EVENT(read); DEFINE_POST_CHUNK_EVENT(write); DEFINE_POST_CHUNK_EVENT(reply); +DEFINE_EVENT(svcrdma_post_chunk_class, svcrdma_cc_release, + TP_PROTO( + const struct rpc_rdma_cid *cid, + int sqecount + ), + TP_ARGS(cid, sqecount) +); + TRACE_EVENT(svcrdma_wc_read, TP_PROTO( const struct ib_wc *wc, diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 31bc7025cb44..69e42ef30979 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -2104,31 +2104,46 @@ DEFINE_SVC_DEFERRED_EVENT(drop); DEFINE_SVC_DEFERRED_EVENT(queue); DEFINE_SVC_DEFERRED_EVENT(recv); -TRACE_EVENT(svcsock_new_socket, +DECLARE_EVENT_CLASS(svcsock_lifetime_class, TP_PROTO( + const void *svsk, const struct socket *socket ), - - TP_ARGS(socket), - + TP_ARGS(svsk, socket), TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(const void *, svsk) + __field(const void *, sk) __field(unsigned long, type) __field(unsigned long, family) - __field(bool, listener) + __field(unsigned long, state) ), - TP_fast_assign( + struct sock *sk = socket->sk; + + __entry->netns_ino = sock_net(sk)->ns.inum; + __entry->svsk = svsk; + __entry->sk = sk; __entry->type = socket->type; - __entry->family = socket->sk->sk_family; - __entry->listener = (socket->sk->sk_state == TCP_LISTEN); + __entry->family = sk->sk_family; + __entry->state = sk->sk_state; ), - - TP_printk("type=%s family=%s%s", - show_socket_type(__entry->type), + TP_printk("svsk=%p type=%s family=%s%s", + __entry->svsk, show_socket_type(__entry->type), rpc_show_address_family(__entry->family), - __entry->listener ? " (listener)" : "" + __entry->state == TCP_LISTEN ? " (listener)" : "" ) ); +#define DEFINE_SVCSOCK_LIFETIME_EVENT(name) \ + DEFINE_EVENT(svcsock_lifetime_class, name, \ + TP_PROTO( \ + const void *svsk, \ + const struct socket *socket \ + ), \ + TP_ARGS(svsk, socket)) + +DEFINE_SVCSOCK_LIFETIME_EVENT(svcsock_new); +DEFINE_SVCSOCK_LIFETIME_EVENT(svcsock_free); TRACE_EVENT(svcsock_marker, TP_PROTO( diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 86b2a82da546..54e353c9f919 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(writeback_folio_template, strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); - __entry->ino = mapping ? mapping->host->i_ino : 0; + __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = folio->index; ), diff --git a/include/uapi/linux/eventfd.h b/include/uapi/linux/eventfd.h new file mode 100644 index 000000000000..2eb9ab6c32f3 --- /dev/null +++ b/include/uapi/linux/eventfd.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_EVENTFD_H +#define _UAPI_LINUX_EVENTFD_H + +#include <linux/fcntl.h> + +#define EFD_SEMAPHORE (1 << 0) +#define EFD_CLOEXEC O_CLOEXEC +#define EFD_NONBLOCK O_NONBLOCK + +#endif /* _UAPI_LINUX_EVENTFD_H */ diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 4d93967f8aea..8eb0d7b758d2 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -74,7 +74,8 @@ #define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ #define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ #define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ -#define MOVE_MOUNT__MASK 0x00000177 +#define MOVE_MOUNT_BENEATH 0x00000200 /* Mount beneath top mount */ +#define MOVE_MOUNT__MASK 0x00000377 /* * fsopen() flags. diff --git a/init/do_mounts.c b/init/do_mounts.c index 811e94daf0a8..d67efddf8597 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -338,7 +338,7 @@ __setup("rootfstype=", fs_names_setup); __setup("rootdelay=", root_delay_setup); /* This can return zero length strings. Caller should check */ -static int __init split_fs_names(char *page, size_t size, char *names) +static int __init split_fs_names(char *page, size_t size) { int count = 1; char *p = page; @@ -402,7 +402,7 @@ void __init mount_block_root(char *name, int flags) scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)", MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); if (root_fs_names) - num_fs = split_fs_names(fs_names, PAGE_SIZE, root_fs_names); + num_fs = split_fs_names(fs_names, PAGE_SIZE); else num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE); retry: @@ -545,7 +545,7 @@ static int __init mount_nodev_root(void) fs_names = (void *)__get_free_page(GFP_KERNEL); if (!fs_names) return -EINVAL; - num_fs = split_fs_names(fs_names, PAGE_SIZE, root_fs_names); + num_fs = split_fs_names(fs_names, PAGE_SIZE); for (i = 0, fstype = fs_names; i < num_fs; i++, fstype += strlen(fstype) + 1) { diff --git a/io_uring/net.c b/io_uring/net.c index 51b0f7fbb4f5..c8a4b2ac00f7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -203,7 +203,7 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, &iomsg->free_iov); /* save msg_control as sys_sendmsg() overwrites it */ - sr->msg_control = iomsg->msg.msg_control; + sr->msg_control = iomsg->msg.msg_control_user; return ret; } @@ -302,7 +302,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = req->async_data; - kmsg->msg.msg_control = sr->msg_control; + kmsg->msg.msg_control_user = sr->msg_control; } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) @@ -326,6 +326,8 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return io_setup_async_msg(req, kmsg, issue_flags); if (ret > 0 && io_net_retry(sock, flags)) { + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_control = NULL; sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_msg(req, kmsg, issue_flags); @@ -787,16 +789,19 @@ retry_multishot: flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); kmsg->msg.msg_get_inq = 1; - if (req->flags & REQ_F_APOLL_MULTISHOT) + if (req->flags & REQ_F_APOLL_MULTISHOT) { ret = io_recvmsg_multishot(sock, sr, kmsg, flags, &mshot_finished); - else + } else { + /* disable partial retry for recvmsg with cmsg attached */ + if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + } if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { diff --git a/io_uring/poll.c b/io_uring/poll.c index c90e47dc1e29..a78b8af7d9ab 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -977,8 +977,9 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; - struct io_tw_state ts = {}; + struct io_tw_state ts = { .locked = true }; + io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); ret2 = io_poll_disarm(preq); if (bucket) @@ -990,12 +991,10 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) goto out; } - io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket); ret2 = io_poll_disarm(preq); if (bucket) spin_unlock(&bucket->lock); - io_ring_submit_unlock(ctx, issue_flags); if (ret2) { ret = ret2; goto out; @@ -1019,7 +1018,7 @@ found: if (poll_update->update_user_data) preq->cqe.user_data = poll_update->new_user_data; - ret2 = io_poll_add(preq, issue_flags); + ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED); /* successfully updated, don't complete poll request */ if (!ret2 || ret2 == -EIOCBQUEUED) goto out; @@ -1027,9 +1026,9 @@ found: req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); - ts.locked = !(issue_flags & IO_URING_F_UNLOCKED); io_req_task_complete(preq, &ts); out: + io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) { req_set_fail(req); return ret; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b682b8e4b50..72b32b7cd9cd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -744,13 +744,12 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) return offset < btf->hdr.str_len; } -static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +static bool __btf_name_char_ok(char c, bool first) { if ((first ? !isalpha(c) : !isalnum(c)) && c != '_' && - ((c == '.' && !dot_ok) || - c != '.')) + c != '.') return false; return true; } @@ -767,20 +766,20 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset) return NULL; } -static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) +static bool __btf_name_valid(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; - if (!__btf_name_char_ok(*src, true, dot_ok)) + if (!__btf_name_char_ok(*src, true)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!__btf_name_char_ok(*src, false, dot_ok)) + if (!__btf_name_char_ok(*src, false)) return false; src++; } @@ -788,17 +787,14 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) return !*src; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, false); + return __btf_name_valid(btf, offset); } static bool btf_name_valid_section(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, true); + return __btf_name_valid(btf, offset); } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) @@ -4422,7 +4418,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env, } if (!t->name_off || - !__btf_name_valid(env->btf, t->name_off, true)) { + !__btf_name_valid(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c21d0d8efe4..f1c8733f76b8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3440,6 +3440,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; + case BPF_PROG_TYPE_KPROBE: + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + return 0; default: return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5871aa78d01a..cf5f230360f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3868,6 +3868,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; } save_register_state(state, spi, reg, size); + /* Break the relation on a narrowing spill. */ + if (fls64(reg->umax_value) > BITS_PER_BYTE * size) + state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && insn->imm != 0 && env->bpf_capable) { struct bpf_reg_state fake_reg = {}; @@ -17214,9 +17217,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) } /* finally lock prog and jit images for all functions and - * populate kallsysm + * populate kallsysm. Begin at the first subprogram, since + * bpf_prog_load will add the kallsyms for the main program. */ - for (i = 0; i < env->subprog_cnt; i++) { + for (i = 1; i < env->subprog_cnt; i++) { bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } @@ -17242,6 +17246,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->jited_len = func[0]->jited_len; + prog->aux->extable = func[0]->aux->extable; + prog->aux->num_exentries = func[0]->aux->num_exentries; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; bpf_prog_jit_attempt_done(prog); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 245cf62ce85a..4d42f0cbc11e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1798,7 +1798,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - int ssid, i, ret; + int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1842,7 +1842,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); - struct css_set *cset; + struct css_set *cset, *cset_pos; + struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); @@ -1860,9 +1861,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) + WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); + list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], + e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); + /* + * all css_sets of scgrp together in same order to dcgrp, + * patch in-flight iterators to preserve correct iteration. + * since the iterator is always advanced right away and + * finished when it->cset_pos meets it->cset_head, so only + * update it->cset_head is enough here. + */ + list_for_each_entry(it, &cset->task_iters, iters_node) + if (it->cset_head == &scgrp->e_csets[ss->id]) + it->cset_head = &dcgrp->e_csets[ss->id]; + } spin_unlock_irq(&css_set_lock); if (ss->css_rstat_flush) { diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 936473203a6b..122dacb3a443 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -108,16 +108,18 @@ static int freezer_css_online(struct cgroup_subsys_state *css) struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); + cpus_read_lock(); mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - static_branch_inc(&freezer_active); + static_branch_inc_cpuslocked(&freezer_active); } mutex_unlock(&freezer_mutex); + cpus_read_unlock(); return 0; } @@ -132,14 +134,16 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); + cpus_read_lock(); mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) - static_branch_dec(&freezer_active); + static_branch_dec_cpuslocked(&freezer_active); freezer->state = 0; mutex_unlock(&freezer_mutex); + cpus_read_unlock(); } static void freezer_css_free(struct cgroup_subsys_state *css) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 65b8658da829..e9138cd7a0f5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -218,19 +218,8 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - ktime_t next_p; - u32 rem; - tick_do_timer_cpu = cpu; - - next_p = ktime_get(); - div_u64_rem(next_p, TICK_NSEC, &rem); - if (rem) { - next_p -= rem; - next_p += TICK_NSEC; - } - - tick_next_period = next_p; + tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52254679ec48..42c0be3080bd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -161,8 +161,19 @@ static ktime_t tick_init_jiffy_update(void) raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ - if (last_jiffies_update == 0) + if (last_jiffies_update == 0) { + u32 rem; + + /* + * Ensure that the tick is aligned to a multiple of + * TICK_NSEC. + */ + div_u64_rem(tick_next_period, TICK_NSEC, &rem); + if (rem) + tick_next_period += TICK_NSEC - rem; + last_jiffies_update = tick_next_period; + } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index dbb14705d0d3..8df0550415e7 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -50,6 +50,18 @@ #define EVENT_STATUS_OTHER BIT(7) /* + * User register flags are not allowed yet, keep them here until we are + * ready to expose them out to the user ABI. + */ +enum user_reg_flag { + /* Event will not delete upon last reference closing */ + USER_EVENT_REG_PERSIST = 1U << 0, + + /* This value or above is currently non-ABI */ + USER_EVENT_REG_MAX = 1U << 1, +}; + +/* * Stores the system name, tables, and locks for a group of events. This * allows isolation for events by various means. */ @@ -85,8 +97,10 @@ struct user_event { struct hlist_node node; struct list_head fields; struct list_head validators; + struct work_struct put_work; refcount_t refcnt; int min_size; + int reg_flags; char status; }; @@ -165,76 +179,151 @@ typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser); + struct user_event **newuser, int reg_flags); static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); +static int destroy_user_event(struct user_event *user); static u32 user_event_key(char *name) { return jhash(name, strlen(name), 0); } -static void user_event_group_destroy(struct user_event_group *group) +static struct user_event *user_event_get(struct user_event *user) { - kfree(group->system_name); - kfree(group); + refcount_inc(&user->refcnt); + + return user; } -static char *user_event_group_system_name(struct user_namespace *user_ns) +static void delayed_destroy_user_event(struct work_struct *work) { - char *system_name; - int len = sizeof(USER_EVENTS_SYSTEM) + 1; + struct user_event *user = container_of( + work, struct user_event, put_work); - if (user_ns != &init_user_ns) { + mutex_lock(&event_mutex); + + if (!refcount_dec_and_test(&user->refcnt)) + goto out; + + if (destroy_user_event(user)) { /* - * Unexpected at this point: - * We only currently support init_user_ns. - * When we enable more, this will trigger a failure so log. + * The only reason this would fail here is if we cannot + * update the visibility of the event. In this case the + * event stays in the hashtable, waiting for someone to + * attempt to delete it later. */ - pr_warn("user_events: Namespace other than init_user_ns!\n"); - return NULL; + pr_warn("user_events: Unable to delete event\n"); + refcount_set(&user->refcnt, 1); } +out: + mutex_unlock(&event_mutex); +} - system_name = kmalloc(len, GFP_KERNEL); +static void user_event_put(struct user_event *user, bool locked) +{ + bool delete; - if (!system_name) - return NULL; + if (unlikely(!user)) + return; - snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); + /* + * When the event is not enabled for auto-delete there will always + * be at least 1 reference to the event. During the event creation + * we initially set the refcnt to 2 to achieve this. In those cases + * the caller must acquire event_mutex and after decrement check if + * the refcnt is 1, meaning this is the last reference. When auto + * delete is enabled, there will only be 1 ref, IE: refcnt will be + * only set to 1 during creation to allow the below checks to go + * through upon the last put. The last put must always be done with + * the event mutex held. + */ + if (!locked) { + lockdep_assert_not_held(&event_mutex); + delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex); + } else { + lockdep_assert_held(&event_mutex); + delete = refcount_dec_and_test(&user->refcnt); + } - return system_name; + if (!delete) + return; + + /* + * We now have the event_mutex in all cases, which ensures that + * no new references will be taken until event_mutex is released. + * New references come through find_user_event(), which requires + * the event_mutex to be held. + */ + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* We should not get here when persist flag is set */ + pr_alert("BUG: Auto-delete engaged on persistent event\n"); + goto out; + } + + /* + * Unfortunately we have to attempt the actual destroy in a work + * queue. This is because not all cases handle a trace_event_call + * being removed within the class->reg() operation for unregister. + */ + INIT_WORK(&user->put_work, delayed_destroy_user_event); + + /* + * Since the event is still in the hashtable, we have to re-inc + * the ref count to 1. This count will be decremented and checked + * in the work queue to ensure it's still the last ref. This is + * needed because a user-process could register the same event in + * between the time of event_mutex release and the work queue + * running the delayed destroy. If we removed the item now from + * the hashtable, this would result in a timing window where a + * user process would fail a register because the trace_event_call + * register would fail in the tracing layers. + */ + refcount_set(&user->refcnt, 1); + + if (WARN_ON_ONCE(!schedule_work(&user->put_work))) { + /* + * If we fail we must wait for an admin to attempt delete or + * another register/close of the event, whichever is first. + */ + pr_warn("user_events: Unable to queue delayed destroy\n"); + } +out: + /* Ensure if we didn't have event_mutex before we unlock it */ + if (!locked) + mutex_unlock(&event_mutex); } -static inline struct user_event_group -*user_event_group_from_user_ns(struct user_namespace *user_ns) +static void user_event_group_destroy(struct user_event_group *group) { - if (user_ns == &init_user_ns) - return init_group; - - return NULL; + kfree(group->system_name); + kfree(group); } -static struct user_event_group *current_user_event_group(void) +static char *user_event_group_system_name(void) { - struct user_namespace *user_ns = current_user_ns(); - struct user_event_group *group = NULL; + char *system_name; + int len = sizeof(USER_EVENTS_SYSTEM) + 1; - while (user_ns) { - group = user_event_group_from_user_ns(user_ns); + system_name = kmalloc(len, GFP_KERNEL); - if (group) - break; + if (!system_name) + return NULL; - user_ns = user_ns->parent; - } + snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); - return group; + return system_name; } -static struct user_event_group -*user_event_group_create(struct user_namespace *user_ns) +static struct user_event_group *current_user_event_group(void) +{ + return init_group; +} + +static struct user_event_group *user_event_group_create(void) { struct user_event_group *group; @@ -243,7 +332,7 @@ static struct user_event_group if (!group) return NULL; - group->system_name = user_event_group_system_name(user_ns); + group->system_name = user_event_group_system_name(); if (!group->system_name) goto error; @@ -259,12 +348,13 @@ error: return NULL; }; -static void user_event_enabler_destroy(struct user_event_enabler *enabler) +static void user_event_enabler_destroy(struct user_event_enabler *enabler, + bool locked) { list_del_rcu(&enabler->mm_enablers_link); /* No longer tracking the event via the enabler */ - refcount_dec(&enabler->event->refcnt); + user_event_put(enabler->event, locked); kfree(enabler); } @@ -326,7 +416,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) /* User asked for enabler to be removed during fault */ if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) { - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); goto out; } @@ -501,14 +591,12 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, if (!enabler) return false; - enabler->event = orig->event; + enabler->event = user_event_get(orig->event); enabler->addr = orig->addr; /* Only dup part of value (ignore future flags, etc) */ enabler->values = orig->values & ENABLE_VAL_DUP_MASK; - refcount_inc(&enabler->event->refcnt); - /* Enablers not exposed yet, RCU not required */ list_add(&enabler->mm_enablers_link, &mm->enablers); @@ -625,7 +713,7 @@ static void user_event_mm_destroy(struct user_event_mm *mm) struct user_event_enabler *enabler, *next; list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, false); mmdrop(mm->mm); kfree(mm); @@ -780,7 +868,7 @@ retry: * exit or run exec(), which includes forks and clones. */ if (!*write_result) { - refcount_inc(&enabler->event->refcnt); + user_event_get(user); list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); } @@ -803,7 +891,12 @@ out: static __always_inline __must_check bool user_event_last_ref(struct user_event *user) { - return refcount_read(&user->refcnt) == 1; + int last = 0; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) + last = 1; + + return refcount_read(&user->refcnt) == last; } static __always_inline __must_check @@ -842,7 +935,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call) * Upon success user_event has its ref count increased by 1. */ static int user_event_parse_cmd(struct user_event_group *group, - char *raw_command, struct user_event **newuser) + char *raw_command, struct user_event **newuser, + int reg_flags) { char *name = raw_command; char *args = strpbrk(name, " "); @@ -856,7 +950,7 @@ static int user_event_parse_cmd(struct user_event_group *group, if (flags) *flags++ = '\0'; - return user_event_parse(group, name, args, flags, newuser); + return user_event_parse(group, name, args, flags, newuser, reg_flags); } static int user_field_array_size(const char *type) @@ -1367,10 +1461,8 @@ static struct user_event *find_user_event(struct user_event_group *group, *outkey = key; hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) { - refcount_inc(&user->refcnt); - return user; - } + if (!strcmp(EVENT_NAME(user), name)) + return user_event_get(user); return NULL; } @@ -1432,7 +1524,7 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i, if (unlikely(!entry)) return; - if (unlikely(!copy_nofault(entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && @@ -1473,7 +1565,7 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i, perf_fetch_caller_regs(regs); - if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && @@ -1584,12 +1676,12 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: - refcount_inc(&user->refcnt); + user_event_get(user); update_enable_bit_for(user); return 0; dec: update_enable_bit_for(user); - refcount_dec(&user->refcnt); + user_event_put(user, true); return 0; } @@ -1620,10 +1712,11 @@ static int user_event_create(const char *raw_command) mutex_lock(&group->reg_mutex); - ret = user_event_parse_cmd(group, name, &user); + /* Dyn events persist, otherwise they would cleanup immediately */ + ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST); if (!ret) - refcount_dec(&user->refcnt); + user_event_put(user, false); mutex_unlock(&group->reg_mutex); @@ -1745,6 +1838,8 @@ static bool user_event_match(const char *system, const char *event, if (match && argc > 0) match = user_fields_match(user, argc, argv); + else if (match && argc == 0) + match = list_empty(&user->fields); return match; } @@ -1781,11 +1876,17 @@ static int user_event_trace_register(struct user_event *user) */ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser) + struct user_event **newuser, int reg_flags) { int ret; u32 key; struct user_event *user; + int argc = 0; + char **argv; + + /* User register flags are not ready yet */ + if (reg_flags != 0 || flags != NULL) + return -EINVAL; /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); @@ -1793,13 +1894,35 @@ static int user_event_parse(struct user_event_group *group, char *name, mutex_unlock(&event_mutex); if (user) { - *newuser = user; - /* - * Name is allocated by caller, free it since it already exists. - * Caller only worries about failure cases for freeing. - */ - kfree(name); + if (args) { + argv = argv_split(GFP_KERNEL, args, &argc); + if (!argv) { + ret = -ENOMEM; + goto error; + } + + ret = user_fields_match(user, argc, (const char **)argv); + argv_free(argv); + + } else + ret = list_empty(&user->fields); + + if (ret) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. + */ + kfree(name); + } else { + ret = -EADDRINUSE; + goto error; + } + return 0; +error: + user_event_put(user, false); + return ret; } user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); @@ -1852,8 +1975,15 @@ static int user_event_parse(struct user_event_group *group, char *name, if (ret) goto put_user_lock; - /* Ensure we track self ref and caller ref (2) */ - refcount_set(&user->refcnt, 2); + user->reg_flags = reg_flags; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); + } else { + /* Ensure we track only caller ref (1) */ + refcount_set(&user->refcnt, 1); + } dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); @@ -1885,7 +2015,7 @@ static int delete_user_event(struct user_event_group *group, char *name) if (!user) return -ENOENT; - refcount_dec(&user->refcnt); + user_event_put(user, true); if (!user_event_last_ref(user)) return -EBUSY; @@ -2044,9 +2174,7 @@ static int user_events_ref_add(struct user_event_file_info *info, for (i = 0; i < count; ++i) new_refs->events[i] = refs->events[i]; - new_refs->events[i] = user; - - refcount_inc(&user->refcnt); + new_refs->events[i] = user_event_get(user); rcu_assign_pointer(info->refs, new_refs); @@ -2077,8 +2205,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (ret) return ret; - /* Ensure no flags, since we don't support any yet */ - if (kreg->flags != 0) + /* Ensure only valid flags */ + if (kreg->flags & ~(USER_EVENT_REG_MAX-1)) return -EINVAL; /* Ensure supported size */ @@ -2150,7 +2278,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, return ret; } - ret = user_event_parse_cmd(info->group, name, &user); + ret = user_event_parse_cmd(info->group, name, &user, reg.flags); if (ret) { kfree(name); @@ -2160,7 +2288,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, ret = user_events_ref_add(info, user); /* No longer need parse ref, ref_add either worked or not */ - refcount_dec(&user->refcnt); + user_event_put(user, false); /* Positive number is index and valid */ if (ret < 0) @@ -2309,7 +2437,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); /* Removed at least one */ ret = 0; @@ -2367,7 +2495,6 @@ static int user_events_release(struct inode *node, struct file *file) struct user_event_file_info *info = file->private_data; struct user_event_group *group; struct user_event_refs *refs; - struct user_event *user; int i; if (!info) @@ -2391,12 +2518,9 @@ static int user_events_release(struct inode *node, struct file *file) * The underlying user_events are ref counted, and cannot be freed. * After this decrement, the user_events may be freed elsewhere. */ - for (i = 0; i < refs->count; ++i) { - user = refs->events[i]; + for (i = 0; i < refs->count; ++i) + user_event_put(refs->events[i], false); - if (user) - refcount_dec(&user->refcnt); - } out: file->private_data = NULL; @@ -2577,7 +2701,7 @@ static int __init trace_events_user_init(void) if (!fault_cache) return -ENOMEM; - init_group = user_event_group_create(&init_user_ns); + init_group = user_event_group_create(); if (!init_group) { kmem_cache_destroy(fault_cache); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 15f05faaae44..1e33f367783e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -847,7 +847,7 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c int ret; void *pos; - list_for_each_entry(field, head, link) { + list_for_each_entry_reverse(field, head, link) { trace_seq_printf(&iter->seq, " %s=", field->name); if (field->offset + field->size > iter->ent_size) { trace_seq_puts(&iter->seq, "<OVERFLOW>"); diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index e91cb4c2833f..d0b6b390ee42 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -42,7 +42,7 @@ MODULE_AUTHOR("Red Hat, Inc."); static inline bool lock_wqueue(struct watch_queue *wqueue) { spin_lock_bh(&wqueue->lock); - if (unlikely(wqueue->defunct)) { + if (unlikely(!wqueue->pipe)) { spin_unlock_bh(&wqueue->lock); return false; } @@ -104,9 +104,6 @@ static bool post_one_notification(struct watch_queue *wqueue, unsigned int head, tail, mask, note, offset, len; bool done = false; - if (!pipe) - return false; - spin_lock_irq(&pipe->rd_wait.lock); mask = pipe->ring_size - 1; @@ -603,8 +600,11 @@ void watch_queue_clear(struct watch_queue *wqueue) rcu_read_lock(); spin_lock_bh(&wqueue->lock); - /* Prevent new notifications from being stored. */ - wqueue->defunct = true; + /* + * This pipe can be freed by callers like free_pipe_info(). + * Removing this reference also prevents new notifications. + */ + wqueue->pipe = NULL; while (!hlist_empty(&wqueue->watches)) { watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4666a1a92a31..c913e333cce8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -705,12 +705,17 @@ static void clear_work_data(struct work_struct *work) set_work_data(work, WORK_STRUCT_NO_POOL, 0); } +static inline struct pool_workqueue *work_struct_pwq(unsigned long data) +{ + return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK); +} + static struct pool_workqueue *get_work_pwq(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) - return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + return work_struct_pwq(data); else return NULL; } @@ -738,8 +743,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) assert_rcu_or_pool_mutex(); if (data & WORK_STRUCT_PWQ) - return ((struct pool_workqueue *) - (data & WORK_STRUCT_WQ_DATA_MASK))->pool; + return work_struct_pwq(data)->pool; pool_id = data >> WORK_OFFQ_POOL_SHIFT; if (pool_id == WORK_OFFQ_POOL_NONE) @@ -760,8 +764,7 @@ static int get_work_pool_id(struct work_struct *work) unsigned long data = atomic_long_read(&work->data); if (data & WORK_STRUCT_PWQ) - return ((struct pool_workqueue *) - (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; + return work_struct_pwq(data)->pool->id; return data >> WORK_OFFQ_POOL_SHIFT; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b9d39d65b73..2d0d58fb4e7f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2070,7 +2070,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); - xas_set(&xas, index); VM_BUG_ON_PAGE(page != xas_load(&xas), page); diff --git a/mm/memfd.c b/mm/memfd.c index 69b90c31d38c..e763e76f1106 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -371,12 +371,15 @@ SYSCALL_DEFINE2(memfd_create, inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - *file_seals |= F_SEAL_EXEC; + if (file_seals) { + *file_seals &= ~F_SEAL_SEAL; + *file_seals |= F_SEAL_EXEC; + } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; + if (file_seals) + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); diff --git a/mm/mprotect.c b/mm/mprotect.c index 92d3d3ca390a..c59e7561698c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -867,7 +867,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (!error && vma_iter_end(&vmi) < end) + if (!error && tmp < end) error = -ENOMEM; out: diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index fe10436d9911..3ab53fad8876 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -5,12 +5,10 @@ #include <linux/seq_file.h> #include <linux/shrinker.h> #include <linux/memcontrol.h> -#include <linux/srcu.h> /* defined in vmscan.c */ -extern struct mutex shrinker_mutex; +extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; -extern struct srcu_struct shrinker_srcu; static DEFINE_IDA(shrinker_debugfs_ida); static struct dentry *shrinker_debugfs_root; @@ -51,13 +49,18 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) struct mem_cgroup *memcg; unsigned long total; bool memcg_aware; - int ret = 0, nid, srcu_idx; + int ret, nid; count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!count_per_node) return -ENOMEM; - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + kfree(count_per_node); + return ret; + } + rcu_read_lock(); memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; @@ -88,7 +91,8 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); + up_read(&shrinker_rwsem); kfree(count_per_node); return ret; @@ -111,8 +115,9 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, .gfp_mask = GFP_KERNEL, }; struct mem_cgroup *memcg = NULL; - int nid, srcu_idx; + int nid; char kbuf[72]; + ssize_t ret; read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) @@ -141,7 +146,11 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EINVAL; } - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + mem_cgroup_put(memcg); + return ret; + } sc.nid = nid; sc.memcg = memcg; @@ -150,7 +159,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, shrinker->scan_objects(shrinker, &sc); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); mem_cgroup_put(memcg); return size; @@ -168,7 +177,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) char buf[128]; int id; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); /* debugfs isn't initialized yet, add debugfs entries later. */ if (!shrinker_debugfs_root) @@ -211,7 +220,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) if (!new) return -ENOMEM; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); old = shrinker->name; shrinker->name = new; @@ -229,7 +238,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) shrinker->debugfs_entry = entry; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); kfree_const(old); @@ -242,7 +251,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, { struct dentry *entry = shrinker->debugfs_entry; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); shrinker->name = NULL; @@ -271,14 +280,14 @@ static int __init shrinker_debugfs_init(void) shrinker_debugfs_root = dentry; /* Create debugfs entries for shrinkers registered at boot */ - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_for_each_entry(shrinker, &shrinker_list, list) if (!shrinker->debugfs_entry) { ret = shrinker_debugfs_add(shrinker); if (ret) break; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9683573f1225..1d13d71687d7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3098,11 +3098,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { - /* vm_area_alloc_pages() can also fail due to a fatal signal */ - if (!fatal_signal_pending(current)) + /* + * vm_area_alloc_pages() can fail due to insufficient memory but + * also:- + * + * - a pending fatal signal + * - insufficient huge page-order pages + * + * Since we always retry allocations at order-0 in the huge page + * case a warning for either is spurious. + */ + if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, page order %u, failed to allocate pages", - area->nr_pages * PAGE_SIZE, page_order); + "vmalloc error: size %lu, failed to allocate pages", + area->nr_pages * PAGE_SIZE); goto fail; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 6d0cd2840cf0..5bf98d0a22c9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,7 @@ #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -57,7 +57,6 @@ #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> -#include <linux/srcu.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -190,9 +189,7 @@ struct scan_control { int vm_swappiness = 60; LIST_HEAD(shrinker_list); -DEFINE_MUTEX(shrinker_mutex); -DEFINE_SRCU(shrinker_srcu); -static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); +DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -211,21 +208,8 @@ static inline int shrinker_defer_size(int nr_items) static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { - return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu, - lockdep_is_held(&shrinker_mutex)); -} - -static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, - int nid) -{ - return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu); -} - -static void free_shrinker_info_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct shrinker_info, rcu)); + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, @@ -266,7 +250,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); - call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); + kvfree_rcu(old, rcu); } return 0; @@ -292,7 +276,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, size, ret = 0; int map_size, defer_size = 0; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); map_size = shrinker_map_size(shrinker_nr_max); defer_size = shrinker_defer_size(shrinker_nr_max); size = map_size + defer_size; @@ -308,7 +292,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) info->map_nr_max = shrinker_nr_max; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -324,7 +308,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); map_size = shrinker_map_size(new_nr_max); defer_size = shrinker_defer_size(new_nr_max); @@ -352,16 +336,15 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; - int srcu_idx; - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id, info->map); } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); } } @@ -374,7 +357,8 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -388,7 +372,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -398,7 +382,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); idr_remove(&shrinker_idr, id); } @@ -408,7 +392,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); } @@ -417,7 +401,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } @@ -433,7 +417,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - mutex_lock(&shrinker_mutex); + down_read(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -442,7 +426,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) atomic_long_add(nr, &parent_info->nr_deferred[i]); } } - mutex_unlock(&shrinker_mutex); + up_read(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) @@ -743,9 +727,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker) shrinker->name = NULL; #endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return; } @@ -755,11 +739,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker) { - mutex_lock(&shrinker_mutex); - list_add_tail_rcu(&shrinker->list, &shrinker_list); + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); } static int __register_shrinker(struct shrinker *shrinker) @@ -810,16 +794,13 @@ void unregister_shrinker(struct shrinker *shrinker) if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - mutex_lock(&shrinker_mutex); - list_del_rcu(&shrinker->list); + down_write(&shrinker_rwsem); + list_del(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - mutex_unlock(&shrinker_mutex); - - atomic_inc(&shrinker_srcu_generation); - synchronize_srcu(&shrinker_srcu); + up_write(&shrinker_rwsem); shrinker_debugfs_remove(debugfs_entry, debugfs_id); @@ -831,13 +812,15 @@ EXPORT_SYMBOL(unregister_shrinker); /** * synchronize_shrinkers - Wait for all running shrinkers to complete. * - * This is useful to guarantee that all shrinker invocations have seen an - * update, before freeing memory. + * This is equivalent to calling unregister_shrink() and register_shrinker(), + * but atomically and with less overhead. This is useful to guarantee that all + * shrinker invocations have seen an update, before freeing memory, similar to + * rcu. */ void synchronize_shrinkers(void) { - atomic_inc(&shrinker_srcu_generation); - synchronize_srcu(&shrinker_srcu); + down_write(&shrinker_rwsem); + up_write(&shrinker_rwsem); } EXPORT_SYMBOL(synchronize_shrinkers); @@ -946,20 +929,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int srcu_idx, generation; - int i = 0; + int i; if (!mem_cgroup_online(memcg)) return 0; -again: - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + info = shrinker_info_protected(memcg, nid); if (unlikely(!info)) goto unlock; - generation = atomic_read(&shrinker_srcu_generation); - for_each_set_bit_from(i, info->map, info->map_nr_max) { + for_each_set_bit(i, info->map, info->map_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1005,14 +987,14 @@ again: set_shrinker_bit(memcg, nid, i); } freed += ret; - if (atomic_read(&shrinker_srcu_generation) != generation) { - srcu_read_unlock(&shrinker_srcu, srcu_idx); - i++; - goto again; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; } } unlock: - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); return freed; } #else /* CONFIG_MEMCG */ @@ -1049,7 +1031,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { unsigned long ret, freed = 0; struct shrinker *shrinker; - int srcu_idx, generation; /* * The root memcg might be allocated even though memcg is disabled @@ -1061,11 +1042,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - srcu_idx = srcu_read_lock(&shrinker_srcu); + if (!down_read_trylock(&shrinker_rwsem)) + goto out; - generation = atomic_read(&shrinker_srcu_generation); - list_for_each_entry_srcu(shrinker, &shrinker_list, list, - srcu_read_lock_held(&shrinker_srcu)) { + list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1076,14 +1056,19 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; freed += ret; - - if (atomic_read(&shrinker_srcu_generation) != generation) { + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { freed = freed ? : 1; break; } } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); +out: cond_resched(); return freed; } diff --git a/net/core/sock.c b/net/core/sock.c index 24f2761bdb1d..6e5662ca00fe 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1362,12 +1362,6 @@ set_sndbuf: __sock_set_mark(sk, val); break; case SO_RCVMARK: - if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - ret = -EPERM; - break; - } - sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ab1afe67fd18..1afed89e03c0 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -403,6 +403,24 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) return 0; } +static struct dsa_port * +dsa_switch_preferred_default_local_cpu_port(struct dsa_switch *ds) +{ + struct dsa_port *cpu_dp; + + if (!ds->ops->preferred_default_local_cpu_port) + return NULL; + + cpu_dp = ds->ops->preferred_default_local_cpu_port(ds); + if (!cpu_dp) + return NULL; + + if (WARN_ON(!dsa_port_is_cpu(cpu_dp) || cpu_dp->ds != ds)) + return NULL; + + return cpu_dp; +} + /* Perform initial assignment of CPU ports to user ports and DSA links in the * fabric, giving preference to CPU ports local to each switch. Default to * using the first CPU port in the switch tree if the port does not have a CPU @@ -410,12 +428,16 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) */ static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp, *dp; + struct dsa_port *preferred_cpu_dp, *cpu_dp, *dp; list_for_each_entry(cpu_dp, &dst->ports, list) { if (!dsa_port_is_cpu(cpu_dp)) continue; + preferred_cpu_dp = dsa_switch_preferred_default_local_cpu_port(cpu_dp->ds); + if (preferred_cpu_dp && preferred_cpu_dp != cpu_dp) + continue; + /* Prefer a local CPU port */ dsa_switch_for_each_port(dp, cpu_dp->ds) { /* Prefer the first local CPU port found */ diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index e5d8439b9e45..c16db0b326fa 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -13,7 +13,7 @@ #define MAXNAME 32 #define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME) -#define WPAN_PHY_ASSIGN strlcpy(__entry->wpan_phy_name, \ +#define WPAN_PHY_ASSIGN strscpy(__entry->wpan_phy_name, \ wpan_phy_name(wpan_phy), \ MAXNAME) #define WPAN_PHY_PR_FMT "%s" diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 3969fa805679..ee848be59e65 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -340,6 +340,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index ad2afeef4f10..eac206a290d0 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -164,6 +164,7 @@ drop: kfree_skb(skb); return 0; } +EXPORT_SYMBOL(xfrm4_udp_encap_rcv); int xfrm4_rcv(struct sk_buff *skb) { diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 75c02992c520..772340268997 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -374,6 +374,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 04cbeefd8982..4907ab241d6b 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -86,6 +86,9 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) __be32 *udpdata32; __u16 encap_type = up->encap_type; + if (skb->protocol == htons(ETH_P_IP)) + return xfrm4_udp_encap_rcv(sk, skb); + /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d996aa2579df..fc6e130364da 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2110,7 +2110,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* either the frame has been decrypted or will be dropped */ status->flag |= RX_FLAG_DECRYPTED; - if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE && + if (unlikely(ieee80211_is_beacon(fc) && (result & RX_DROP_UNUSABLE) && rx->sdata->dev)) cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, skb->data, skb->len); diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h index 689396d6c76a..1574ecc48075 100644 --- a/net/mac802154/trace.h +++ b/net/mac802154/trace.h @@ -14,7 +14,7 @@ #define MAXNAME 32 #define LOCAL_ENTRY __array(char, wpan_phy_name, MAXNAME) -#define LOCAL_ASSIGN strlcpy(__entry->wpan_phy_name, \ +#define LOCAL_ASSIGN strscpy(__entry->wpan_phy_name, \ wpan_phy_name(local->hw.phy), MAXNAME) #define LOCAL_PR_FMT "%s" #define LOCAL_PR_ARG __entry->wpan_phy_name diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 59f8f3124855..1224dfca5bf3 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1047,6 +1047,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; + inet_sk_state_store(newsk, TCP_LISTEN); err = kernel_listen(ssock, backlog); if (err) return err; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 67311e7d5b21..a6c7f2d24909 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -44,7 +44,7 @@ enum { static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; static void __mptcp_destroy_sock(struct sock *sk); -static void __mptcp_check_send_data_fin(struct sock *sk); +static void mptcp_check_send_data_fin(struct sock *sk); DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); static struct net_device mptcp_napi_dev; @@ -424,8 +424,7 @@ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - return !__mptcp_check_fallback(msk) && - ((1 << sk->sk_state) & + return ((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && msk->write_seq == READ_ONCE(msk->snd_una); } @@ -583,9 +582,6 @@ static bool mptcp_check_data_fin(struct sock *sk) u64 rcv_data_fin_seq; bool ret = false; - if (__mptcp_check_fallback(msk)) - return ret; - /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. * msk->rcv_data_fin was set when parsing the incoming options @@ -623,7 +619,8 @@ static bool mptcp_check_data_fin(struct sock *sk) } ret = true; - mptcp_send_ack(msk); + if (!__mptcp_check_fallback(msk)) + mptcp_send_ack(msk); mptcp_close_wake_up(sk); } return ret; @@ -850,12 +847,12 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) return true; } -static void __mptcp_flush_join_list(struct sock *sk) +static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list) { struct mptcp_subflow_context *tmp, *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) { + list_for_each_entry_safe(subflow, tmp, join_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); @@ -897,49 +894,6 @@ bool mptcp_schedule_work(struct sock *sk) return false; } -void mptcp_subflow_eof(struct sock *sk) -{ - if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) - mptcp_schedule_work(sk); -} - -static void mptcp_check_for_eof(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - int receivers = 0; - - mptcp_for_each_subflow(msk, subflow) - receivers += !subflow->rx_eof; - if (receivers) - return; - - if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { - /* hopefully temporary hack: propagate shutdown status - * to msk, when all subflows agree on it - */ - WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); - - smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - sk->sk_data_ready(sk); - } - - switch (sk->sk_state) { - case TCP_ESTABLISHED: - inet_sk_state_store(sk, TCP_CLOSE_WAIT); - break; - case TCP_FIN_WAIT1: - inet_sk_state_store(sk, TCP_CLOSING); - break; - case TCP_FIN_WAIT2: - inet_sk_state_store(sk, TCP_CLOSE); - break; - default: - return; - } - mptcp_close_wake_up(sk); -} - static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -1609,7 +1563,7 @@ out: if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); if (do_check_data_fin) - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) @@ -1727,7 +1681,13 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; } else if (ret && ret != -EINPROGRESS) { - mptcp_disconnect(sk, 0); + /* The disconnect() op called by tcp_sendmsg_fastopen()/ + * __inet_stream_connect() can fail, due to looking check, + * see mptcp_disconnect(). + * Attempt it again outside the problematic scope. + */ + if (!mptcp_disconnect(sk, 0)) + sk->sk_socket->state = SS_UNCONNECTED; } inet_sk(sk)->defer_connect = 0; @@ -2158,9 +2118,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, break; } - if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) - mptcp_check_for_eof(msk); - if (sk->sk_shutdown & RCV_SHUTDOWN) { /* race breaker: the shutdown could be after the * previous receive queue check @@ -2389,7 +2346,10 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { - tcp_disconnect(ssk, 0); + /* The MPTCP code never wait on the subflow sockets, TCP-level + * disconnect should never fail + */ + WARN_ON_ONCE(tcp_disconnect(ssk, 0)); msk->subflow->state = SS_UNCONNECTED; mptcp_subflow_ctx_reset(subflow); release_sock(ssk); @@ -2408,13 +2368,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ - if (ssk->sk_state == TCP_LISTEN) { - tcp_set_state(ssk, TCP_CLOSE); - mptcp_subflow_queue_clean(sk, ssk); - inet_csk_listen_stop(ssk); - mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); - } - __tcp_close(ssk, 0); /* close acquired an extra ref */ @@ -2671,16 +2624,12 @@ static void mptcp_worker(struct work_struct *work) if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; - mptcp_check_data_fin_ack(sk); - mptcp_check_fastclose(msk); mptcp_pm_nl_work(msk); - if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) - mptcp_check_for_eof(msk); - - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); + mptcp_check_data_fin_ack(sk); mptcp_check_data_fin(sk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) @@ -2812,13 +2761,19 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) break; fallthrough; case TCP_SYN_SENT: - tcp_disconnect(ssk, O_NONBLOCK); + WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK)); break; default: if (__mptcp_check_fallback(mptcp_sk(sk))) { pr_debug("Fallback"); ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); + + /* simulate the data_fin ack reception to let the state + * machine move forward + */ + WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); + mptcp_schedule_work(sk); } else { pr_debug("Sending DATA_FIN on subflow %p", ssk); tcp_send_ack(ssk); @@ -2858,7 +2813,7 @@ static int mptcp_close_state(struct sock *sk) return next & TCP_ACTION_FIN; } -static void __mptcp_check_send_data_fin(struct sock *sk) +static void mptcp_check_send_data_fin(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); @@ -2876,19 +2831,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk) WRITE_ONCE(msk->snd_nxt, msk->write_seq); - /* fallback socket will not get data_fin/ack, can move to the next - * state now - */ - if (__mptcp_check_fallback(msk)) { - WRITE_ONCE(msk->snd_una, msk->write_seq); - if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { - inet_sk_state_store(sk, TCP_CLOSE); - mptcp_close_wake_up(sk); - } else if (sk->sk_state == TCP_FIN_WAIT1) { - inet_sk_state_store(sk, TCP_FIN_WAIT2); - } - } - mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -2908,7 +2850,7 @@ static void __mptcp_wr_shutdown(struct sock *sk) WRITE_ONCE(msk->write_seq, msk->write_seq + 1); WRITE_ONCE(msk->snd_data_fin_enable, 1); - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); } static void __mptcp_destroy_sock(struct sock *sk) @@ -2953,10 +2895,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) return EPOLLIN | EPOLLRDNORM; } -static void mptcp_listen_inuse_dec(struct sock *sk) +static void mptcp_check_listen_stop(struct sock *sk) { - if (inet_sk_state_load(sk) == TCP_LISTEN) - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + struct sock *ssk; + + if (inet_sk_state_load(sk) != TCP_LISTEN) + return; + + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + ssk = mptcp_sk(sk)->first; + if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) + return; + + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + mptcp_subflow_queue_clean(sk, ssk); + inet_csk_listen_stop(ssk); + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); + tcp_set_state(ssk, TCP_CLOSE); + release_sock(ssk); } bool __mptcp_close(struct sock *sk, long timeout) @@ -2969,7 +2925,7 @@ bool __mptcp_close(struct sock *sk, long timeout) WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { - mptcp_listen_inuse_dec(sk); + mptcp_check_listen_stop(sk); inet_sk_state_store(sk, TCP_CLOSE); goto cleanup; } @@ -3073,15 +3029,20 @@ static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); + /* Deny disconnect if other threads are blocked in sk_wait_event() + * or inet_wait_for_connect(). + */ + if (sk->sk_wait_pending) + return -EBUSY; + /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under - * msk->firstsocket lock). Do nothing and leave the cleanup to the - * caller. + * msk->firstsocket lock). */ if (msk->fastopening) - return 0; + return -EBUSY; - mptcp_listen_inuse_dec(sk); + mptcp_check_listen_stop(sk); inet_sk_state_store(sk, TCP_CLOSE); mptcp_stop_timer(sk); @@ -3140,6 +3101,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif + nsk->sk_wait_pending = 0; __mptcp_init_sock(nsk); msk = mptcp_sk(nsk); @@ -3327,9 +3289,14 @@ static void mptcp_release_cb(struct sock *sk) for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) | msk->push_pending; + struct list_head join_list; + if (!flags) break; + INIT_LIST_HEAD(&join_list); + list_splice_init(&msk->join_list, &join_list); + /* the following actions acquire the subflow socket lock * * 1) can't be invoked in atomic scope @@ -3340,8 +3307,9 @@ static void mptcp_release_cb(struct sock *sk) msk->push_pending = 0; msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); + if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) - __mptcp_flush_join_list(sk); + __mptcp_flush_join_list(sk, &join_list); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 70c957bc56a8..d3783a7056e1 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -113,7 +113,6 @@ /* MPTCP socket atomic flags */ #define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 -#define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 @@ -476,14 +475,13 @@ struct mptcp_subflow_context { send_mp_fail : 1, send_fastclose : 1, send_infinite_map : 1, - rx_eof : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ - __unused : 8; + __unused : 9; enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -720,7 +718,6 @@ static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); -void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 4688daa6b38b..d9c8b21c6076 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1749,14 +1749,16 @@ static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; + struct mptcp_sock *msk; __subflow_state_change(sk); + msk = mptcp_sk(parent); if (subflow_simultaneous_connect(sk)) { mptcp_propagate_sndbuf(parent, sk); mptcp_do_fallback(sk); - mptcp_rcv_space_init(mptcp_sk(parent), sk); - pr_fallback(mptcp_sk(parent)); + mptcp_rcv_space_init(msk, sk); + pr_fallback(msk); subflow->conn_finished = 1; mptcp_set_connected(parent); } @@ -1772,11 +1774,12 @@ static void subflow_state_change(struct sock *sk) subflow_sched_work_if_closed(mptcp_sk(parent), sk); - if (__mptcp_check_fallback(mptcp_sk(parent)) && - !subflow->rx_eof && subflow_is_done(sk)) { - subflow->rx_eof = 1; - mptcp_subflow_eof(parent); - } + /* when the fallback subflow closes the rx side, trigger a 'dummy' + * ingress data fin, so that the msk state will follow along + */ + if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && + mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) + mptcp_schedule_work(parent); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index feb1d7fcb09f..a80b960223e1 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -1207,6 +1207,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; @@ -1349,6 +1350,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 69bceefaa5c8..4c7937fd803f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -151,6 +151,7 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, return NULL; INIT_LIST_HEAD(&trans->list); + INIT_LIST_HEAD(&trans->binding_list); trans->msg_type = msg_type; trans->ctx = *ctx; @@ -163,13 +164,20 @@ static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); } -static void nft_trans_destroy(struct nft_trans *trans) +static void nft_trans_list_del(struct nft_trans *trans) { list_del(&trans->list); + list_del(&trans->binding_list); +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + nft_trans_list_del(trans); kfree(trans); } -static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set, + bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; @@ -183,16 +191,80 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) - nft_trans_set_bound(trans) = true; + nft_trans_set_bound(trans) = bind; break; case NFT_MSG_NEWSETELEM: if (nft_trans_elem_set(trans) == set) - nft_trans_elem_set_bound(trans) = true; + nft_trans_elem_set_bound(trans) = bind; break; } } } +static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, true); +} + +static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, false); +} + +static void __nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain, bool bind) +{ + struct nftables_pernet *nft_net; + struct net *net = ctx->net; + struct nft_trans *trans; + + if (!nft_chain_binding(chain)) + return; + + nft_net = nft_pernet(net); + list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain(trans) == chain) + nft_trans_chain_bound(trans) = bind; + break; + case NFT_MSG_NEWRULE: + if (trans->ctx.chain == chain) + nft_trans_rule_bound(trans) = bind; + break; + } + } +} + +static void nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, true); +} + +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + if (!nft_chain_binding(chain)) + return 0; + + if (nft_chain_binding(ctx->chain)) + return -EOPNOTSUPP; + + if (chain->bound) + return -EBUSY; + + chain->bound = true; + chain->use++; + nft_chain_trans_bind(ctx, chain); + + return 0; +} + +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, false); +} + static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { @@ -292,6 +364,19 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr { struct nftables_pernet *nft_net = nft_pernet(net); + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; + } + list_add_tail(&trans->list, &nft_net->commit_list); } @@ -338,8 +423,9 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); } } - + nft_trans_chain(trans) = ctx->chain; nft_trans_commit_list_add_tail(ctx->net, trans); + return trans; } @@ -357,8 +443,7 @@ static int nft_delchain(struct nft_ctx *ctx) return 0; } -static void nft_rule_expr_activate(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr; @@ -371,9 +456,8 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx, } } -static void nft_rule_expr_deactivate(const struct nft_ctx *ctx, - struct nft_rule *rule, - enum nft_trans_phase phase) +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase) { struct nft_expr *expr; @@ -495,6 +579,58 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, return __nft_trans_set_add(ctx, msg_type, set, NULL); } +static void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + +static int nft_mapelem_deactivate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + nft_setelem_data_deactivate(ctx->net, set, elem); + + return 0; +} + +struct nft_set_elem_catchall { + struct list_head list; + struct rcu_head rcu; + void *elem; +}; + +static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + nft_setelem_data_deactivate(ctx->net, set, &elem); + break; + } +} + +static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .fn = nft_mapelem_deactivate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_deactivate(ctx, set); +} + static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -503,6 +639,9 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) if (err < 0) return err; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + nft_deactivate_next(ctx->net, set); ctx->table->use--; @@ -2226,7 +2365,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, return 0; } -static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) +int nft_chain_add(struct nft_table *table, struct nft_chain *chain) { int err; @@ -2528,6 +2667,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nft_trans_basechain(trans) = basechain; INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); list_splice(&hook.list, &nft_trans_chain_hooks(trans)); + if (nla[NFTA_CHAIN_HOOK]) + module_put(hook.type->owner); nft_trans_commit_list_add_tail(ctx->net, trans); @@ -2670,21 +2811,18 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack); } -static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_chain *chain, +static int nft_delchain_hook(struct nft_ctx *ctx, + struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { + const struct nft_chain *chain = &basechain->chain; const struct nlattr * const *nla = ctx->nla; struct nft_chain_hook chain_hook = {}; - struct nft_base_chain *basechain; struct nft_hook *this, *hook; LIST_HEAD(chain_del_list); struct nft_trans *trans; int err; - if (!nft_is_base_chain(chain)) - return -EOPNOTSUPP; - - basechain = nft_base_chain(chain); err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, ctx->family, chain->flags, extack); if (err < 0) @@ -2769,7 +2907,12 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, if (chain->flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; - return nft_delchain_hook(&ctx, chain, extack); + if (nft_is_base_chain(chain)) { + struct nft_base_chain *basechain = nft_base_chain(chain); + + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) + return nft_delchain_hook(&ctx, basechain, extack); + } } if (info->nlh->nlmsg_flags & NLM_F_NONREC && @@ -3490,8 +3633,7 @@ err_fill_rule_info: return err; } -static void nf_tables_rule_destroy(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr, *next; @@ -3508,7 +3650,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) +static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); @@ -3596,12 +3738,6 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, return 0; } -struct nft_set_elem_catchall { - struct list_head list; - struct rcu_head rcu; - void *elem; -}; - int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); @@ -3844,7 +3980,7 @@ err_destroy_flow_rule: if (flow) nft_flow_rule_destroy(flow); err_release_rule: - nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE); + nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR); nf_tables_rule_destroy(&ctx, rule); err_release_expr: for (i = 0; i < n; i++) { @@ -4777,6 +4913,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; @@ -4785,6 +4924,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } @@ -4831,6 +4974,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); if (err < 0) return err; @@ -4934,7 +5080,7 @@ err_set_expr_alloc: for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); err_set_destroy: - ops->destroy(set); + ops->destroy(&ctx, set); err_set_init: kfree(set->name); err_set_name: @@ -4949,7 +5095,7 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); - nft_set_elem_destroy(set, catchall->elem, true); + nf_tables_set_elem_destroy(ctx, set, catchall->elem); kfree_rcu(catchall, rcu); } } @@ -4964,7 +5110,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(ctx, set->exprs[i]); - set->ops->destroy(set); + set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); kfree(set->name); kvfree(set); @@ -5129,10 +5275,60 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, } } +static void nft_setelem_data_activate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + +static int nft_mapelem_activate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + nft_setelem_data_activate(ctx->net, set, elem); + + return 0; +} + +static void nft_map_catchall_activate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + nft_setelem_data_activate(ctx->net, set, &elem); + break; + } +} + +static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .fn = nft_mapelem_activate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_activate(ctx, set); +} + void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) { - if (nft_set_is_anonymous(set)) + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(ctx, set); + nft_clear(ctx->net, set); + } set->use++; } @@ -5143,14 +5339,28 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, enum nft_trans_phase phase) { switch (phase) { - case NFT_TRANS_PREPARE: + case NFT_TRANS_PREPARE_ERROR: + nft_set_trans_unbind(ctx, set); if (nft_set_is_anonymous(set)) nft_deactivate_next(ctx->net, set); set->use--; + break; + case NFT_TRANS_PREPARE: + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + + nft_deactivate_next(ctx->net, set); + } + set->use--; return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: + if (nft_set_is_anonymous(set) && + set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + set->use--; fallthrough; default: @@ -5903,6 +6113,7 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, __nft_set_elem_expr_destroy(ctx, expr); } +/* Drop references and destroy. Called from gc, dynset and abort path. */ void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr) { @@ -5924,11 +6135,11 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); -/* Only called from commit path, nft_setelem_data_deactivate() already deals - * with the refcounting from the preparation phase. +/* Destroy element. References have been already dropped in the preparation + * path via nft_setelem_data_deactivate(). */ -static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, - const struct nft_set *set, void *elem) +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); @@ -6491,19 +6702,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (flags) *nft_set_ext_flags(ext) = flags; + if (obj) { + *nft_set_ext_obj(ext) = obj; + obj->use++; + } if (ulen > 0) { if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { err = -EINVAL; - goto err_elem_userdata; + goto err_elem_free; } udata = nft_set_ext_userdata(ext); udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } - if (obj) { - *nft_set_ext_obj(ext) = obj; - obj->use++; - } err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs); if (err < 0) goto err_elem_free; @@ -6558,10 +6769,7 @@ err_set_full: err_element_clash: kfree(trans); err_elem_free: - if (obj) - obj->use--; -err_elem_userdata: - nf_tables_set_elem_destroy(ctx, set, elem.priv); + nft_set_elem_destroy(set, elem.priv, true); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); @@ -6605,7 +6813,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); @@ -6638,7 +6847,6 @@ static int nf_tables_newsetelem(struct sk_buff *skb, void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { struct nft_chain *chain; - struct nft_rule *rule; if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { @@ -6646,15 +6854,6 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) case NFT_GOTO: chain = data->verdict.chain; chain->use++; - - if (!nft_chain_is_bound(chain)) - break; - - chain->table->use++; - list_for_each_entry(rule, &chain->rules, list) - chain->use++; - - nft_chain_add(chain->table, chain); break; } } @@ -6889,7 +7088,9 @@ static int nf_tables_delsetelem(struct sk_buff *skb, set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); @@ -7671,6 +7872,7 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, enum nft_trans_phase phase) { switch (phase) { + case NFT_TRANS_PREPARE_ERROR: case NFT_TRANS_PREPARE: case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: @@ -8943,7 +9145,7 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) synchronize_rcu(); list_for_each_entry_safe(trans, next, &head, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nft_commit_release(trans); } } @@ -9308,6 +9510,27 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } + list_for_each_entry(trans, &nft_net->binding_list, binding_list) { + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans)) && + !nft_trans_set_bound(trans)) { + pr_warn_once("nftables ruleset with unbound set\n"); + return -EINVAL; + } + break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans)) && + !nft_trans_chain_bound(trans)) { + pr_warn_once("nftables ruleset with unbound chain\n"); + return -EINVAL; + } + break; + } + } + /* 0. Validate ruleset, otherwise roll back for error reporting. */ if (nf_tables_validate(net) < 0) return -EAGAIN; @@ -9677,7 +9900,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { - if (nft_chain_is_bound(trans->ctx.chain)) { + if (nft_trans_chain_bound(trans)) { nft_trans_destroy(trans); break; } @@ -9700,6 +9923,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: + if (nft_trans_rule_bound(trans)) { + nft_trans_destroy(trans); + break; + } trans->ctx.chain->use--; list_del_rcu(&nft_trans_rule(trans)->list); nft_rule_expr_deactivate(&trans->ctx, @@ -9734,6 +9961,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSET: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_set(trans)); + if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(&trans->ctx, nft_trans_set(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: @@ -9814,7 +10044,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nf_tables_abort_release(trans); } @@ -10263,22 +10493,12 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, static void nft_verdict_uninit(const struct nft_data *data) { struct nft_chain *chain; - struct nft_rule *rule; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; chain->use--; - - if (!nft_chain_is_bound(chain)) - break; - - chain->table->use--; - list_for_each_entry(rule, &chain->rules, list) - chain->use--; - - nft_chain_del(chain); break; } } @@ -10513,6 +10733,9 @@ static void __nft_release_table(struct net *net, struct nft_table *table) list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); table->use--; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(&ctx, set); + nft_set_destroy(&ctx, set); } list_for_each_entry_safe(obj, ne, &table->objects, list) { @@ -10597,6 +10820,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->binding_list); INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index ee6840bd5933..8f1bfa6ccc2d 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -439,3 +439,4 @@ module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c9d2f7c29f53..3d76ebfe8939 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -76,11 +76,9 @@ static int nft_immediate_init(const struct nft_ctx *ctx, switch (priv->data.verdict.code) { case NFT_JUMP: case NFT_GOTO: - if (nft_chain_is_bound(chain)) { - err = -EBUSY; - goto err1; - } - chain->bound = true; + err = nf_tables_bind_chain(ctx, chain); + if (err < 0) + return err; break; default: break; @@ -98,6 +96,31 @@ static void nft_immediate_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_activate(&chain_ctx, rule); + + nft_clear(ctx->net, chain); + break; + default: + break; + } + } return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg)); } @@ -107,6 +130,43 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_deactivate(&chain_ctx, rule, phase); + + switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nf_tables_unbind_chain(ctx, chain); + fallthrough; + case NFT_TRANS_PREPARE: + nft_deactivate_next(ctx->net, chain); + break; + default: + nft_chain_del(chain); + chain->bound = false; + chain->table->use--; + break; + } + break; + default: + break; + } + } if (phase == NFT_TRANS_COMMIT) return; @@ -131,15 +191,27 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, case NFT_GOTO: chain = data->verdict.chain; - if (!nft_chain_is_bound(chain)) + if (!nft_chain_binding(chain)) + break; + + /* Rule construction failed, but chain is already bound: + * let the transaction records release this chain and its rules. + */ + if (chain->bound) { + chain->use--; break; + } + /* Rule has been deleted, release chain and its rules. */ chain_ctx = *ctx; chain_ctx.chain = chain; - list_for_each_entry_safe(rule, n, &chain->rules, list) - nf_tables_rule_release(&chain_ctx, rule); - + chain->use--; + list_for_each_entry_safe(rule, n, &chain->rules, list) { + chain->use--; + list_del(&rule->list); + nf_tables_rule_destroy(&chain_ctx, rule); + } nf_tables_chain_destroy(&chain_ctx); break; default: diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 96081ac8d2b4..1e5e7a181e0b 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -271,13 +271,14 @@ static int nft_bitmap_init(const struct nft_set *set, return 0; } -static void nft_bitmap_destroy(const struct nft_set *set) +static void nft_bitmap_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be, *n; list_for_each_entry_safe(be, n, &priv->list, head) - nft_set_elem_destroy(set, be, true); + nf_tables_set_elem_destroy(ctx, set, be); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 76de6c8d9865..0b73cb0e752f 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -400,19 +400,31 @@ static int nft_rhash_init(const struct nft_set *set, return 0; } +struct nft_rhash_ctx { + const struct nft_ctx ctx; + const struct nft_set *set; +}; + static void nft_rhash_elem_destroy(void *ptr, void *arg) { - nft_set_elem_destroy(arg, ptr, true); + struct nft_rhash_ctx *rhash_ctx = arg; + + nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, ptr); } -static void nft_rhash_destroy(const struct nft_set *set) +static void nft_rhash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_ctx rhash_ctx = { + .ctx = *ctx, + .set = set, + }; cancel_delayed_work_sync(&priv->gc_work); rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, - (void *)set); + (void *)&rhash_ctx); } /* Number of buckets is stored in u32, so cap our result to 1U<<31 */ @@ -643,7 +655,8 @@ static int nft_hash_init(const struct nft_set *set, return 0; } -static void nft_hash_destroy(const struct nft_set *set) +static void nft_hash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; @@ -653,7 +666,7 @@ static void nft_hash_destroy(const struct nft_set *set) for (i = 0; i < priv->buckets; i++) { hlist_for_each_entry_safe(he, next, &priv->table[i], node) { hlist_del_rcu(&he->node); - nft_set_elem_destroy(set, he, true); + nf_tables_set_elem_destroy(ctx, set, he); } } } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 15e451dc3fc4..0452ee586c1c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1974,12 +1974,16 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); struct nft_pipapo_match *m; struct nft_pipapo_field *f; int i, r; rcu_read_lock(); - m = rcu_dereference(priv->match); + if (iter->genmask == nft_genmask_cur(net)) + m = rcu_dereference(priv->match); + else + m = priv->clone; if (unlikely(!m)) goto out; @@ -2148,10 +2152,12 @@ out_scratch: /** * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array + * @ctx: context * @set: nftables API set representation * @m: matching data pointing to key mapping array */ -static void nft_set_pipapo_match_destroy(const struct nft_set *set, +static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo_field *f; @@ -2168,15 +2174,17 @@ static void nft_set_pipapo_match_destroy(const struct nft_set *set, e = f->mt[r].e; - nft_set_elem_destroy(set, e, true); + nf_tables_set_elem_destroy(ctx, set, e); } } /** * nft_pipapo_destroy() - Free private data for set and all committed elements + * @ctx: context * @set: nftables API set representation */ -static void nft_pipapo_destroy(const struct nft_set *set) +static void nft_pipapo_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; @@ -2186,7 +2194,7 @@ static void nft_pipapo_destroy(const struct nft_set *set) if (m) { rcu_barrier(); - nft_set_pipapo_match_destroy(set, m); + nft_set_pipapo_match_destroy(ctx, set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(m->scratch_aligned); @@ -2203,7 +2211,7 @@ static void nft_pipapo_destroy(const struct nft_set *set) m = priv->clone; if (priv->dirty) - nft_set_pipapo_match_destroy(set, m); + nft_set_pipapo_match_destroy(ctx, set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(priv->clone->scratch_aligned); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 2f114aa10f1a..5c05c9b990fb 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -664,7 +664,8 @@ static int nft_rbtree_init(const struct nft_set *set, return 0; } -static void nft_rbtree_destroy(const struct nft_set *set) +static void nft_rbtree_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; @@ -675,7 +676,7 @@ static void nft_rbtree_destroy(const struct nft_set *set) while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_set_elem_destroy(set, rbe, true); + nf_tables_set_elem_destroy(ctx, set, rbe); } } diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index e1990baf3a3b..dc9485854002 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -71,4 +71,3 @@ MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Passive OS fingerprint matching."); MODULE_ALIAS("ipt_osf"); MODULE_ALIAS("ip6t_osf"); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 6ef3021e1169..e79be1b3e74d 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -966,6 +966,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, if (ret < 0) return ret; + sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; old_loss_model = q->loss_model; @@ -974,7 +975,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); if (ret) { q->loss_model = old_loss_model; - return ret; + goto unlock; } } else { q->loss_model = CLG_RANDOM; @@ -1041,6 +1042,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); +unlock: + sch_tree_unlock(sch); return ret; get_table_failure: @@ -1050,7 +1053,8 @@ get_table_failure: */ q->clg = old_clg; q->loss_model = old_loss_model; - return ret; + + goto unlock; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 79967b6925bd..e7c101290425 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -109,15 +109,15 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp) switch (*ip) { case SVC_POOL_AUTO: - return strlcpy(buf, "auto\n", 20); + return sysfs_emit(buf, "auto\n"); case SVC_POOL_GLOBAL: - return strlcpy(buf, "global\n", 20); + return sysfs_emit(buf, "global\n"); case SVC_POOL_PERCPU: - return strlcpy(buf, "percpu\n", 20); + return sysfs_emit(buf, "percpu\n"); case SVC_POOL_PERNODE: - return strlcpy(buf, "pernode\n", 20); + return sysfs_emit(buf, "pernode\n"); default: - return sprintf(buf, "%d\n", *ip); + return sysfs_emit(buf, "%d\n", *ip); } } @@ -597,34 +597,25 @@ svc_destroy(struct kref *ref) } EXPORT_SYMBOL_GPL(svc_destroy); -/* - * Allocate an RPC server's buffer space. - * We allocate pages and place them in rq_pages. - */ -static int +static bool svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) { - unsigned int pages, arghi; + unsigned long pages, ret; /* bc_xprt uses fore channel allocated buffers */ if (svc_is_backchannel(rqstp)) - return 1; + return true; pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply. * We assume one is at most one page */ - arghi = 0; WARN_ON_ONCE(pages > RPCSVC_MAXPAGES); if (pages > RPCSVC_MAXPAGES) pages = RPCSVC_MAXPAGES; - while (pages) { - struct page *p = alloc_pages_node(node, GFP_KERNEL, 0); - if (!p) - break; - rqstp->rq_pages[arghi++] = p; - pages--; - } - return pages == 0; + + ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages, + rqstp->rq_pages); + return ret == pages; } /* @@ -1173,6 +1164,7 @@ static void __svc_unregister(struct net *net, const u32 program, const u32 versi */ static void svc_unregister(const struct svc_serv *serv, struct net *net) { + struct sighand_struct *sighand; struct svc_program *progp; unsigned long flags; unsigned int i; @@ -1189,9 +1181,12 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net) } } - spin_lock_irqsave(¤t->sighand->siglock, flags); + rcu_read_lock(); + sighand = rcu_dereference(current->sighand); + spin_lock_irqsave(&sighand->siglock, flags); recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); + spin_unlock_irqrestore(&sighand->siglock, flags); + rcu_read_unlock(); } /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 13a14897bc17..62c7919ea610 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -74,13 +74,18 @@ static LIST_HEAD(svc_xprt_class_list); * that no other thread will be using the transport or will * try to set XPT_DEAD. */ + +/** + * svc_reg_xprt_class - Register a server-side RPC transport class + * @xcl: New transport class to be registered + * + * Returns zero on success; otherwise a negative errno is returned. + */ int svc_reg_xprt_class(struct svc_xprt_class *xcl) { struct svc_xprt_class *cl; int res = -EEXIST; - dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name); - INIT_LIST_HEAD(&xcl->xcl_list); spin_lock(&svc_xprt_class_lock); /* Make sure there isn't already a class with the same name */ @@ -96,9 +101,13 @@ out: } EXPORT_SYMBOL_GPL(svc_reg_xprt_class); +/** + * svc_unreg_xprt_class - Unregister a server-side RPC transport class + * @xcl: Transport class to be unregistered + * + */ void svc_unreg_xprt_class(struct svc_xprt_class *xcl) { - dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name); spin_lock(&svc_xprt_class_lock); list_del_init(&xcl->xcl_list); spin_unlock(&svc_xprt_class_lock); @@ -685,8 +694,9 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) } for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk_array(GFP_KERNEL, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk_array_node(GFP_KERNEL, + rqstp->rq_pool->sp_id, + pages, rqstp->rq_pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; @@ -843,15 +853,11 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) svc_xprt_received(xprt); } else if (svc_xprt_reserve_slot(rqstp, xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ - dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", - rqstp, rqstp->rq_pool->sp_id, xprt, - kref_read(&xprt->xpt_ref)); rqstp->rq_deferred = svc_deferred_dequeue(xprt); if (rqstp->rq_deferred) len = svc_deferred_recv(rqstp); else len = xprt->xpt_ops->xpo_recvfrom(rqstp); - rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); } else @@ -894,6 +900,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) err = -EAGAIN; if (len <= 0) goto out_release; + trace_svc_xdr_recvfrom(&rqstp->rq_arg); clear_bit(XPT_OLD, &xprt->xpt_flags); @@ -902,6 +909,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; + rqstp->rq_stime = ktime_get(); return len; out_release: rqstp->rq_res.len = 0; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f77cebe2c071..5f519fc0541b 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -826,12 +826,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) trace_sk_data_ready(sk); - if (svsk) { - /* Refer to svc_setup_socket() for details. */ - rmb(); - svsk->sk_odata(sk); - } - /* * This callback may called twice when a new connection * is established as a child socket inherits everything @@ -840,13 +834,18 @@ static void svc_tcp_listen_data_ready(struct sock *sk) * when one of child sockets become ESTABLISHED. * 2) data_ready method of the child socket may be called * when it receives data before the socket is accepted. - * In case of 2, we should ignore it silently. + * In case of 2, we should ignore it silently and DO NOT + * dereference svsk. */ - if (sk->sk_state == TCP_LISTEN) { - if (svsk) { - set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } + if (sk->sk_state != TCP_LISTEN) + return; + + if (svsk) { + /* Refer to svc_setup_socket() for details. */ + rmb(); + svsk->sk_odata(sk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } } @@ -887,13 +886,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_accept(sock, &newsock, O_NONBLOCK); if (err < 0) { - if (err == -ENOMEM) - printk(KERN_WARNING "%s: no more sockets!\n", - serv->sv_name); - else if (err != -EAGAIN) - net_warn_ratelimited("%s: accept failed (err %d)!\n", - serv->sv_name, -err); - trace_svcsock_accept_err(xprt, serv->sv_name, err); + if (err != -EAGAIN) + trace_svcsock_accept_err(xprt, serv->sv_name, err); return NULL; } if (IS_ERR(sock_alloc_file(newsock, O_NONBLOCK, NULL))) @@ -1464,7 +1458,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_owspace = inet->sk_write_space; /* * This barrier is necessary in order to prevent race condition - * with svc_data_ready(), svc_listen_data_ready() and others + * with svc_data_ready(), svc_tcp_listen_data_ready(), and others * when calling callbacks above. */ wmb(); @@ -1476,7 +1470,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); - trace_svcsock_new_socket(sock); + trace_svcsock_new(svsk, sock); return svsk; } @@ -1657,6 +1651,8 @@ static void svc_sock_free(struct svc_xprt *xprt) struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct socket *sock = svsk->sk_sock; + trace_svcsock_free(svsk, sock); + tls_handshake_cancel(sock->sk); if (sock->file) sockfd_put(sock); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 36835b2f5446..2a22e78af116 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1070,22 +1070,22 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) } EXPORT_SYMBOL_GPL(xdr_reserve_space); - /** * xdr_reserve_space_vec - Reserves a large amount of buffer space for sending * @xdr: pointer to xdr_stream - * @vec: pointer to a kvec array * @nbytes: number of bytes to reserve * - * Reserves enough buffer space to encode 'nbytes' of data and stores the - * pointers in 'vec'. The size argument passed to xdr_reserve_space() is - * determined based on the number of bytes remaining in the current page to - * avoid invalidating iov_base pointers when xdr_commit_encode() is called. + * The size argument passed to xdr_reserve_space() is determined based + * on the number of bytes remaining in the current page to avoid + * invalidating iov_base pointers when xdr_commit_encode() is called. + * + * Return values: + * %0: success + * %-EMSGSIZE: not enough space is available in @xdr */ -int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes) +int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes) { - int thislen; - int v = 0; + size_t thislen; __be32 *p; /* @@ -1097,21 +1097,19 @@ int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbyte xdr->end = xdr->p; } + /* XXX: Let's find a way to make this more efficient */ while (nbytes) { thislen = xdr->buf->page_len % PAGE_SIZE; thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen); p = xdr_reserve_space(xdr, thislen); if (!p) - return -EIO; + return -EMSGSIZE; - vec[v].iov_base = p; - vec[v].iov_len = thislen; - v++; nbytes -= thislen; } - return v; + return 0; } EXPORT_SYMBOL_GPL(xdr_reserve_space_vec); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index aa2227a7e552..7420a2c990c7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -93,13 +93,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, */ get_page(virt_to_page(rqst->rq_buffer)); sctxt->sc_send_wr.opcode = IB_WR_SEND; - ret = svc_rdma_send(rdma, sctxt); - if (ret < 0) - return ret; - - ret = wait_for_completion_killable(&sctxt->sc_done); - svc_rdma_send_ctxt_put(rdma, sctxt); - return ret; + return svc_rdma_send(rdma, sctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index a22fe7587fa6..85c8bcaebb80 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -125,14 +125,15 @@ static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { + int node = ibdev_to_node(rdma->sc_cm_id->device); struct svc_rdma_recv_ctxt *ctxt; dma_addr_t addr; void *buffer; - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + ctxt = kmalloc_node(sizeof(*ctxt), GFP_KERNEL, node); if (!ctxt) goto fail0; - buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail1; addr = ib_dma_map_single(rdma->sc_pd->device, buffer, @@ -155,7 +156,6 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->rc_recv_sge.length = rdma->sc_max_req_size; ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey; ctxt->rc_recv_buf = buffer; - ctxt->rc_temp = false; return ctxt; fail2: @@ -232,10 +232,7 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, pcl_free(&ctxt->rc_write_pcl); pcl_free(&ctxt->rc_reply_pcl); - if (!ctxt->rc_temp) - llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); - else - svc_rdma_recv_ctxt_destroy(rdma, ctxt); + llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); } /** @@ -258,7 +255,7 @@ void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt) } static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, - unsigned int wanted, bool temp) + unsigned int wanted) { const struct ib_recv_wr *bad_wr = NULL; struct svc_rdma_recv_ctxt *ctxt; @@ -275,7 +272,6 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, break; trace_svcrdma_post_recv(ctxt); - ctxt->rc_temp = temp; ctxt->rc_recv_wr.next = recv_chain; recv_chain = &ctxt->rc_recv_wr; rdma->sc_pending_recvs++; @@ -309,7 +305,7 @@ err_free: */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { - return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true); + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests); } /** @@ -343,7 +339,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) * client reconnects. */ if (rdma->sc_pending_recvs < rdma->sc_max_requests) - if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false)) + if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch)) goto dropped; /* All wc fields are now known to be valid */ @@ -775,9 +771,6 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, * * The next ctxt is removed from the "receive" lists. * - * - If the ctxt completes a Read, then finish assembling the Call - * message and return the number of bytes in the message. - * * - If the ctxt completes a Receive, then construct the Call * message from the contents of the Receive buffer. * @@ -786,7 +779,8 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, * in the message. * * - If there are Read chunks in this message, post Read WRs to - * pull that payload and return 0. + * pull that payload. When the Read WRs complete, build the + * full message and return the number of bytes in it. */ int svc_rdma_recvfrom(struct svc_rqst *rqstp) { @@ -796,6 +790,12 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_recv_ctxt *ctxt; int ret; + /* Prevent svc_xprt_release() from releasing pages in rq_pages + * when returning 0 or an error. + */ + rqstp->rq_respages = rqstp->rq_pages; + rqstp->rq_next_page = rqstp->rq_respages; + rqstp->rq_xprt_ctxt = NULL; ctxt = NULL; @@ -819,12 +819,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); - /* Prevent svc_xprt_release from releasing pages in rq_pages - * if we return 0 or an error. - */ - rqstp->rq_respages = rqstp->rq_pages; - rqstp->rq_next_page = rqstp->rq_respages; - ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); if (ret < 0) goto out_err; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 11cf7c646644..e460e25a1d6d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -62,8 +62,8 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) if (node) { ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), - GFP_KERNEL); + ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), + GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device)); if (!ctxt) goto out_noctx; @@ -84,8 +84,7 @@ out_noctx: return NULL; } -static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, - struct svc_rdma_rw_ctxt *ctxt, +static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, struct llist_head *list) { sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); @@ -95,7 +94,7 @@ static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, struct svc_rdma_rw_ctxt *ctxt) { - __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts); + __svc_rdma_put_rw_ctxt(ctxt, &rdma->sc_rw_ctxts); } /** @@ -191,6 +190,8 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, struct svc_rdma_rw_ctxt *ctxt; LLIST_HEAD(free); + trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount); + first = last = NULL; while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); @@ -198,7 +199,7 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, ctxt->rw_nents, dir); - __svc_rdma_put_rw_ctxt(rdma, ctxt, &free); + __svc_rdma_put_rw_ctxt(ctxt, &free); ctxt->rw_node.next = first; first = &ctxt->rw_node; @@ -234,7 +235,8 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, { struct svc_rdma_write_info *info; - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kmalloc_node(sizeof(*info), GFP_KERNEL, + ibdev_to_node(rdma->sc_cm_id->device)); if (!info) return info; @@ -304,7 +306,8 @@ svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) { struct svc_rdma_read_info *info; - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kmalloc_node(sizeof(*info), GFP_KERNEL, + ibdev_to_node(rdma->sc_cm_id->device)); if (!info) return info; @@ -351,8 +354,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) return; } -/* This function sleeps when the transport's Send Queue is congested. - * +/* * Assumptions: * - If ib_post_send() succeeds, only one completion is expected, * even if one or more WRs are flushed. This is true when posting @@ -367,6 +369,8 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) struct ib_cqe *cqe; int ret; + might_sleep(); + if (cc->cc_sqecount > rdma->sc_sq_depth) return -EINVAL; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 22a871e6fe4d..c6644cca52c5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -123,18 +123,17 @@ static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { + int node = ibdev_to_node(rdma->sc_cm_id->device); struct svc_rdma_send_ctxt *ctxt; dma_addr_t addr; void *buffer; - size_t size; int i; - size = sizeof(*ctxt); - size += rdma->sc_max_send_sges * sizeof(struct ib_sge); - ctxt = kmalloc(size, GFP_KERNEL); + ctxt = kmalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges), + GFP_KERNEL, node); if (!ctxt) goto fail0; - buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail1; addr = ib_dma_map_single(rdma->sc_pd->device, buffer, @@ -148,7 +147,6 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; - init_completion(&ctxt->sc_done); ctxt->sc_cqe.done = svc_rdma_wc_send; ctxt->sc_xprt_buf = buffer; xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, @@ -214,6 +212,7 @@ out: ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; + ctxt->sc_page_count = 0; return ctxt; out_empty: @@ -228,6 +227,8 @@ out_empty: * svc_rdma_send_ctxt_put - Return send_ctxt to free list * @rdma: controlling svcxprt_rdma * @ctxt: object to return to the free list + * + * Pages left in sc_pages are DMA unmapped and released. */ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) @@ -235,6 +236,9 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + if (ctxt->sc_page_count) + release_pages(ctxt->sc_pages, ctxt->sc_page_count); + /* The first SGE contains the transport header, which * remains mapped until @ctxt is destroyed. */ @@ -281,12 +285,12 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); svc_rdma_wake_send_waiters(rdma, 1); - complete(&ctxt->sc_done); if (unlikely(wc->status != IB_WC_SUCCESS)) goto flushed; trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + svc_rdma_send_ctxt_put(rdma, ctxt); return; flushed: @@ -294,6 +298,7 @@ flushed: trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid); else trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid); + svc_rdma_send_ctxt_put(rdma, ctxt); svc_xprt_deferred_close(&rdma->sc_xprt); } @@ -310,7 +315,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) struct ib_send_wr *wr = &ctxt->sc_send_wr; int ret; - reinit_completion(&ctxt->sc_done); + might_sleep(); /* Sync the transport header buffer */ ib_dma_sync_single_for_device(rdma->sc_pd->device, @@ -799,6 +804,25 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, svc_rdma_xb_dma_map, &args); } +/* The svc_rqst and all resources it owns are released as soon as + * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt + * so they are released by the Send completion handler. + */ +static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, + struct svc_rdma_send_ctxt *ctxt) +{ + int i, pages = rqstp->rq_next_page - rqstp->rq_respages; + + ctxt->sc_page_count += pages; + for (i = 0; i < pages; i++) { + ctxt->sc_pages[i] = rqstp->rq_respages[i]; + rqstp->rq_respages[i] = NULL; + } + + /* Prevent svc_xprt_release from releasing pages in rq_pages */ + rqstp->rq_next_page = rqstp->rq_respages; +} + /* Prepare the portion of the RPC Reply that will be transmitted * via RDMA Send. The RPC-over-RDMA transport header is prepared * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. @@ -828,6 +852,8 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, if (ret < 0) return ret; + svc_rdma_save_io_pages(rqstp, sctxt); + if (rctxt->rc_inv_rkey) { sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey; @@ -835,13 +861,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.opcode = IB_WR_SEND; } - ret = svc_rdma_send(rdma, sctxt); - if (ret < 0) - return ret; - - ret = wait_for_completion_killable(&sctxt->sc_done); - svc_rdma_send_ctxt_put(rdma, sctxt); - return ret; + return svc_rdma_send(rdma, sctxt); } /** @@ -907,8 +927,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; if (svc_rdma_send(rdma, sctxt)) goto put_ctxt; - - wait_for_completion_killable(&sctxt->sc_done); + return; put_ctxt: svc_rdma_send_ctxt_put(rdma, sctxt); @@ -976,17 +995,16 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); if (ret < 0) goto put_ctxt; - - /* Prevent svc_xprt_release() from releasing the page backing - * rq_res.head[0].iov_base. It's no longer being accessed by - * the I/O device. */ - rqstp->rq_respages++; return 0; reply_chunk: if (ret != -E2BIG && ret != -EINVAL) goto put_ctxt; + /* Send completion releases payload pages that were part + * of previously posted RDMA Writes. + */ + svc_rdma_save_io_pages(rqstp, sctxt); svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index ca04f7a6a085..2abd895046ee 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -64,7 +64,7 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, - struct net *net); + struct net *net, int node); static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -123,14 +123,14 @@ static void qp_event_handler(struct ib_event *event, void *context) } static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, - struct net *net) + struct net *net, int node) { - struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); + struct svcxprt_rdma *cma_xprt; - if (!cma_xprt) { - dprintk("svcrdma: failed to create new transport\n"); + cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node); + if (!cma_xprt) return NULL; - } + svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); @@ -193,9 +193,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, struct svcxprt_rdma *newxprt; struct sockaddr *sa; - /* Create a new transport */ newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, - listen_xprt->sc_xprt.xpt_net); + listen_xprt->sc_xprt.xpt_net, + ibdev_to_node(new_cma_id->device)); if (!newxprt) return; newxprt->sc_cm_id = new_cma_id; @@ -304,7 +304,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6) return ERR_PTR(-EAFNOSUPPORT); - cma_xprt = svc_rdma_create_xprt(serv, net); + cma_xprt = svc_rdma_create_xprt(serv, net, NUMA_NO_NODE); if (!cma_xprt) return ERR_PTR(-ENOMEM); set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 39fb91ff23d9..815b38080401 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -131,6 +131,7 @@ struct sec_path *secpath_set(struct sk_buff *skb) memset(sp->ovec, 0, sizeof(sp->ovec)); sp->olen = 0; sp->len = 0; + sp->verified_cnt = 0; return sp; } @@ -330,11 +331,10 @@ xfrm_inner_mode_encap_remove(struct xfrm_state *x, { switch (x->props.mode) { case XFRM_MODE_BEET: - switch (XFRM_MODE_SKB_CB(skb)->protocol) { - case IPPROTO_IPIP: - case IPPROTO_BEETPH: + switch (x->sel.family) { + case AF_INET: return xfrm4_remove_beet_encap(x, skb); - case IPPROTO_IPV6: + case AF_INET6: return xfrm6_remove_beet_encap(x, skb); } break; diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 1f99dc469027..35279c220bd7 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -310,6 +310,52 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->mark = 0; } +static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type, unsigned short family) +{ + struct sec_path *sp; + + sp = skb_sec_path(skb); + if (sp && (sp->len || sp->olen) && + !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + goto discard; + + XFRM_SPI_SKB_CB(skb)->family = family; + if (family == AF_INET) { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + } else { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + } + + return xfrm_input(skb, nexthdr, spi, encap_type); +discard: + kfree_skb(skb); + return 0; +} + +static int xfrmi4_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); +} + +static int xfrmi6_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], + 0, 0, AF_INET6); +} + +static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); +} + +static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); +} + static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; @@ -945,8 +991,8 @@ static struct pernet_operations xfrmi_net_ops = { }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, + .handler = xfrmi6_rcv, + .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -996,8 +1042,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, + .handler = xfrmi4_rcv, + .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 6d15788b5123..e7617c9959c3 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1831,6 +1831,7 @@ again: __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); @@ -1869,6 +1870,7 @@ again: __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); @@ -3349,6 +3351,13 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { + if (idx < sp->verified_cnt) { + /* Secpath entry previously verified, consider optional and + * continue searching + */ + continue; + } + if (start == -1) start = -2-idx; break; @@ -3723,6 +3732,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. + * Upon success, marks secpath entries as having been + * verified to allow them to be skipped in future policy + * checks (e.g. nested tunnels). */ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); @@ -3741,6 +3753,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } xfrm_pols_put(pols, npols); + sp->verified_cnt = k; + return 1; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); diff --git a/rust/alloc/README.md b/rust/alloc/README.md index c89c753720b5..eb6f22e94ebf 100644 --- a/rust/alloc/README.md +++ b/rust/alloc/README.md @@ -10,6 +10,9 @@ upstream. In general, only additions should be performed (e.g. new methods). Eventually, changes should make it into upstream so that, at some point, this fork can be dropped from the kernel tree. +The Rust upstream version on top of which these files are based matches +the output of `scripts/min-tool-version.sh rustc`. + ## Rationale diff --git a/rust/alloc/alloc.rs b/rust/alloc/alloc.rs index ca224a541770..acf22d45e6f2 100644 --- a/rust/alloc/alloc.rs +++ b/rust/alloc/alloc.rs @@ -22,21 +22,24 @@ use core::marker::Destruct; mod tests; extern "Rust" { - // These are the magic symbols to call the global allocator. rustc generates + // These are the magic symbols to call the global allocator. rustc generates // them to call `__rg_alloc` etc. if there is a `#[global_allocator]` attribute // (the code expanding that attribute macro generates those functions), or to call - // the default implementations in libstd (`__rdl_alloc` etc. in `library/std/src/alloc.rs`) + // the default implementations in std (`__rdl_alloc` etc. in `library/std/src/alloc.rs`) // otherwise. - // The rustc fork of LLVM also special-cases these function names to be able to optimize them + // The rustc fork of LLVM 14 and earlier also special-cases these function names to be able to optimize them // like `malloc`, `realloc`, and `free`, respectively. #[rustc_allocator] - #[rustc_allocator_nounwind] + #[rustc_nounwind] fn __rust_alloc(size: usize, align: usize) -> *mut u8; - #[rustc_allocator_nounwind] + #[rustc_deallocator] + #[rustc_nounwind] fn __rust_dealloc(ptr: *mut u8, size: usize, align: usize); - #[rustc_allocator_nounwind] + #[rustc_reallocator] + #[rustc_nounwind] fn __rust_realloc(ptr: *mut u8, old_size: usize, align: usize, new_size: usize) -> *mut u8; - #[rustc_allocator_nounwind] + #[rustc_allocator_zeroed] + #[rustc_nounwind] fn __rust_alloc_zeroed(size: usize, align: usize) -> *mut u8; } @@ -72,11 +75,14 @@ pub use std::alloc::Global; /// # Examples /// /// ``` -/// use std::alloc::{alloc, dealloc, Layout}; +/// use std::alloc::{alloc, dealloc, handle_alloc_error, Layout}; /// /// unsafe { /// let layout = Layout::new::<u16>(); /// let ptr = alloc(layout); +/// if ptr.is_null() { +/// handle_alloc_error(layout); +/// } /// /// *(ptr as *mut u16) = 42; /// assert_eq!(*(ptr as *mut u16), 42); @@ -349,7 +355,7 @@ pub(crate) const unsafe fn box_free<T: ?Sized, A: ~const Allocator + ~const Dest #[cfg(not(no_global_oom_handling))] extern "Rust" { - // This is the magic symbol to call the global alloc error handler. rustc generates + // This is the magic symbol to call the global alloc error handler. rustc generates // it to call `__rg_oom` if there is a `#[alloc_error_handler]`, or to call the // default implementations below (`__rdl_oom`) otherwise. fn __rust_alloc_error_handler(size: usize, align: usize) -> !; @@ -394,25 +400,24 @@ pub use std::alloc::handle_alloc_error; #[allow(unused_attributes)] #[unstable(feature = "alloc_internals", issue = "none")] pub mod __alloc_error_handler { - use crate::alloc::Layout; - - // called via generated `__rust_alloc_error_handler` - - // if there is no `#[alloc_error_handler]` + // called via generated `__rust_alloc_error_handler` if there is no + // `#[alloc_error_handler]`. #[rustc_std_internal_symbol] - pub unsafe extern "C-unwind" fn __rdl_oom(size: usize, _align: usize) -> ! { - panic!("memory allocation of {size} bytes failed") - } - - // if there is an `#[alloc_error_handler]` - #[rustc_std_internal_symbol] - pub unsafe extern "C-unwind" fn __rg_oom(size: usize, align: usize) -> ! { - let layout = unsafe { Layout::from_size_align_unchecked(size, align) }; + pub unsafe fn __rdl_oom(size: usize, _align: usize) -> ! { extern "Rust" { - #[lang = "oom"] - fn oom_impl(layout: Layout) -> !; + // This symbol is emitted by rustc next to __rust_alloc_error_handler. + // Its value depends on the -Zoom={panic,abort} compiler option. + static __rust_alloc_error_handler_should_panic: u8; + } + + #[allow(unused_unsafe)] + if unsafe { __rust_alloc_error_handler_should_panic != 0 } { + panic!("memory allocation of {size} bytes failed") + } else { + core::panicking::panic_nounwind_fmt(format_args!( + "memory allocation of {size} bytes failed" + )) } - unsafe { oom_impl(layout) } } } diff --git a/rust/alloc/boxed.rs b/rust/alloc/boxed.rs index dcfe87b14f3a..14af9860c36c 100644 --- a/rust/alloc/boxed.rs +++ b/rust/alloc/boxed.rs @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT -//! A pointer type for heap allocation. +//! The `Box<T>` type for heap allocation. //! //! [`Box<T>`], casually referred to as a 'box', provides the simplest form of //! heap allocation in Rust. Boxes provide ownership for this allocation, and @@ -124,7 +124,21 @@ //! definition is just using `T*` can lead to undefined behavior, as //! described in [rust-lang/unsafe-code-guidelines#198][ucg#198]. //! +//! # Considerations for unsafe code +//! +//! **Warning: This section is not normative and is subject to change, possibly +//! being relaxed in the future! It is a simplified summary of the rules +//! currently implemented in the compiler.** +//! +//! The aliasing rules for `Box<T>` are the same as for `&mut T`. `Box<T>` +//! asserts uniqueness over its content. Using raw pointers derived from a box +//! after that box has been mutated through, moved or borrowed as `&mut T` +//! is not allowed. For more guidance on working with box from unsafe code, see +//! [rust-lang/unsafe-code-guidelines#326][ucg#326]. +//! +//! //! [ucg#198]: https://github.com/rust-lang/unsafe-code-guidelines/issues/198 +//! [ucg#326]: https://github.com/rust-lang/unsafe-code-guidelines/issues/326 //! [dereferencing]: core::ops::Deref //! [`Box::<T>::from_raw(value)`]: Box::from_raw //! [`Global`]: crate::alloc::Global @@ -139,12 +153,14 @@ use core::async_iter::AsyncIterator; use core::borrow; use core::cmp::Ordering; use core::convert::{From, TryFrom}; +use core::error::Error; use core::fmt; use core::future::Future; use core::hash::{Hash, Hasher}; #[cfg(not(no_global_oom_handling))] use core::iter::FromIterator; use core::iter::{FusedIterator, Iterator}; +use core::marker::Tuple; use core::marker::{Destruct, Unpin, Unsize}; use core::mem; use core::ops::{ @@ -163,6 +179,8 @@ use crate::raw_vec::RawVec; #[cfg(not(no_global_oom_handling))] use crate::str::from_boxed_utf8_unchecked; #[cfg(not(no_global_oom_handling))] +use crate::string::String; +#[cfg(not(no_global_oom_handling))] use crate::vec::Vec; #[cfg(not(no_thin))] @@ -172,7 +190,7 @@ pub use thin::ThinBox; #[cfg(not(no_thin))] mod thin; -/// A pointer type for heap allocation. +/// A pointer type that uniquely owns a heap allocation of type `T`. /// /// See the [module-level documentation](../../std/boxed/index.html) for more. #[lang = "owned_box"] @@ -196,12 +214,13 @@ impl<T> Box<T> { /// ``` /// let five = Box::new(5); /// ``` - #[cfg(not(no_global_oom_handling))] + #[cfg(all(not(no_global_oom_handling)))] #[inline(always)] #[stable(feature = "rust1", since = "1.0.0")] #[must_use] pub fn new(x: T) -> Self { - box x + #[rustc_box] + Box::new(x) } /// Constructs a new box with uninitialized contents. @@ -256,14 +275,21 @@ impl<T> Box<T> { Self::new_zeroed_in(Global) } - /// Constructs a new `Pin<Box<T>>`. If `T` does not implement `Unpin`, then + /// Constructs a new `Pin<Box<T>>`. If `T` does not implement [`Unpin`], then /// `x` will be pinned in memory and unable to be moved. + /// + /// Constructing and pinning of the `Box` can also be done in two steps: `Box::pin(x)` + /// does the same as <code>[Box::into_pin]\([Box::new]\(x))</code>. Consider using + /// [`into_pin`](Box::into_pin) if you already have a `Box<T>`, or if you want to + /// construct a (pinned) `Box` in a different way than with [`Box::new`]. #[cfg(not(no_global_oom_handling))] #[stable(feature = "pin", since = "1.33.0")] #[must_use] #[inline(always)] pub fn pin(x: T) -> Pin<Box<T>> { - (box x).into() + (#[rustc_box] + Box::new(x)) + .into() } /// Allocates memory on the heap then places `x` into it, @@ -543,8 +569,13 @@ impl<T, A: Allocator> Box<T, A> { unsafe { Ok(Box::from_raw_in(ptr.as_ptr(), alloc)) } } - /// Constructs a new `Pin<Box<T, A>>`. If `T` does not implement `Unpin`, then + /// Constructs a new `Pin<Box<T, A>>`. If `T` does not implement [`Unpin`], then /// `x` will be pinned in memory and unable to be moved. + /// + /// Constructing and pinning of the `Box` can also be done in two steps: `Box::pin_in(x, alloc)` + /// does the same as <code>[Box::into_pin]\([Box::new_in]\(x, alloc))</code>. Consider using + /// [`into_pin`](Box::into_pin) if you already have a `Box<T, A>`, or if you want to + /// construct a (pinned) `Box` in a different way than with [`Box::new_in`]. #[cfg(not(no_global_oom_handling))] #[unstable(feature = "allocator_api", issue = "32838")] #[rustc_const_unstable(feature = "const_box", issue = "92521")] @@ -926,6 +957,7 @@ impl<T: ?Sized> Box<T> { /// [`Layout`]: crate::Layout #[stable(feature = "box_raw", since = "1.4.0")] #[inline] + #[must_use = "call `drop(Box::from_raw(ptr))` if you intend to drop the `Box`"] pub unsafe fn from_raw(raw: *mut T) -> Self { unsafe { Self::from_raw_in(raw, Global) } } @@ -1160,19 +1192,44 @@ impl<T: ?Sized, A: Allocator> Box<T, A> { unsafe { &mut *mem::ManuallyDrop::new(b).0.as_ptr() } } - /// Converts a `Box<T>` into a `Pin<Box<T>>` + /// Converts a `Box<T>` into a `Pin<Box<T>>`. If `T` does not implement [`Unpin`], then + /// `*boxed` will be pinned in memory and unable to be moved. /// /// This conversion does not allocate on the heap and happens in place. /// /// This is also available via [`From`]. - #[unstable(feature = "box_into_pin", issue = "62370")] + /// + /// Constructing and pinning a `Box` with <code>Box::into_pin([Box::new]\(x))</code> + /// can also be written more concisely using <code>[Box::pin]\(x)</code>. + /// This `into_pin` method is useful if you already have a `Box<T>`, or you are + /// constructing a (pinned) `Box` in a different way than with [`Box::new`]. + /// + /// # Notes + /// + /// It's not recommended that crates add an impl like `From<Box<T>> for Pin<T>`, + /// as it'll introduce an ambiguity when calling `Pin::from`. + /// A demonstration of such a poor impl is shown below. + /// + /// ```compile_fail + /// # use std::pin::Pin; + /// struct Foo; // A type defined in this crate. + /// impl From<Box<()>> for Pin<Foo> { + /// fn from(_: Box<()>) -> Pin<Foo> { + /// Pin::new(Foo) + /// } + /// } + /// + /// let foo = Box::new(()); + /// let bar = Pin::from(foo); + /// ``` + #[stable(feature = "box_into_pin", since = "1.63.0")] #[rustc_const_unstable(feature = "const_box", issue = "92521")] pub const fn into_pin(boxed: Self) -> Pin<Self> where A: 'static, { // It's not possible to move or replace the insides of a `Pin<Box<T>>` - // when `T: !Unpin`, so it's safe to pin it directly without any + // when `T: !Unpin`, so it's safe to pin it directly without any // additional requirements. unsafe { Pin::new_unchecked(boxed) } } @@ -1190,7 +1247,8 @@ unsafe impl<#[may_dangle] T: ?Sized, A: Allocator> Drop for Box<T, A> { impl<T: Default> Default for Box<T> { /// Creates a `Box<T>`, with the `Default` value for T. fn default() -> Self { - box T::default() + #[rustc_box] + Box::new(T::default()) } } @@ -1408,9 +1466,17 @@ impl<T: ?Sized, A: Allocator> const From<Box<T, A>> for Pin<Box<T, A>> where A: 'static, { - /// Converts a `Box<T>` into a `Pin<Box<T>>` + /// Converts a `Box<T>` into a `Pin<Box<T>>`. If `T` does not implement [`Unpin`], then + /// `*boxed` will be pinned in memory and unable to be moved. /// /// This conversion does not allocate on the heap and happens in place. + /// + /// This is also available via [`Box::into_pin`]. + /// + /// Constructing and pinning a `Box` with <code><Pin<Box\<T>>>::from([Box::new]\(x))</code> + /// can also be written more concisely using <code>[Box::pin]\(x)</code>. + /// This `From` implementation is useful if you already have a `Box<T>`, or you are + /// constructing a (pinned) `Box` in a different way than with [`Box::new`]. fn from(boxed: Box<T, A>) -> Self { Box::into_pin(boxed) } @@ -1422,7 +1488,7 @@ impl<T: Copy> From<&[T]> for Box<[T]> { /// Converts a `&[T]` into a `Box<[T]>` /// /// This conversion allocates on the heap - /// and performs a copy of `slice`. + /// and performs a copy of `slice` and its contents. /// /// # Examples /// ```rust @@ -1554,10 +1620,27 @@ impl<T, const N: usize> From<[T; N]> for Box<[T]> { /// println!("{boxed:?}"); /// ``` fn from(array: [T; N]) -> Box<[T]> { - box array + #[rustc_box] + Box::new(array) } } +/// Casts a boxed slice to a boxed array. +/// +/// # Safety +/// +/// `boxed_slice.len()` must be exactly `N`. +unsafe fn boxed_slice_as_array_unchecked<T, A: Allocator, const N: usize>( + boxed_slice: Box<[T], A>, +) -> Box<[T; N], A> { + debug_assert_eq!(boxed_slice.len(), N); + + let (ptr, alloc) = Box::into_raw_with_allocator(boxed_slice); + // SAFETY: Pointer and allocator came from an existing box, + // and our safety condition requires that the length is exactly `N` + unsafe { Box::from_raw_in(ptr as *mut [T; N], alloc) } +} + #[stable(feature = "boxed_slice_try_from", since = "1.43.0")] impl<T, const N: usize> TryFrom<Box<[T]>> for Box<[T; N]> { type Error = Box<[T]>; @@ -1573,13 +1656,46 @@ impl<T, const N: usize> TryFrom<Box<[T]>> for Box<[T; N]> { /// `boxed_slice.len()` does not equal `N`. fn try_from(boxed_slice: Box<[T]>) -> Result<Self, Self::Error> { if boxed_slice.len() == N { - Ok(unsafe { Box::from_raw(Box::into_raw(boxed_slice) as *mut [T; N]) }) + Ok(unsafe { boxed_slice_as_array_unchecked(boxed_slice) }) } else { Err(boxed_slice) } } } +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "boxed_array_try_from_vec", since = "1.66.0")] +impl<T, const N: usize> TryFrom<Vec<T>> for Box<[T; N]> { + type Error = Vec<T>; + + /// Attempts to convert a `Vec<T>` into a `Box<[T; N]>`. + /// + /// Like [`Vec::into_boxed_slice`], this is in-place if `vec.capacity() == N`, + /// but will require a reallocation otherwise. + /// + /// # Errors + /// + /// Returns the original `Vec<T>` in the `Err` variant if + /// `boxed_slice.len()` does not equal `N`. + /// + /// # Examples + /// + /// This can be used with [`vec!`] to create an array on the heap: + /// + /// ``` + /// let state: Box<[f32; 100]> = vec![1.0; 100].try_into().unwrap(); + /// assert_eq!(state.len(), 100); + /// ``` + fn try_from(vec: Vec<T>) -> Result<Self, Self::Error> { + if vec.len() == N { + let boxed_slice = vec.into_boxed_slice(); + Ok(unsafe { boxed_slice_as_array_unchecked(boxed_slice) }) + } else { + Err(vec) + } + } +} + impl<A: Allocator> Box<dyn Any, A> { /// Attempt to downcast the box to a concrete type. /// @@ -1869,7 +1985,7 @@ impl<I: ExactSizeIterator + ?Sized, A: Allocator> ExactSizeIterator for Box<I, A impl<I: FusedIterator + ?Sized, A: Allocator> FusedIterator for Box<I, A> {} #[stable(feature = "boxed_closure_impls", since = "1.35.0")] -impl<Args, F: FnOnce<Args> + ?Sized, A: Allocator> FnOnce<Args> for Box<F, A> { +impl<Args: Tuple, F: FnOnce<Args> + ?Sized, A: Allocator> FnOnce<Args> for Box<F, A> { type Output = <F as FnOnce<Args>>::Output; extern "rust-call" fn call_once(self, args: Args) -> Self::Output { @@ -1878,20 +1994,20 @@ impl<Args, F: FnOnce<Args> + ?Sized, A: Allocator> FnOnce<Args> for Box<F, A> { } #[stable(feature = "boxed_closure_impls", since = "1.35.0")] -impl<Args, F: FnMut<Args> + ?Sized, A: Allocator> FnMut<Args> for Box<F, A> { +impl<Args: Tuple, F: FnMut<Args> + ?Sized, A: Allocator> FnMut<Args> for Box<F, A> { extern "rust-call" fn call_mut(&mut self, args: Args) -> Self::Output { <F as FnMut<Args>>::call_mut(self, args) } } #[stable(feature = "boxed_closure_impls", since = "1.35.0")] -impl<Args, F: Fn<Args> + ?Sized, A: Allocator> Fn<Args> for Box<F, A> { +impl<Args: Tuple, F: Fn<Args> + ?Sized, A: Allocator> Fn<Args> for Box<F, A> { extern "rust-call" fn call(&self, args: Args) -> Self::Output { <F as Fn<Args>>::call(self, args) } } -#[unstable(feature = "coerce_unsized", issue = "27732")] +#[unstable(feature = "coerce_unsized", issue = "18598")] impl<T: ?Sized + Unsize<U>, U: ?Sized, A: Allocator> CoerceUnsized<Box<U, A>> for Box<T, A> {} #[unstable(feature = "dispatch_from_dyn", issue = "none")] @@ -1973,8 +2089,7 @@ impl<T: ?Sized, A: Allocator> AsMut<T> for Box<T, A> { * could have a method to project a Pin<T> from it. */ #[stable(feature = "pin", since = "1.33.0")] -#[rustc_const_unstable(feature = "const_box", issue = "92521")] -impl<T: ?Sized, A: Allocator> const Unpin for Box<T, A> where A: 'static {} +impl<T: ?Sized, A: Allocator> Unpin for Box<T, A> where A: 'static {} #[unstable(feature = "generator_trait", issue = "43122")] impl<G: ?Sized + Generator<R> + Unpin, R, A: Allocator> Generator<R> for Box<G, A> @@ -2026,3 +2141,292 @@ impl<S: ?Sized + AsyncIterator + Unpin> AsyncIterator for Box<S> { (**self).size_hint() } } + +impl dyn Error { + #[inline] + #[stable(feature = "error_downcast", since = "1.3.0")] + #[rustc_allow_incoherent_impl] + /// Attempts to downcast the box to a concrete type. + pub fn downcast<T: Error + 'static>(self: Box<Self>) -> Result<Box<T>, Box<dyn Error>> { + if self.is::<T>() { + unsafe { + let raw: *mut dyn Error = Box::into_raw(self); + Ok(Box::from_raw(raw as *mut T)) + } + } else { + Err(self) + } + } +} + +impl dyn Error + Send { + #[inline] + #[stable(feature = "error_downcast", since = "1.3.0")] + #[rustc_allow_incoherent_impl] + /// Attempts to downcast the box to a concrete type. + pub fn downcast<T: Error + 'static>(self: Box<Self>) -> Result<Box<T>, Box<dyn Error + Send>> { + let err: Box<dyn Error> = self; + <dyn Error>::downcast(err).map_err(|s| unsafe { + // Reapply the `Send` marker. + mem::transmute::<Box<dyn Error>, Box<dyn Error + Send>>(s) + }) + } +} + +impl dyn Error + Send + Sync { + #[inline] + #[stable(feature = "error_downcast", since = "1.3.0")] + #[rustc_allow_incoherent_impl] + /// Attempts to downcast the box to a concrete type. + pub fn downcast<T: Error + 'static>(self: Box<Self>) -> Result<Box<T>, Box<Self>> { + let err: Box<dyn Error> = self; + <dyn Error>::downcast(err).map_err(|s| unsafe { + // Reapply the `Send + Sync` marker. + mem::transmute::<Box<dyn Error>, Box<dyn Error + Send + Sync>>(s) + }) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a, E: Error + 'a> From<E> for Box<dyn Error + 'a> { + /// Converts a type of [`Error`] into a box of dyn [`Error`]. + /// + /// # Examples + /// + /// ``` + /// use std::error::Error; + /// use std::fmt; + /// use std::mem; + /// + /// #[derive(Debug)] + /// struct AnError; + /// + /// impl fmt::Display for AnError { + /// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + /// write!(f, "An error") + /// } + /// } + /// + /// impl Error for AnError {} + /// + /// let an_error = AnError; + /// assert!(0 == mem::size_of_val(&an_error)); + /// let a_boxed_error = Box::<dyn Error>::from(an_error); + /// assert!(mem::size_of::<Box<dyn Error>>() == mem::size_of_val(&a_boxed_error)) + /// ``` + fn from(err: E) -> Box<dyn Error + 'a> { + Box::new(err) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a, E: Error + Send + Sync + 'a> From<E> for Box<dyn Error + Send + Sync + 'a> { + /// Converts a type of [`Error`] + [`Send`] + [`Sync`] into a box of + /// dyn [`Error`] + [`Send`] + [`Sync`]. + /// + /// # Examples + /// + /// ``` + /// use std::error::Error; + /// use std::fmt; + /// use std::mem; + /// + /// #[derive(Debug)] + /// struct AnError; + /// + /// impl fmt::Display for AnError { + /// fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + /// write!(f, "An error") + /// } + /// } + /// + /// impl Error for AnError {} + /// + /// unsafe impl Send for AnError {} + /// + /// unsafe impl Sync for AnError {} + /// + /// let an_error = AnError; + /// assert!(0 == mem::size_of_val(&an_error)); + /// let a_boxed_error = Box::<dyn Error + Send + Sync>::from(an_error); + /// assert!( + /// mem::size_of::<Box<dyn Error + Send + Sync>>() == mem::size_of_val(&a_boxed_error)) + /// ``` + fn from(err: E) -> Box<dyn Error + Send + Sync + 'a> { + Box::new(err) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl From<String> for Box<dyn Error + Send + Sync> { + /// Converts a [`String`] into a box of dyn [`Error`] + [`Send`] + [`Sync`]. + /// + /// # Examples + /// + /// ``` + /// use std::error::Error; + /// use std::mem; + /// + /// let a_string_error = "a string error".to_string(); + /// let a_boxed_error = Box::<dyn Error + Send + Sync>::from(a_string_error); + /// assert!( + /// mem::size_of::<Box<dyn Error + Send + Sync>>() == mem::size_of_val(&a_boxed_error)) + /// ``` + #[inline] + fn from(err: String) -> Box<dyn Error + Send + Sync> { + struct StringError(String); + + impl Error for StringError { + #[allow(deprecated)] + fn description(&self) -> &str { + &self.0 + } + } + + impl fmt::Display for StringError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } + } + + // Purposefully skip printing "StringError(..)" + impl fmt::Debug for StringError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.0, f) + } + } + + Box::new(StringError(err)) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "string_box_error", since = "1.6.0")] +impl From<String> for Box<dyn Error> { + /// Converts a [`String`] into a box of dyn [`Error`]. + /// + /// # Examples + /// + /// ``` + /// use std::error::Error; + /// use std::mem; + /// + /// let a_string_error = "a string error".to_string(); + /// let a_boxed_error = Box::<dyn Error>::from(a_string_error); + /// assert!(mem::size_of::<Box<dyn Error>>() == mem::size_of_val(&a_boxed_error)) + /// ``` + fn from(str_err: String) -> Box<dyn Error> { + let err1: Box<dyn Error + Send + Sync> = From::from(str_err); + let err2: Box<dyn Error> = err1; + err2 + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> From<&str> for Box<dyn Error + Send + Sync + 'a> { + /// Converts a [`str`] into a box of dyn [`Error`] + [`Send`] + [`Sync`]. + /// + /// [`str`]: prim@str + /// + /// # Examples + /// + /// ``` + /// use std::error::Error; + /// use std::mem; + /// + /// let a_str_error = "a str error"; + /// let a_boxed_error = Box::<dyn Error + Send + Sync>::from(a_str_error); + /// assert!( + /// mem::size_of::<Box<dyn Error + Send + Sync>>() == mem::size_of_val(&a_boxed_error)) + /// ``` + #[inline] + fn from(err: &str) -> Box<dyn Error + Send + Sync + 'a> { + From::from(String::from(err)) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "string_box_error", since = "1.6.0")] +impl From<&str> for Box<dyn Error> { + /// Converts a [`str`] into a box of dyn [`Error`]. + /// + /// [`str`]: prim@str + /// + /// # Examples + /// + /// ``` + /// use std::error::Error; + /// use std::mem; + /// + /// let a_str_error = "a str error"; + /// let a_boxed_error = Box::<dyn Error>::from(a_str_error); + /// assert!(mem::size_of::<Box<dyn Error>>() == mem::size_of_val(&a_boxed_error)) + /// ``` + fn from(err: &str) -> Box<dyn Error> { + From::from(String::from(err)) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_box_error", since = "1.22.0")] +impl<'a, 'b> From<Cow<'b, str>> for Box<dyn Error + Send + Sync + 'a> { + /// Converts a [`Cow`] into a box of dyn [`Error`] + [`Send`] + [`Sync`]. + /// + /// # Examples + /// + /// ``` + /// use std::error::Error; + /// use std::mem; + /// use std::borrow::Cow; + /// + /// let a_cow_str_error = Cow::from("a str error"); + /// let a_boxed_error = Box::<dyn Error + Send + Sync>::from(a_cow_str_error); + /// assert!( + /// mem::size_of::<Box<dyn Error + Send + Sync>>() == mem::size_of_val(&a_boxed_error)) + /// ``` + fn from(err: Cow<'b, str>) -> Box<dyn Error + Send + Sync + 'a> { + From::from(String::from(err)) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_box_error", since = "1.22.0")] +impl<'a> From<Cow<'a, str>> for Box<dyn Error> { + /// Converts a [`Cow`] into a box of dyn [`Error`]. + /// + /// # Examples + /// + /// ``` + /// use std::error::Error; + /// use std::mem; + /// use std::borrow::Cow; + /// + /// let a_cow_str_error = Cow::from("a str error"); + /// let a_boxed_error = Box::<dyn Error>::from(a_cow_str_error); + /// assert!(mem::size_of::<Box<dyn Error>>() == mem::size_of_val(&a_boxed_error)) + /// ``` + fn from(err: Cow<'a, str>) -> Box<dyn Error> { + From::from(String::from(err)) + } +} + +#[stable(feature = "box_error", since = "1.8.0")] +impl<T: core::error::Error> core::error::Error for Box<T> { + #[allow(deprecated, deprecated_in_future)] + fn description(&self) -> &str { + core::error::Error::description(&**self) + } + + #[allow(deprecated)] + fn cause(&self) -> Option<&dyn core::error::Error> { + core::error::Error::cause(&**self) + } + + fn source(&self) -> Option<&(dyn core::error::Error + 'static)> { + core::error::Error::source(&**self) + } +} diff --git a/rust/alloc/collections/mod.rs b/rust/alloc/collections/mod.rs index 1eec265b28f8..2506065d158a 100644 --- a/rust/alloc/collections/mod.rs +++ b/rust/alloc/collections/mod.rs @@ -141,7 +141,7 @@ impl Display for TryReserveError { " because the computed capacity exceeded the collection's maximum" } TryReserveErrorKind::AllocError { .. } => { - " because the memory allocator returned a error" + " because the memory allocator returned an error" } }; fmt.write_str(reason) @@ -154,3 +154,6 @@ trait SpecExtend<I: IntoIterator> { /// Extends `self` with the contents of the given iterator. fn spec_extend(&mut self, iter: I); } + +#[stable(feature = "try_reserve", since = "1.57.0")] +impl core::error::Error for TryReserveError {} diff --git a/rust/alloc/lib.rs b/rust/alloc/lib.rs index 3aebf83c9967..5f374378b0d4 100644 --- a/rust/alloc/lib.rs +++ b/rust/alloc/lib.rs @@ -5,7 +5,7 @@ //! This library provides smart pointers and collections for managing //! heap-allocated values. //! -//! This library, like libcore, normally doesn’t need to be used directly +//! This library, like core, normally doesn’t need to be used directly //! since its contents are re-exported in the [`std` crate](../std/index.html). //! Crates that use the `#![no_std]` attribute however will typically //! not depend on `std`, so they’d use this crate instead. @@ -58,10 +58,6 @@ //! [`Rc`]: rc //! [`RefCell`]: core::cell -// To run liballoc tests without x.py without ending up with two copies of liballoc, Miri needs to be -// able to "empty" this crate. See <https://github.com/rust-lang/miri-test-libstd/issues/4>. -// rustc itself never sets the feature, so this line has no affect there. -#![cfg(any(not(feature = "miri-test-libstd"), test, doctest))] #![allow(unused_attributes)] #![stable(feature = "alloc", since = "1.36.0")] #![doc( @@ -75,23 +71,30 @@ any(not(feature = "miri-test-libstd"), test, doctest), no_global_oom_handling, not(no_global_oom_handling), + not(no_rc), + not(no_sync), target_has_atomic = "ptr" ))] #![no_std] #![needs_allocator] +// To run alloc tests without x.py without ending up with two copies of alloc, Miri needs to be +// able to "empty" this crate. See <https://github.com/rust-lang/miri-test-libstd/issues/4>. +// rustc itself never sets the feature, so this line has no affect there. +#![cfg(any(not(feature = "miri-test-libstd"), test, doctest))] // // Lints: #![deny(unsafe_op_in_unsafe_fn)] +#![deny(fuzzy_provenance_casts)] #![warn(deprecated_in_future)] #![warn(missing_debug_implementations)] #![warn(missing_docs)] #![allow(explicit_outlives_requirements)] // // Library features: -#![cfg_attr(not(no_global_oom_handling), feature(alloc_c_string))] #![feature(alloc_layout_extra)] #![feature(allocator_api)] #![feature(array_chunks)] +#![feature(array_into_iter_constructors)] #![feature(array_methods)] #![feature(array_windows)] #![feature(assert_matches)] @@ -99,39 +102,53 @@ #![feature(coerce_unsized)] #![cfg_attr(not(no_global_oom_handling), feature(const_alloc_error))] #![feature(const_box)] -#![cfg_attr(not(no_global_oom_handling), feature(const_btree_new))] +#![cfg_attr(not(no_global_oom_handling), feature(const_btree_len))] #![cfg_attr(not(no_borrow), feature(const_cow_is_borrowed))] #![feature(const_convert)] #![feature(const_size_of_val)] #![feature(const_align_of_val)] #![feature(const_ptr_read)] +#![feature(const_maybe_uninit_zeroed)] #![feature(const_maybe_uninit_write)] #![feature(const_maybe_uninit_as_mut_ptr)] #![feature(const_refs_to_cell)] -#![feature(core_c_str)] #![feature(core_intrinsics)] -#![feature(core_ffi_c)] +#![feature(core_panic)] #![feature(const_eval_select)] #![feature(const_pin)] +#![feature(const_waker)] #![feature(cstr_from_bytes_until_nul)] #![feature(dispatch_from_dyn)] +#![feature(error_generic_member_access)] +#![feature(error_in_core)] #![feature(exact_size_is_empty)] #![feature(extend_one)] #![feature(fmt_internals)] #![feature(fn_traits)] #![feature(hasher_prefixfree_extras)] +#![feature(inline_const)] #![feature(inplace_iteration)] +#![cfg_attr(test, feature(is_sorted))] #![feature(iter_advance_by)] +#![feature(iter_next_chunk)] +#![feature(iter_repeat_n)] #![feature(layout_for_ptr)] #![feature(maybe_uninit_slice)] +#![feature(maybe_uninit_uninit_array)] +#![feature(maybe_uninit_uninit_array_transpose)] #![cfg_attr(test, feature(new_uninit))] #![feature(nonnull_slice_from_raw_parts)] #![feature(pattern)] +#![feature(pointer_byte_offsets)] +#![feature(provide_any)] #![feature(ptr_internals)] #![feature(ptr_metadata)] #![feature(ptr_sub_ptr)] #![feature(receiver_trait)] +#![feature(saturating_int_impl)] #![feature(set_ptr_value)] +#![feature(sized_type_properties)] +#![feature(slice_from_ptr_range)] #![feature(slice_group_by)] #![feature(slice_ptr_get)] #![feature(slice_ptr_len)] @@ -141,15 +158,17 @@ #![feature(trusted_len)] #![feature(trusted_random_access)] #![feature(try_trait_v2)] +#![feature(tuple_trait)] #![feature(unchecked_math)] #![feature(unicode_internals)] #![feature(unsize)] +#![feature(utf8_chunks)] +#![feature(std_internals)] // // Language features: #![feature(allocator_internals)] #![feature(allow_internal_unstable)] #![feature(associated_type_bounds)] -#![feature(box_syntax)] #![feature(cfg_sanitize)] #![feature(const_deref)] #![feature(const_mut_refs)] @@ -163,19 +182,21 @@ #![cfg_attr(not(test), feature(generator_trait))] #![feature(hashmap_internals)] #![feature(lang_items)] -#![feature(let_else)] #![feature(min_specialization)] #![feature(negative_impls)] #![feature(never_type)] -#![feature(nll)] // Not necessary, but here to test the `nll` feature. #![feature(rustc_allow_const_fn_unstable)] #![feature(rustc_attrs)] +#![feature(pointer_is_aligned)] #![feature(slice_internals)] #![feature(staged_api)] +#![feature(stmt_expr_attributes)] #![cfg_attr(test, feature(test))] #![feature(unboxed_closures)] #![feature(unsized_fn_params)] #![feature(c_unwind)] +#![feature(with_negative_coherence)] +#![cfg_attr(test, feature(panic_update_hook))] // // Rustdoc features: #![feature(doc_cfg)] @@ -192,6 +213,8 @@ extern crate std; #[cfg(test)] extern crate test; +#[cfg(test)] +mod testing; // Module with internal macros used by other modules (needs to be included before other modules). #[cfg(not(no_macros))] @@ -218,7 +241,7 @@ mod boxed { #[cfg(not(no_borrow))] pub mod borrow; pub mod collections; -#[cfg(not(no_global_oom_handling))] +#[cfg(all(not(no_rc), not(no_sync), not(no_global_oom_handling)))] pub mod ffi; #[cfg(not(no_fmt))] pub mod fmt; @@ -229,10 +252,9 @@ pub mod slice; pub mod str; #[cfg(not(no_string))] pub mod string; -#[cfg(not(no_sync))] -#[cfg(target_has_atomic = "ptr")] +#[cfg(all(not(no_rc), not(no_sync), target_has_atomic = "ptr"))] pub mod sync; -#[cfg(all(not(no_global_oom_handling), target_has_atomic = "ptr"))] +#[cfg(all(not(no_global_oom_handling), not(no_rc), not(no_sync), target_has_atomic = "ptr"))] pub mod task; #[cfg(test)] mod tests; @@ -243,3 +265,20 @@ pub mod vec; pub mod __export { pub use core::format_args; } + +#[cfg(test)] +#[allow(dead_code)] // Not used in all configurations +pub(crate) mod test_helpers { + /// Copied from `std::test_helpers::test_rng`, since these tests rely on the + /// seed not being the same for every RNG invocation too. + pub(crate) fn test_rng() -> rand_xorshift::XorShiftRng { + use std::hash::{BuildHasher, Hash, Hasher}; + let mut hasher = std::collections::hash_map::RandomState::new().build_hasher(); + std::panic::Location::caller().hash(&mut hasher); + let hc64 = hasher.finish(); + let seed_vec = + hc64.to_le_bytes().into_iter().chain(0u8..8).collect::<crate::vec::Vec<u8>>(); + let seed: [u8; 16] = seed_vec.as_slice().try_into().unwrap(); + rand::SeedableRng::from_seed(seed) + } +} diff --git a/rust/alloc/raw_vec.rs b/rust/alloc/raw_vec.rs index eb77db5def55..5db87eac53b7 100644 --- a/rust/alloc/raw_vec.rs +++ b/rust/alloc/raw_vec.rs @@ -5,7 +5,7 @@ use core::alloc::LayoutError; use core::cmp; use core::intrinsics; -use core::mem::{self, ManuallyDrop, MaybeUninit}; +use core::mem::{self, ManuallyDrop, MaybeUninit, SizedTypeProperties}; use core::ops::Drop; use core::ptr::{self, NonNull, Unique}; use core::slice; @@ -177,7 +177,7 @@ impl<T, A: Allocator> RawVec<T, A> { #[cfg(not(no_global_oom_handling))] fn allocate_in(capacity: usize, init: AllocInit, alloc: A) -> Self { // Don't allocate here because `Drop` will not deallocate when `capacity` is 0. - if mem::size_of::<T>() == 0 || capacity == 0 { + if T::IS_ZST || capacity == 0 { Self::new_in(alloc) } else { // We avoid `unwrap_or_else` here because it bloats the amount of @@ -212,7 +212,7 @@ impl<T, A: Allocator> RawVec<T, A> { fn try_allocate_in(capacity: usize, init: AllocInit, alloc: A) -> Result<Self, TryReserveError> { // Don't allocate here because `Drop` will not deallocate when `capacity` is 0. - if mem::size_of::<T>() == 0 || capacity == 0 { + if T::IS_ZST || capacity == 0 { return Ok(Self::new_in(alloc)); } @@ -262,7 +262,7 @@ impl<T, A: Allocator> RawVec<T, A> { /// This will always be `usize::MAX` if `T` is zero-sized. #[inline(always)] pub fn capacity(&self) -> usize { - if mem::size_of::<T>() == 0 { usize::MAX } else { self.cap } + if T::IS_ZST { usize::MAX } else { self.cap } } /// Returns a shared reference to the allocator backing this `RawVec`. @@ -271,7 +271,7 @@ impl<T, A: Allocator> RawVec<T, A> { } fn current_memory(&self) -> Option<(NonNull<u8>, Layout)> { - if mem::size_of::<T>() == 0 || self.cap == 0 { + if T::IS_ZST || self.cap == 0 { None } else { // We have an allocated chunk of memory, so we can bypass runtime @@ -419,7 +419,7 @@ impl<T, A: Allocator> RawVec<T, A> { // This is ensured by the calling contexts. debug_assert!(additional > 0); - if mem::size_of::<T>() == 0 { + if T::IS_ZST { // Since we return a capacity of `usize::MAX` when `elem_size` is // 0, getting to here necessarily means the `RawVec` is overfull. return Err(CapacityOverflow.into()); @@ -445,7 +445,7 @@ impl<T, A: Allocator> RawVec<T, A> { // `grow_amortized`, but this method is usually instantiated less often so // it's less critical. fn grow_exact(&mut self, len: usize, additional: usize) -> Result<(), TryReserveError> { - if mem::size_of::<T>() == 0 { + if T::IS_ZST { // Since we return a capacity of `usize::MAX` when the type size is // 0, getting to here necessarily means the `RawVec` is overfull. return Err(CapacityOverflow.into()); @@ -460,7 +460,7 @@ impl<T, A: Allocator> RawVec<T, A> { Ok(()) } - #[allow(dead_code)] + #[cfg(not(no_global_oom_handling))] fn shrink(&mut self, cap: usize) -> Result<(), TryReserveError> { assert!(cap <= self.capacity(), "Tried to shrink to a larger capacity"); diff --git a/rust/alloc/slice.rs b/rust/alloc/slice.rs index e444e97fa145..245e01590df7 100644 --- a/rust/alloc/slice.rs +++ b/rust/alloc/slice.rs @@ -1,84 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT -//! A dynamically-sized view into a contiguous sequence, `[T]`. +//! Utilities for the slice primitive type. //! //! *[See also the slice primitive type](slice).* //! -//! Slices are a view into a block of memory represented as a pointer and a -//! length. +//! Most of the structs in this module are iterator types which can only be created +//! using a certain function. For example, `slice.iter()` yields an [`Iter`]. //! -//! ``` -//! // slicing a Vec -//! let vec = vec![1, 2, 3]; -//! let int_slice = &vec[..]; -//! // coercing an array to a slice -//! let str_slice: &[&str] = &["one", "two", "three"]; -//! ``` -//! -//! Slices are either mutable or shared. The shared slice type is `&[T]`, -//! while the mutable slice type is `&mut [T]`, where `T` represents the element -//! type. For example, you can mutate the block of memory that a mutable slice -//! points to: -//! -//! ``` -//! let x = &mut [1, 2, 3]; -//! x[1] = 7; -//! assert_eq!(x, &[1, 7, 3]); -//! ``` -//! -//! Here are some of the things this module contains: -//! -//! ## Structs -//! -//! There are several structs that are useful for slices, such as [`Iter`], which -//! represents iteration over a slice. -//! -//! ## Trait Implementations -//! -//! There are several implementations of common traits for slices. Some examples -//! include: -//! -//! * [`Clone`] -//! * [`Eq`], [`Ord`] - for slices whose element type are [`Eq`] or [`Ord`]. -//! * [`Hash`] - for slices whose element type is [`Hash`]. -//! -//! ## Iteration -//! -//! The slices implement `IntoIterator`. The iterator yields references to the -//! slice elements. -//! -//! ``` -//! let numbers = &[0, 1, 2]; -//! for n in numbers { -//! println!("{n} is a number!"); -//! } -//! ``` -//! -//! The mutable slice yields mutable references to the elements: -//! -//! ``` -//! let mut scores = [7, 8, 9]; -//! for score in &mut scores[..] { -//! *score += 1; -//! } -//! ``` -//! -//! This iterator yields mutable references to the slice's elements, so while -//! the element type of the slice is `i32`, the element type of the iterator is -//! `&mut i32`. -//! -//! * [`.iter`] and [`.iter_mut`] are the explicit methods to return the default -//! iterators. -//! * Further methods that return iterators are [`.split`], [`.splitn`], -//! [`.chunks`], [`.windows`] and more. -//! -//! [`Hash`]: core::hash::Hash -//! [`.iter`]: slice::iter -//! [`.iter_mut`]: slice::iter_mut -//! [`.split`]: slice::split -//! [`.splitn`]: slice::splitn -//! [`.chunks`]: slice::chunks -//! [`.windows`]: slice::windows +//! A few functions are provided to create a slice from a value reference +//! or from a raw pointer. #![stable(feature = "rust1", since = "1.0.0")] // Many of the usings in this module are only used in the test configuration. // It's cleaner to just turn off the unused_imports warning than to fix them. @@ -88,20 +18,23 @@ use core::borrow::{Borrow, BorrowMut}; #[cfg(not(no_global_oom_handling))] use core::cmp::Ordering::{self, Less}; #[cfg(not(no_global_oom_handling))] -use core::mem; -#[cfg(not(no_global_oom_handling))] -use core::mem::size_of; +use core::mem::{self, SizedTypeProperties}; #[cfg(not(no_global_oom_handling))] use core::ptr; +#[cfg(not(no_global_oom_handling))] +use core::slice::sort; use crate::alloc::Allocator; #[cfg(not(no_global_oom_handling))] -use crate::alloc::Global; +use crate::alloc::{self, Global}; #[cfg(not(no_global_oom_handling))] use crate::borrow::ToOwned; use crate::boxed::Box; use crate::vec::Vec; +#[cfg(test)] +mod tests; + #[unstable(feature = "slice_range", issue = "76393")] pub use core::slice::range; #[unstable(feature = "array_chunks", issue = "74985")] @@ -116,6 +49,8 @@ pub use core::slice::EscapeAscii; pub use core::slice::SliceIndex; #[stable(feature = "from_ref", since = "1.28.0")] pub use core::slice::{from_mut, from_ref}; +#[unstable(feature = "slice_from_ptr_range", issue = "89792")] +pub use core::slice::{from_mut_ptr_range, from_ptr_range}; #[stable(feature = "rust1", since = "1.0.0")] pub use core::slice::{from_raw_parts, from_raw_parts_mut}; #[stable(feature = "rust1", since = "1.0.0")] @@ -275,7 +210,7 @@ impl<T> [T] { where T: Ord, { - merge_sort(self, |a, b| a.lt(b)); + stable_sort(self, T::lt); } /// Sorts the slice with a comparator function. @@ -331,7 +266,7 @@ impl<T> [T] { where F: FnMut(&T, &T) -> Ordering, { - merge_sort(self, |a, b| compare(a, b) == Less); + stable_sort(self, |a, b| compare(a, b) == Less); } /// Sorts the slice with a key extraction function. @@ -374,7 +309,7 @@ impl<T> [T] { F: FnMut(&T) -> K, K: Ord, { - merge_sort(self, |a, b| f(a).lt(&f(b))); + stable_sort(self, |a, b| f(a).lt(&f(b))); } /// Sorts the slice with a key extraction function. @@ -530,7 +465,7 @@ impl<T> [T] { hack::into_vec(self) } - /// Creates a vector by repeating a slice `n` times. + /// Creates a vector by copying a slice `n` times. /// /// # Panics /// @@ -725,7 +660,7 @@ impl [u8] { /// /// ```error /// error[E0207]: the type parameter `T` is not constrained by the impl trait, self type, or predica -/// --> src/liballoc/slice.rs:608:6 +/// --> library/alloc/src/slice.rs:608:6 /// | /// 608 | impl<T: Clone, V: Borrow<[T]>> Concat for [V] { /// | ^ unconstrained type parameter @@ -836,14 +771,14 @@ impl<T: Clone, V: Borrow<[T]>> Join<&[T]> for [V] { //////////////////////////////////////////////////////////////////////////////// #[stable(feature = "rust1", since = "1.0.0")] -impl<T> Borrow<[T]> for Vec<T> { +impl<T, A: Allocator> Borrow<[T]> for Vec<T, A> { fn borrow(&self) -> &[T] { &self[..] } } #[stable(feature = "rust1", since = "1.0.0")] -impl<T> BorrowMut<[T]> for Vec<T> { +impl<T, A: Allocator> BorrowMut<[T]> for Vec<T, A> { fn borrow_mut(&mut self) -> &mut [T] { &mut self[..] } @@ -881,324 +816,52 @@ impl<T: Clone> ToOwned for [T] { // Sorting //////////////////////////////////////////////////////////////////////////////// -/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted. -/// -/// This is the integral subroutine of insertion sort. +#[inline] #[cfg(not(no_global_oom_handling))] -fn insert_head<T, F>(v: &mut [T], is_less: &mut F) +fn stable_sort<T, F>(v: &mut [T], mut is_less: F) where F: FnMut(&T, &T) -> bool, { - if v.len() >= 2 && is_less(&v[1], &v[0]) { - unsafe { - // There are three ways to implement insertion here: - // - // 1. Swap adjacent elements until the first one gets to its final destination. - // However, this way we copy data around more than is necessary. If elements are big - // structures (costly to copy), this method will be slow. - // - // 2. Iterate until the right place for the first element is found. Then shift the - // elements succeeding it to make room for it and finally place it into the - // remaining hole. This is a good method. - // - // 3. Copy the first element into a temporary variable. Iterate until the right place - // for it is found. As we go along, copy every traversed element into the slot - // preceding it. Finally, copy data from the temporary variable into the remaining - // hole. This method is very good. Benchmarks demonstrated slightly better - // performance than with the 2nd method. - // - // All methods were benchmarked, and the 3rd showed best results. So we chose that one. - let tmp = mem::ManuallyDrop::new(ptr::read(&v[0])); - - // Intermediate state of the insertion process is always tracked by `hole`, which - // serves two purposes: - // 1. Protects integrity of `v` from panics in `is_less`. - // 2. Fills the remaining hole in `v` in the end. - // - // Panic safety: - // - // If `is_less` panics at any point during the process, `hole` will get dropped and - // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it - // initially held exactly once. - let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] }; - ptr::copy_nonoverlapping(&v[1], &mut v[0], 1); - - for i in 2..v.len() { - if !is_less(&v[i], &*tmp) { - break; - } - ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1); - hole.dest = &mut v[i]; - } - // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`. - } - } - - // When dropped, copies from `src` into `dest`. - struct InsertionHole<T> { - src: *const T, - dest: *mut T, - } - - impl<T> Drop for InsertionHole<T> { - fn drop(&mut self) { - unsafe { - ptr::copy_nonoverlapping(self.src, self.dest, 1); - } - } + if T::IS_ZST { + // Sorting has no meaningful behavior on zero-sized types. Do nothing. + return; } -} - -/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and -/// stores the result into `v[..]`. -/// -/// # Safety -/// -/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough -/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type. -#[cfg(not(no_global_oom_handling))] -unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F) -where - F: FnMut(&T, &T) -> bool, -{ - let len = v.len(); - let v = v.as_mut_ptr(); - let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) }; - // The merge process first copies the shorter run into `buf`. Then it traces the newly copied - // run and the longer run forwards (or backwards), comparing their next unconsumed elements and - // copying the lesser (or greater) one into `v`. - // - // As soon as the shorter run is fully consumed, the process is done. If the longer run gets - // consumed first, then we must copy whatever is left of the shorter run into the remaining - // hole in `v`. - // - // Intermediate state of the process is always tracked by `hole`, which serves two purposes: - // 1. Protects integrity of `v` from panics in `is_less`. - // 2. Fills the remaining hole in `v` if the longer run gets consumed first. - // - // Panic safety: - // - // If `is_less` panics at any point during the process, `hole` will get dropped and fill the - // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every - // object it initially held exactly once. - let mut hole; + let elem_alloc_fn = |len: usize| -> *mut T { + // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len > + // v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap + // elements. + unsafe { alloc::alloc(alloc::Layout::array::<T>(len).unwrap_unchecked()) as *mut T } + }; - if mid <= len - mid { - // The left run is shorter. + let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| { + // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len > + // v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same + // len. unsafe { - ptr::copy_nonoverlapping(v, buf, mid); - hole = MergeHole { start: buf, end: buf.add(mid), dest: v }; + alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::<T>(len).unwrap_unchecked()); } + }; - // Initially, these pointers point to the beginnings of their arrays. - let left = &mut hole.start; - let mut right = v_mid; - let out = &mut hole.dest; - - while *left < hole.end && right < v_end { - // Consume the lesser side. - // If equal, prefer the left run to maintain stability. - unsafe { - let to_copy = if is_less(&*right, &**left) { - get_and_increment(&mut right) - } else { - get_and_increment(left) - }; - ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1); - } - } - } else { - // The right run is shorter. + let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun { + // SAFETY: Creating the layout is safe as long as merge_sort never calls this with an + // obscene length or 0. unsafe { - ptr::copy_nonoverlapping(v_mid, buf, len - mid); - hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid }; + alloc::alloc(alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked()) + as *mut sort::TimSortRun } + }; - // Initially, these pointers point past the ends of their arrays. - let left = &mut hole.dest; - let right = &mut hole.end; - let mut out = v_end; - - while v < *left && buf < *right { - // Consume the greater side. - // If equal, prefer the right run to maintain stability. - unsafe { - let to_copy = if is_less(&*right.offset(-1), &*left.offset(-1)) { - decrement_and_get(left) - } else { - decrement_and_get(right) - }; - ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1); - } - } - } - // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of - // it will now be copied into the hole in `v`. - - unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T { - let old = *ptr; - *ptr = unsafe { ptr.offset(1) }; - old - } - - unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T { - *ptr = unsafe { ptr.offset(-1) }; - *ptr - } - - // When dropped, copies the range `start..end` into `dest..`. - struct MergeHole<T> { - start: *mut T, - end: *mut T, - dest: *mut T, - } - - impl<T> Drop for MergeHole<T> { - fn drop(&mut self) { - // `T` is not a zero-sized type, and these are pointers into a slice's elements. - unsafe { - let len = self.end.sub_ptr(self.start); - ptr::copy_nonoverlapping(self.start, self.dest, len); - } - } - } -} - -/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail -/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt). -/// -/// The algorithm identifies strictly descending and non-descending subsequences, which are called -/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed -/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are -/// satisfied: -/// -/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len` -/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len` -/// -/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case. -#[cfg(not(no_global_oom_handling))] -fn merge_sort<T, F>(v: &mut [T], mut is_less: F) -where - F: FnMut(&T, &T) -> bool, -{ - // Slices of up to this length get sorted using insertion sort. - const MAX_INSERTION: usize = 20; - // Very short runs are extended using insertion sort to span at least this many elements. - const MIN_RUN: usize = 10; - - // Sorting has no meaningful behavior on zero-sized types. - if size_of::<T>() == 0 { - return; - } - - let len = v.len(); - - // Short arrays get sorted in-place via insertion sort to avoid allocations. - if len <= MAX_INSERTION { - if len >= 2 { - for i in (0..len - 1).rev() { - insert_head(&mut v[i..], &mut is_less); - } - } - return; - } - - // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it - // shallow copies of the contents of `v` without risking the dtors running on copies if - // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run, - // which will always have length at most `len / 2`. - let mut buf = Vec::with_capacity(len / 2); - - // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a - // strange decision, but consider the fact that merges more often go in the opposite direction - // (forwards). According to benchmarks, merging forwards is slightly faster than merging - // backwards. To conclude, identifying runs by traversing backwards improves performance. - let mut runs = vec![]; - let mut end = len; - while end > 0 { - // Find the next natural run, and reverse it if it's strictly descending. - let mut start = end - 1; - if start > 0 { - start -= 1; - unsafe { - if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) { - while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) { - start -= 1; - } - v[start..end].reverse(); - } else { - while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) - { - start -= 1; - } - } - } - } - - // Insert some more elements into the run if it's too short. Insertion sort is faster than - // merge sort on short sequences, so this significantly improves performance. - while start > 0 && end - start < MIN_RUN { - start -= 1; - insert_head(&mut v[start..end], &mut is_less); - } - - // Push this run onto the stack. - runs.push(Run { start, len: end - start }); - end = start; - - // Merge some pairs of adjacent runs to satisfy the invariants. - while let Some(r) = collapse(&runs) { - let left = runs[r + 1]; - let right = runs[r]; - unsafe { - merge( - &mut v[left.start..right.start + right.len], - left.len, - buf.as_mut_ptr(), - &mut is_less, - ); - } - runs[r] = Run { start: left.start, len: left.len + right.len }; - runs.remove(r + 1); - } - } - - // Finally, exactly one run must remain in the stack. - debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len); - - // Examines the stack of runs and identifies the next pair of runs to merge. More specifically, - // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the - // algorithm should continue building a new run instead, `None` is returned. - // - // TimSort is infamous for its buggy implementations, as described here: - // http://envisage-project.eu/timsort-specification-and-verification/ - // - // The gist of the story is: we must enforce the invariants on the top four runs on the stack. - // Enforcing them on just top three is not sufficient to ensure that the invariants will still - // hold for *all* runs in the stack. - // - // This function correctly checks invariants for the top four runs. Additionally, if the top - // run starts at index 0, it will always demand a merge operation until the stack is fully - // collapsed, in order to complete the sort. - #[inline] - fn collapse(runs: &[Run]) -> Option<usize> { - let n = runs.len(); - if n >= 2 - && (runs[n - 1].start == 0 - || runs[n - 2].len <= runs[n - 1].len - || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len) - || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len)) - { - if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) } - } else { - None + let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| { + // SAFETY: The caller must ensure that buf_ptr was created by elem_alloc_fn with the same + // len. + unsafe { + alloc::dealloc( + buf_ptr as *mut u8, + alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked(), + ); } - } + }; - #[derive(Clone, Copy)] - struct Run { - start: usize, - len: usize, - } + sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn); } diff --git a/rust/alloc/vec/drain.rs b/rust/alloc/vec/drain.rs index b6a5f98e4fcd..d503d2f478ce 100644 --- a/rust/alloc/vec/drain.rs +++ b/rust/alloc/vec/drain.rs @@ -3,7 +3,7 @@ use crate::alloc::{Allocator, Global}; use core::fmt; use core::iter::{FusedIterator, TrustedLen}; -use core::mem; +use core::mem::{self, ManuallyDrop, SizedTypeProperties}; use core::ptr::{self, NonNull}; use core::slice::{self}; @@ -67,6 +67,77 @@ impl<'a, T, A: Allocator> Drain<'a, T, A> { pub fn allocator(&self) -> &A { unsafe { self.vec.as_ref().allocator() } } + + /// Keep unyielded elements in the source `Vec`. + /// + /// # Examples + /// + /// ``` + /// #![feature(drain_keep_rest)] + /// + /// let mut vec = vec!['a', 'b', 'c']; + /// let mut drain = vec.drain(..); + /// + /// assert_eq!(drain.next().unwrap(), 'a'); + /// + /// // This call keeps 'b' and 'c' in the vec. + /// drain.keep_rest(); + /// + /// // If we wouldn't call `keep_rest()`, + /// // `vec` would be empty. + /// assert_eq!(vec, ['b', 'c']); + /// ``` + #[unstable(feature = "drain_keep_rest", issue = "101122")] + pub fn keep_rest(self) { + // At this moment layout looks like this: + // + // [head] [yielded by next] [unyielded] [yielded by next_back] [tail] + // ^-- start \_________/-- unyielded_len \____/-- self.tail_len + // ^-- unyielded_ptr ^-- tail + // + // Normally `Drop` impl would drop [unyielded] and then move [tail] to the `start`. + // Here we want to + // 1. Move [unyielded] to `start` + // 2. Move [tail] to a new start at `start + len(unyielded)` + // 3. Update length of the original vec to `len(head) + len(unyielded) + len(tail)` + // a. In case of ZST, this is the only thing we want to do + // 4. Do *not* drop self, as everything is put in a consistent state already, there is nothing to do + let mut this = ManuallyDrop::new(self); + + unsafe { + let source_vec = this.vec.as_mut(); + + let start = source_vec.len(); + let tail = this.tail_start; + + let unyielded_len = this.iter.len(); + let unyielded_ptr = this.iter.as_slice().as_ptr(); + + // ZSTs have no identity, so we don't need to move them around. + let needs_move = mem::size_of::<T>() != 0; + + if needs_move { + let start_ptr = source_vec.as_mut_ptr().add(start); + + // memmove back unyielded elements + if unyielded_ptr != start_ptr { + let src = unyielded_ptr; + let dst = start_ptr; + + ptr::copy(src, dst, unyielded_len); + } + + // memmove back untouched tail + if tail != (start + unyielded_len) { + let src = source_vec.as_ptr().add(tail); + let dst = start_ptr.add(unyielded_len); + ptr::copy(src, dst, this.tail_len); + } + } + + source_vec.set_len(start + unyielded_len + this.tail_len); + } + } } #[stable(feature = "vec_drain_as_slice", since = "1.46.0")] @@ -133,7 +204,7 @@ impl<T, A: Allocator> Drop for Drain<'_, T, A> { let mut vec = self.vec; - if mem::size_of::<T>() == 0 { + if T::IS_ZST { // ZSTs have no identity, so we don't need to move them around, we only need to drop the correct amount. // this can be achieved by manipulating the Vec length instead of moving values out from `iter`. unsafe { @@ -154,9 +225,9 @@ impl<T, A: Allocator> Drop for Drain<'_, T, A> { } // as_slice() must only be called when iter.len() is > 0 because - // vec::Splice modifies vec::Drain fields and may grow the vec which would invalidate - // the iterator's internal pointers. Creating a reference to deallocated memory - // is invalid even when it is zero-length + // it also gets touched by vec::Splice which may turn it into a dangling pointer + // which would make it and the vec pointer point to different allocations which would + // lead to invalid pointer arithmetic below. let drop_ptr = iter.as_slice().as_ptr(); unsafe { diff --git a/rust/alloc/vec/drain_filter.rs b/rust/alloc/vec/drain_filter.rs index b04fce041622..4b019220657d 100644 --- a/rust/alloc/vec/drain_filter.rs +++ b/rust/alloc/vec/drain_filter.rs @@ -1,8 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT use crate::alloc::{Allocator, Global}; -use core::ptr::{self}; -use core::slice::{self}; +use core::mem::{self, ManuallyDrop}; +use core::ptr; +use core::slice; use super::Vec; @@ -56,6 +57,61 @@ where pub fn allocator(&self) -> &A { self.vec.allocator() } + + /// Keep unyielded elements in the source `Vec`. + /// + /// # Examples + /// + /// ``` + /// #![feature(drain_filter)] + /// #![feature(drain_keep_rest)] + /// + /// let mut vec = vec!['a', 'b', 'c']; + /// let mut drain = vec.drain_filter(|_| true); + /// + /// assert_eq!(drain.next().unwrap(), 'a'); + /// + /// // This call keeps 'b' and 'c' in the vec. + /// drain.keep_rest(); + /// + /// // If we wouldn't call `keep_rest()`, + /// // `vec` would be empty. + /// assert_eq!(vec, ['b', 'c']); + /// ``` + #[unstable(feature = "drain_keep_rest", issue = "101122")] + pub fn keep_rest(self) { + // At this moment layout looks like this: + // + // _____________________/-- old_len + // / \ + // [kept] [yielded] [tail] + // \_______/ ^-- idx + // \-- del + // + // Normally `Drop` impl would drop [tail] (via .for_each(drop), ie still calling `pred`) + // + // 1. Move [tail] after [kept] + // 2. Update length of the original vec to `old_len - del` + // a. In case of ZST, this is the only thing we want to do + // 3. Do *not* drop self, as everything is put in a consistent state already, there is nothing to do + let mut this = ManuallyDrop::new(self); + + unsafe { + // ZSTs have no identity, so we don't need to move them around. + let needs_move = mem::size_of::<T>() != 0; + + if needs_move && this.idx < this.old_len && this.del > 0 { + let ptr = this.vec.as_mut_ptr(); + let src = ptr.add(this.idx); + let dst = src.sub(this.del); + let tail_len = this.old_len - this.idx; + src.copy_to(dst, tail_len); + } + + let new_len = this.old_len - this.del; + this.vec.set_len(new_len); + } + } } #[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")] diff --git a/rust/alloc/vec/into_iter.rs b/rust/alloc/vec/into_iter.rs index f7a50e76691e..34a2a70d6ded 100644 --- a/rust/alloc/vec/into_iter.rs +++ b/rust/alloc/vec/into_iter.rs @@ -3,14 +3,16 @@ #[cfg(not(no_global_oom_handling))] use super::AsVecIntoIter; use crate::alloc::{Allocator, Global}; +#[cfg(not(no_global_oom_handling))] +use crate::collections::VecDeque; use crate::raw_vec::RawVec; +use core::array; use core::fmt; -use core::intrinsics::arith_offset; use core::iter::{ FusedIterator, InPlaceIterable, SourceIter, TrustedLen, TrustedRandomAccessNoCoerce, }; use core::marker::PhantomData; -use core::mem::{self, ManuallyDrop}; +use core::mem::{self, ManuallyDrop, MaybeUninit, SizedTypeProperties}; #[cfg(not(no_global_oom_handling))] use core::ops::Deref; use core::ptr::{self, NonNull}; @@ -40,7 +42,9 @@ pub struct IntoIter< // to avoid dropping the allocator twice we need to wrap it into ManuallyDrop pub(super) alloc: ManuallyDrop<A>, pub(super) ptr: *const T, - pub(super) end: *const T, + pub(super) end: *const T, // If T is a ZST, this is actually ptr+len. This encoding is picked so that + // ptr == end is a quick test for the Iterator being empty, that works + // for both ZST and non-ZST. } #[stable(feature = "vec_intoiter_debug", since = "1.13.0")] @@ -97,13 +101,16 @@ impl<T, A: Allocator> IntoIter<T, A> { } /// Drops remaining elements and relinquishes the backing allocation. + /// This method guarantees it won't panic before relinquishing + /// the backing allocation. /// /// This is roughly equivalent to the following, but more efficient /// /// ``` /// # let mut into_iter = Vec::<u8>::with_capacity(10).into_iter(); + /// let mut into_iter = std::mem::replace(&mut into_iter, Vec::new().into_iter()); /// (&mut into_iter).for_each(core::mem::drop); - /// unsafe { core::ptr::write(&mut into_iter, Vec::new().into_iter()); } + /// std::mem::forget(into_iter); /// ``` /// /// This method is used by in-place iteration, refer to the vec::in_place_collect @@ -120,15 +127,45 @@ impl<T, A: Allocator> IntoIter<T, A> { self.ptr = self.buf.as_ptr(); self.end = self.buf.as_ptr(); + // Dropping the remaining elements can panic, so this needs to be + // done only after updating the other fields. unsafe { ptr::drop_in_place(remaining); } } /// Forgets to Drop the remaining elements while still allowing the backing allocation to be freed. - #[allow(dead_code)] pub(crate) fn forget_remaining_elements(&mut self) { - self.ptr = self.end; + // For th ZST case, it is crucial that we mutate `end` here, not `ptr`. + // `ptr` must stay aligned, while `end` may be unaligned. + self.end = self.ptr; + } + + #[cfg(not(no_global_oom_handling))] + #[inline] + pub(crate) fn into_vecdeque(self) -> VecDeque<T, A> { + // Keep our `Drop` impl from dropping the elements and the allocator + let mut this = ManuallyDrop::new(self); + + // SAFETY: This allocation originally came from a `Vec`, so it passes + // all those checks. We have `this.buf` ≤ `this.ptr` ≤ `this.end`, + // so the `sub_ptr`s below cannot wrap, and will produce a well-formed + // range. `end` ≤ `buf + cap`, so the range will be in-bounds. + // Taking `alloc` is ok because nothing else is going to look at it, + // since our `Drop` impl isn't going to run so there's no more code. + unsafe { + let buf = this.buf.as_ptr(); + let initialized = if T::IS_ZST { + // All the pointers are the same for ZSTs, so it's fine to + // say that they're all at the beginning of the "allocation". + 0..this.len() + } else { + this.ptr.sub_ptr(buf)..this.end.sub_ptr(buf) + }; + let cap = this.cap; + let alloc = ManuallyDrop::take(&mut this.alloc); + VecDeque::from_contiguous_raw_parts_in(buf, initialized, cap, alloc) + } } } @@ -150,19 +187,18 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> { #[inline] fn next(&mut self) -> Option<T> { - if self.ptr as *const _ == self.end { + if self.ptr == self.end { None - } else if mem::size_of::<T>() == 0 { - // purposefully don't use 'ptr.offset' because for - // vectors with 0-size elements this would return the - // same pointer. - self.ptr = unsafe { arith_offset(self.ptr as *const i8, 1) as *mut T }; + } else if T::IS_ZST { + // `ptr` has to stay where it is to remain aligned, so we reduce the length by 1 by + // reducing the `end`. + self.end = self.end.wrapping_byte_sub(1); // Make up a value of this ZST. Some(unsafe { mem::zeroed() }) } else { let old = self.ptr; - self.ptr = unsafe { self.ptr.offset(1) }; + self.ptr = unsafe { self.ptr.add(1) }; Some(unsafe { ptr::read(old) }) } @@ -170,7 +206,7 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> { #[inline] fn size_hint(&self) -> (usize, Option<usize>) { - let exact = if mem::size_of::<T>() == 0 { + let exact = if T::IS_ZST { self.end.addr().wrapping_sub(self.ptr.addr()) } else { unsafe { self.end.sub_ptr(self.ptr) } @@ -182,11 +218,9 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> { fn advance_by(&mut self, n: usize) -> Result<(), usize> { let step_size = self.len().min(n); let to_drop = ptr::slice_from_raw_parts_mut(self.ptr as *mut T, step_size); - if mem::size_of::<T>() == 0 { - // SAFETY: due to unchecked casts of unsigned amounts to signed offsets the wraparound - // effectively results in unsigned pointers representing positions 0..usize::MAX, - // which is valid for ZSTs. - self.ptr = unsafe { arith_offset(self.ptr as *const i8, step_size as isize) as *mut T } + if T::IS_ZST { + // See `next` for why we sub `end` here. + self.end = self.end.wrapping_byte_sub(step_size); } else { // SAFETY: the min() above ensures that step_size is in bounds self.ptr = unsafe { self.ptr.add(step_size) }; @@ -206,6 +240,43 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> { self.len() } + #[inline] + fn next_chunk<const N: usize>(&mut self) -> Result<[T; N], core::array::IntoIter<T, N>> { + let mut raw_ary = MaybeUninit::uninit_array(); + + let len = self.len(); + + if T::IS_ZST { + if len < N { + self.forget_remaining_elements(); + // Safety: ZSTs can be conjured ex nihilo, only the amount has to be correct + return Err(unsafe { array::IntoIter::new_unchecked(raw_ary, 0..len) }); + } + + self.end = self.end.wrapping_byte_sub(N); + // Safety: ditto + return Ok(unsafe { raw_ary.transpose().assume_init() }); + } + + if len < N { + // Safety: `len` indicates that this many elements are available and we just checked that + // it fits into the array. + unsafe { + ptr::copy_nonoverlapping(self.ptr, raw_ary.as_mut_ptr() as *mut T, len); + self.forget_remaining_elements(); + return Err(array::IntoIter::new_unchecked(raw_ary, 0..len)); + } + } + + // Safety: `len` is larger than the array size. Copy a fixed amount here to fully initialize + // the array. + return unsafe { + ptr::copy_nonoverlapping(self.ptr, raw_ary.as_mut_ptr() as *mut T, N); + self.ptr = self.ptr.add(N); + Ok(raw_ary.transpose().assume_init()) + }; + } + unsafe fn __iterator_get_unchecked(&mut self, i: usize) -> Self::Item where Self: TrustedRandomAccessNoCoerce, @@ -219,7 +290,7 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> { // that `T: Copy` so reading elements from the buffer doesn't invalidate // them for `Drop`. unsafe { - if mem::size_of::<T>() == 0 { mem::zeroed() } else { ptr::read(self.ptr.add(i)) } + if T::IS_ZST { mem::zeroed() } else { ptr::read(self.ptr.add(i)) } } } } @@ -230,14 +301,14 @@ impl<T, A: Allocator> DoubleEndedIterator for IntoIter<T, A> { fn next_back(&mut self) -> Option<T> { if self.end == self.ptr { None - } else if mem::size_of::<T>() == 0 { + } else if T::IS_ZST { // See above for why 'ptr.offset' isn't used - self.end = unsafe { arith_offset(self.end as *const i8, -1) as *mut T }; + self.end = self.end.wrapping_byte_sub(1); // Make up a value of this ZST. Some(unsafe { mem::zeroed() }) } else { - self.end = unsafe { self.end.offset(-1) }; + self.end = unsafe { self.end.sub(1) }; Some(unsafe { ptr::read(self.end) }) } @@ -246,14 +317,12 @@ impl<T, A: Allocator> DoubleEndedIterator for IntoIter<T, A> { #[inline] fn advance_back_by(&mut self, n: usize) -> Result<(), usize> { let step_size = self.len().min(n); - if mem::size_of::<T>() == 0 { + if T::IS_ZST { // SAFETY: same as for advance_by() - self.end = unsafe { - arith_offset(self.end as *const i8, step_size.wrapping_neg() as isize) as *mut T - } + self.end = self.end.wrapping_byte_sub(step_size); } else { // SAFETY: same as for advance_by() - self.end = unsafe { self.end.offset(step_size.wrapping_neg() as isize) }; + self.end = unsafe { self.end.sub(step_size) }; } let to_drop = ptr::slice_from_raw_parts_mut(self.end as *mut T, step_size); // SAFETY: same as for advance_by() diff --git a/rust/alloc/vec/is_zero.rs b/rust/alloc/vec/is_zero.rs index 377f3d172777..d928dcf90e80 100644 --- a/rust/alloc/vec/is_zero.rs +++ b/rust/alloc/vec/is_zero.rs @@ -1,10 +1,13 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT +use core::num::{Saturating, Wrapping}; + use crate::boxed::Box; #[rustc_specialization_trait] pub(super) unsafe trait IsZero { - /// Whether this value's representation is all zeros + /// Whether this value's representation is all zeros, + /// or can be represented with all zeroes. fn is_zero(&self) -> bool; } @@ -19,12 +22,14 @@ macro_rules! impl_is_zero { }; } +impl_is_zero!(i8, |x| x == 0); // It is needed to impl for arrays and tuples of i8. impl_is_zero!(i16, |x| x == 0); impl_is_zero!(i32, |x| x == 0); impl_is_zero!(i64, |x| x == 0); impl_is_zero!(i128, |x| x == 0); impl_is_zero!(isize, |x| x == 0); +impl_is_zero!(u8, |x| x == 0); // It is needed to impl for arrays and tuples of u8. impl_is_zero!(u16, |x| x == 0); impl_is_zero!(u32, |x| x == 0); impl_is_zero!(u64, |x| x == 0); @@ -55,16 +60,42 @@ unsafe impl<T: IsZero, const N: usize> IsZero for [T; N] { #[inline] fn is_zero(&self) -> bool { // Because this is generated as a runtime check, it's not obvious that - // it's worth doing if the array is really long. The threshold here - // is largely arbitrary, but was picked because as of 2022-05-01 LLVM - // can const-fold the check in `vec![[0; 32]; n]` but not in - // `vec![[0; 64]; n]`: https://godbolt.org/z/WTzjzfs5b + // it's worth doing if the array is really long. The threshold here + // is largely arbitrary, but was picked because as of 2022-07-01 LLVM + // fails to const-fold the check in `vec![[1; 32]; n]` + // See https://github.com/rust-lang/rust/pull/97581#issuecomment-1166628022 // Feel free to tweak if you have better evidence. - N <= 32 && self.iter().all(IsZero::is_zero) + N <= 16 && self.iter().all(IsZero::is_zero) + } +} + +// This is recursive macro. +macro_rules! impl_for_tuples { + // Stopper + () => { + // No use for implementing for empty tuple because it is ZST. + }; + ($first_arg:ident $(,$rest:ident)*) => { + unsafe impl <$first_arg: IsZero, $($rest: IsZero,)*> IsZero for ($first_arg, $($rest,)*){ + #[inline] + fn is_zero(&self) -> bool{ + // Destructure tuple to N references + // Rust allows to hide generic params by local variable names. + #[allow(non_snake_case)] + let ($first_arg, $($rest,)*) = self; + + $first_arg.is_zero() + $( && $rest.is_zero() )* + } + } + + impl_for_tuples!($($rest),*); } } +impl_for_tuples!(A, B, C, D, E, F, G, H); + // `Option<&T>` and `Option<Box<T>>` are guaranteed to represent `None` as null. // For fat pointers, the bytes that would be the pointer metadata in the `Some` // variant are padding in the `None` variant, so ignoring them and @@ -118,3 +149,56 @@ impl_is_zero_option_of_nonzero!( NonZeroUsize, NonZeroIsize, ); + +macro_rules! impl_is_zero_option_of_num { + ($($t:ty,)+) => {$( + unsafe impl IsZero for Option<$t> { + #[inline] + fn is_zero(&self) -> bool { + const { + let none: Self = unsafe { core::mem::MaybeUninit::zeroed().assume_init() }; + assert!(none.is_none()); + } + self.is_none() + } + } + )+}; +} + +impl_is_zero_option_of_num!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128, usize, isize,); + +unsafe impl<T: IsZero> IsZero for Wrapping<T> { + #[inline] + fn is_zero(&self) -> bool { + self.0.is_zero() + } +} + +unsafe impl<T: IsZero> IsZero for Saturating<T> { + #[inline] + fn is_zero(&self) -> bool { + self.0.is_zero() + } +} + +macro_rules! impl_for_optional_bool { + ($($t:ty,)+) => {$( + unsafe impl IsZero for $t { + #[inline] + fn is_zero(&self) -> bool { + // SAFETY: This is *not* a stable layout guarantee, but + // inside `core` we're allowed to rely on the current rustc + // behaviour that options of bools will be one byte with + // no padding, so long as they're nested less than 254 deep. + let raw: u8 = unsafe { core::mem::transmute(*self) }; + raw == 0 + } + } + )+}; +} +impl_for_optional_bool! { + Option<bool>, + Option<Option<bool>>, + Option<Option<Option<bool>>>, + // Could go further, but not worth the metadata overhead +} diff --git a/rust/alloc/vec/mod.rs b/rust/alloc/vec/mod.rs index fe4fff5064bc..94995913566b 100644 --- a/rust/alloc/vec/mod.rs +++ b/rust/alloc/vec/mod.rs @@ -61,12 +61,12 @@ use core::cmp::Ordering; use core::convert::TryFrom; use core::fmt; use core::hash::{Hash, Hasher}; -use core::intrinsics::{arith_offset, assume}; +use core::intrinsics::assume; use core::iter; #[cfg(not(no_global_oom_handling))] use core::iter::FromIterator; use core::marker::PhantomData; -use core::mem::{self, ManuallyDrop, MaybeUninit}; +use core::mem::{self, ManuallyDrop, MaybeUninit, SizedTypeProperties}; use core::ops::{self, Index, IndexMut, Range, RangeBounds}; use core::ptr::{self, NonNull}; use core::slice::{self, SliceIndex}; @@ -75,7 +75,7 @@ use crate::alloc::{Allocator, Global}; #[cfg(not(no_borrow))] use crate::borrow::{Cow, ToOwned}; use crate::boxed::Box; -use crate::collections::TryReserveError; +use crate::collections::{TryReserveError, TryReserveErrorKind}; use crate::raw_vec::RawVec; #[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")] @@ -127,7 +127,7 @@ use self::set_len_on_drop::SetLenOnDrop; mod set_len_on_drop; #[cfg(not(no_global_oom_handling))] -use self::in_place_drop::InPlaceDrop; +use self::in_place_drop::{InPlaceDrop, InPlaceDstBufDrop}; #[cfg(not(no_global_oom_handling))] mod in_place_drop; @@ -169,7 +169,7 @@ mod spec_extend; /// vec[0] = 7; /// assert_eq!(vec[0], 7); /// -/// vec.extend([1, 2, 3].iter().copied()); +/// vec.extend([1, 2, 3]); /// /// for x in &vec { /// println!("{x}"); @@ -428,17 +428,25 @@ impl<T> Vec<T> { Vec { buf: RawVec::NEW, len: 0 } } - /// Constructs a new, empty `Vec<T>` with the specified capacity. + /// Constructs a new, empty `Vec<T>` with at least the specified capacity. /// - /// The vector will be able to hold exactly `capacity` elements without - /// reallocating. If `capacity` is 0, the vector will not allocate. + /// The vector will be able to hold at least `capacity` elements without + /// reallocating. This method is allowed to allocate for more elements than + /// `capacity`. If `capacity` is 0, the vector will not allocate. /// /// It is important to note that although the returned vector has the - /// *capacity* specified, the vector will have a zero *length*. For an - /// explanation of the difference between length and capacity, see + /// minimum *capacity* specified, the vector will have a zero *length*. For + /// an explanation of the difference between length and capacity, see /// *[Capacity and reallocation]*. /// + /// If it is important to know the exact allocated capacity of a `Vec`, + /// always use the [`capacity`] method after construction. + /// + /// For `Vec<T>` where `T` is a zero-sized type, there will be no allocation + /// and the capacity will always be `usize::MAX`. + /// /// [Capacity and reallocation]: #capacity-and-reallocation + /// [`capacity`]: Vec::capacity /// /// # Panics /// @@ -451,19 +459,24 @@ impl<T> Vec<T> { /// /// // The vector contains no items, even though it has capacity for more /// assert_eq!(vec.len(), 0); - /// assert_eq!(vec.capacity(), 10); + /// assert!(vec.capacity() >= 10); /// /// // These are all done without reallocating... /// for i in 0..10 { /// vec.push(i); /// } /// assert_eq!(vec.len(), 10); - /// assert_eq!(vec.capacity(), 10); + /// assert!(vec.capacity() >= 10); /// /// // ...but this may make the vector reallocate /// vec.push(11); /// assert_eq!(vec.len(), 11); /// assert!(vec.capacity() >= 11); + /// + /// // A vector of a zero-sized type will always over-allocate, since no + /// // allocation is necessary + /// let vec_units = Vec::<()>::with_capacity(10); + /// assert_eq!(vec_units.capacity(), usize::MAX); /// ``` #[cfg(not(no_global_oom_handling))] #[inline] @@ -473,17 +486,25 @@ impl<T> Vec<T> { Self::with_capacity_in(capacity, Global) } - /// Tries to construct a new, empty `Vec<T>` with the specified capacity. + /// Tries to construct a new, empty `Vec<T>` with at least the specified capacity. /// - /// The vector will be able to hold exactly `capacity` elements without - /// reallocating. If `capacity` is 0, the vector will not allocate. + /// The vector will be able to hold at least `capacity` elements without + /// reallocating. This method is allowed to allocate for more elements than + /// `capacity`. If `capacity` is 0, the vector will not allocate. /// /// It is important to note that although the returned vector has the - /// *capacity* specified, the vector will have a zero *length*. For an - /// explanation of the difference between length and capacity, see + /// minimum *capacity* specified, the vector will have a zero *length*. For + /// an explanation of the difference between length and capacity, see /// *[Capacity and reallocation]*. /// + /// If it is important to know the exact allocated capacity of a `Vec`, + /// always use the [`capacity`] method after construction. + /// + /// For `Vec<T>` where `T` is a zero-sized type, there will be no allocation + /// and the capacity will always be `usize::MAX`. + /// /// [Capacity and reallocation]: #capacity-and-reallocation + /// [`capacity`]: Vec::capacity /// /// # Examples /// @@ -492,14 +513,14 @@ impl<T> Vec<T> { /// /// // The vector contains no items, even though it has capacity for more /// assert_eq!(vec.len(), 0); - /// assert_eq!(vec.capacity(), 10); + /// assert!(vec.capacity() >= 10); /// /// // These are all done without reallocating... /// for i in 0..10 { /// vec.push(i); /// } /// assert_eq!(vec.len(), 10); - /// assert_eq!(vec.capacity(), 10); + /// assert!(vec.capacity() >= 10); /// /// // ...but this may make the vector reallocate /// vec.push(11); @@ -508,6 +529,11 @@ impl<T> Vec<T> { /// /// let mut result = Vec::try_with_capacity(usize::MAX); /// assert!(result.is_err()); + /// + /// // A vector of a zero-sized type will always over-allocate, since no + /// // allocation is necessary + /// let vec_units = Vec::<()>::try_with_capacity(10).unwrap(); + /// assert_eq!(vec_units.capacity(), usize::MAX); /// ``` #[inline] #[stable(feature = "kernel", since = "1.0.0")] @@ -515,15 +541,15 @@ impl<T> Vec<T> { Self::try_with_capacity_in(capacity, Global) } - /// Creates a `Vec<T>` directly from the raw components of another vector. + /// Creates a `Vec<T>` directly from a pointer, a capacity, and a length. /// /// # Safety /// /// This is highly unsafe, due to the number of invariants that aren't /// checked: /// - /// * `ptr` needs to have been previously allocated via [`String`]/`Vec<T>` - /// (at least, it's highly likely to be incorrect if it wasn't). + /// * `ptr` must have been allocated using the global allocator, such as via + /// the [`alloc::alloc`] function. /// * `T` needs to have the same alignment as what `ptr` was allocated with. /// (`T` having a less strict alignment is not sufficient, the alignment really /// needs to be equal to satisfy the [`dealloc`] requirement that memory must be @@ -532,6 +558,14 @@ impl<T> Vec<T> { /// to be the same size as the pointer was allocated with. (Because similar to /// alignment, [`dealloc`] must be called with the same layout `size`.) /// * `length` needs to be less than or equal to `capacity`. + /// * The first `length` values must be properly initialized values of type `T`. + /// * `capacity` needs to be the capacity that the pointer was allocated with. + /// * The allocated size in bytes must be no larger than `isize::MAX`. + /// See the safety documentation of [`pointer::offset`]. + /// + /// These requirements are always upheld by any `ptr` that has been allocated + /// via `Vec<T>`. Other allocation sources are allowed if the invariants are + /// upheld. /// /// Violating these may cause problems like corrupting the allocator's /// internal data structures. For example it is normally **not** safe @@ -552,6 +586,7 @@ impl<T> Vec<T> { /// function. /// /// [`String`]: crate::string::String + /// [`alloc::alloc`]: crate::alloc::alloc /// [`dealloc`]: crate::alloc::GlobalAlloc::dealloc /// /// # Examples @@ -574,8 +609,8 @@ impl<T> Vec<T> { /// /// unsafe { /// // Overwrite memory with 4, 5, 6 - /// for i in 0..len as isize { - /// ptr::write(p.offset(i), 4 + i); + /// for i in 0..len { + /// ptr::write(p.add(i), 4 + i); /// } /// /// // Put everything back together into a Vec @@ -583,6 +618,32 @@ impl<T> Vec<T> { /// assert_eq!(rebuilt, [4, 5, 6]); /// } /// ``` + /// + /// Using memory that was allocated elsewhere: + /// + /// ```rust + /// #![feature(allocator_api)] + /// + /// use std::alloc::{AllocError, Allocator, Global, Layout}; + /// + /// fn main() { + /// let layout = Layout::array::<u32>(16).expect("overflow cannot happen"); + /// + /// let vec = unsafe { + /// let mem = match Global.allocate(layout) { + /// Ok(mem) => mem.cast::<u32>().as_ptr(), + /// Err(AllocError) => return, + /// }; + /// + /// mem.write(1_000_000); + /// + /// Vec::from_raw_parts_in(mem, 1, 16, Global) + /// }; + /// + /// assert_eq!(vec, &[1_000_000]); + /// assert_eq!(vec.capacity(), 16); + /// } + /// ``` #[inline] #[stable(feature = "rust1", since = "1.0.0")] pub unsafe fn from_raw_parts(ptr: *mut T, length: usize, capacity: usize) -> Self { @@ -611,18 +672,26 @@ impl<T, A: Allocator> Vec<T, A> { Vec { buf: RawVec::new_in(alloc), len: 0 } } - /// Constructs a new, empty `Vec<T, A>` with the specified capacity with the provided - /// allocator. + /// Constructs a new, empty `Vec<T, A>` with at least the specified capacity + /// with the provided allocator. /// - /// The vector will be able to hold exactly `capacity` elements without - /// reallocating. If `capacity` is 0, the vector will not allocate. + /// The vector will be able to hold at least `capacity` elements without + /// reallocating. This method is allowed to allocate for more elements than + /// `capacity`. If `capacity` is 0, the vector will not allocate. /// /// It is important to note that although the returned vector has the - /// *capacity* specified, the vector will have a zero *length*. For an - /// explanation of the difference between length and capacity, see + /// minimum *capacity* specified, the vector will have a zero *length*. For + /// an explanation of the difference between length and capacity, see /// *[Capacity and reallocation]*. /// + /// If it is important to know the exact allocated capacity of a `Vec`, + /// always use the [`capacity`] method after construction. + /// + /// For `Vec<T, A>` where `T` is a zero-sized type, there will be no allocation + /// and the capacity will always be `usize::MAX`. + /// /// [Capacity and reallocation]: #capacity-and-reallocation + /// [`capacity`]: Vec::capacity /// /// # Panics /// @@ -652,6 +721,11 @@ impl<T, A: Allocator> Vec<T, A> { /// vec.push(11); /// assert_eq!(vec.len(), 11); /// assert!(vec.capacity() >= 11); + /// + /// // A vector of a zero-sized type will always over-allocate, since no + /// // allocation is necessary + /// let vec_units = Vec::<(), System>::with_capacity_in(10, System); + /// assert_eq!(vec_units.capacity(), usize::MAX); /// ``` #[cfg(not(no_global_oom_handling))] #[inline] @@ -660,18 +734,26 @@ impl<T, A: Allocator> Vec<T, A> { Vec { buf: RawVec::with_capacity_in(capacity, alloc), len: 0 } } - /// Tries to construct a new, empty `Vec<T, A>` with the specified capacity + /// Tries to construct a new, empty `Vec<T, A>` with at least the specified capacity /// with the provided allocator. /// - /// The vector will be able to hold exactly `capacity` elements without - /// reallocating. If `capacity` is 0, the vector will not allocate. + /// The vector will be able to hold at least `capacity` elements without + /// reallocating. This method is allowed to allocate for more elements than + /// `capacity`. If `capacity` is 0, the vector will not allocate. /// /// It is important to note that although the returned vector has the - /// *capacity* specified, the vector will have a zero *length*. For an - /// explanation of the difference between length and capacity, see + /// minimum *capacity* specified, the vector will have a zero *length*. For + /// an explanation of the difference between length and capacity, see /// *[Capacity and reallocation]*. /// + /// If it is important to know the exact allocated capacity of a `Vec`, + /// always use the [`capacity`] method after construction. + /// + /// For `Vec<T, A>` where `T` is a zero-sized type, there will be no allocation + /// and the capacity will always be `usize::MAX`. + /// /// [Capacity and reallocation]: #capacity-and-reallocation + /// [`capacity`]: Vec::capacity /// /// # Examples /// @@ -700,6 +782,11 @@ impl<T, A: Allocator> Vec<T, A> { /// /// let mut result = Vec::try_with_capacity_in(usize::MAX, System); /// assert!(result.is_err()); + /// + /// // A vector of a zero-sized type will always over-allocate, since no + /// // allocation is necessary + /// let vec_units = Vec::<(), System>::try_with_capacity_in(10, System).unwrap(); + /// assert_eq!(vec_units.capacity(), usize::MAX); /// ``` #[inline] #[stable(feature = "kernel", since = "1.0.0")] @@ -707,21 +794,31 @@ impl<T, A: Allocator> Vec<T, A> { Ok(Vec { buf: RawVec::try_with_capacity_in(capacity, alloc)?, len: 0 }) } - /// Creates a `Vec<T, A>` directly from the raw components of another vector. + /// Creates a `Vec<T, A>` directly from a pointer, a capacity, a length, + /// and an allocator. /// /// # Safety /// /// This is highly unsafe, due to the number of invariants that aren't /// checked: /// - /// * `ptr` needs to have been previously allocated via [`String`]/`Vec<T>` - /// (at least, it's highly likely to be incorrect if it wasn't). - /// * `T` needs to have the same size and alignment as what `ptr` was allocated with. + /// * `ptr` must be [*currently allocated*] via the given allocator `alloc`. + /// * `T` needs to have the same alignment as what `ptr` was allocated with. /// (`T` having a less strict alignment is not sufficient, the alignment really /// needs to be equal to satisfy the [`dealloc`] requirement that memory must be /// allocated and deallocated with the same layout.) + /// * The size of `T` times the `capacity` (ie. the allocated size in bytes) needs + /// to be the same size as the pointer was allocated with. (Because similar to + /// alignment, [`dealloc`] must be called with the same layout `size`.) /// * `length` needs to be less than or equal to `capacity`. - /// * `capacity` needs to be the capacity that the pointer was allocated with. + /// * The first `length` values must be properly initialized values of type `T`. + /// * `capacity` needs to [*fit*] the layout size that the pointer was allocated with. + /// * The allocated size in bytes must be no larger than `isize::MAX`. + /// See the safety documentation of [`pointer::offset`]. + /// + /// These requirements are always upheld by any `ptr` that has been allocated + /// via `Vec<T, A>`. Other allocation sources are allowed if the invariants are + /// upheld. /// /// Violating these may cause problems like corrupting the allocator's /// internal data structures. For example it is **not** safe @@ -739,6 +836,8 @@ impl<T, A: Allocator> Vec<T, A> { /// /// [`String`]: crate::string::String /// [`dealloc`]: crate::alloc::GlobalAlloc::dealloc + /// [*currently allocated*]: crate::alloc::Allocator#currently-allocated-memory + /// [*fit*]: crate::alloc::Allocator#memory-fitting /// /// # Examples /// @@ -768,8 +867,8 @@ impl<T, A: Allocator> Vec<T, A> { /// /// unsafe { /// // Overwrite memory with 4, 5, 6 - /// for i in 0..len as isize { - /// ptr::write(p.offset(i), 4 + i); + /// for i in 0..len { + /// ptr::write(p.add(i), 4 + i); /// } /// /// // Put everything back together into a Vec @@ -777,6 +876,29 @@ impl<T, A: Allocator> Vec<T, A> { /// assert_eq!(rebuilt, [4, 5, 6]); /// } /// ``` + /// + /// Using memory that was allocated elsewhere: + /// + /// ```rust + /// use std::alloc::{alloc, Layout}; + /// + /// fn main() { + /// let layout = Layout::array::<u32>(16).expect("overflow cannot happen"); + /// let vec = unsafe { + /// let mem = alloc(layout).cast::<u32>(); + /// if mem.is_null() { + /// return; + /// } + /// + /// mem.write(1_000_000); + /// + /// Vec::from_raw_parts(mem, 1, 16) + /// }; + /// + /// assert_eq!(vec, &[1_000_000]); + /// assert_eq!(vec.capacity(), 16); + /// } + /// ``` #[inline] #[unstable(feature = "allocator_api", issue = "32838")] pub unsafe fn from_raw_parts_in(ptr: *mut T, length: usize, capacity: usize, alloc: A) -> Self { @@ -869,13 +991,14 @@ impl<T, A: Allocator> Vec<T, A> { (ptr, len, capacity, alloc) } - /// Returns the number of elements the vector can hold without + /// Returns the total number of elements the vector can hold without /// reallocating. /// /// # Examples /// /// ``` - /// let vec: Vec<i32> = Vec::with_capacity(10); + /// let mut vec: Vec<i32> = Vec::with_capacity(10); + /// vec.push(42); /// assert_eq!(vec.capacity(), 10); /// ``` #[inline] @@ -885,10 +1008,10 @@ impl<T, A: Allocator> Vec<T, A> { } /// Reserves capacity for at least `additional` more elements to be inserted - /// in the given `Vec<T>`. The collection may reserve more space to avoid - /// frequent reallocations. After calling `reserve`, capacity will be - /// greater than or equal to `self.len() + additional`. Does nothing if - /// capacity is already sufficient. + /// in the given `Vec<T>`. The collection may reserve more space to + /// speculatively avoid frequent reallocations. After calling `reserve`, + /// capacity will be greater than or equal to `self.len() + additional`. + /// Does nothing if capacity is already sufficient. /// /// # Panics /// @@ -907,10 +1030,12 @@ impl<T, A: Allocator> Vec<T, A> { self.buf.reserve(self.len, additional); } - /// Reserves the minimum capacity for exactly `additional` more elements to - /// be inserted in the given `Vec<T>`. After calling `reserve_exact`, - /// capacity will be greater than or equal to `self.len() + additional`. - /// Does nothing if the capacity is already sufficient. + /// Reserves the minimum capacity for at least `additional` more elements to + /// be inserted in the given `Vec<T>`. Unlike [`reserve`], this will not + /// deliberately over-allocate to speculatively avoid frequent allocations. + /// After calling `reserve_exact`, capacity will be greater than or equal to + /// `self.len() + additional`. Does nothing if the capacity is already + /// sufficient. /// /// Note that the allocator may give the collection more space than it /// requests. Therefore, capacity can not be relied upon to be precisely @@ -936,10 +1061,11 @@ impl<T, A: Allocator> Vec<T, A> { } /// Tries to reserve capacity for at least `additional` more elements to be inserted - /// in the given `Vec<T>`. The collection may reserve more space to avoid + /// in the given `Vec<T>`. The collection may reserve more space to speculatively avoid /// frequent reallocations. After calling `try_reserve`, capacity will be - /// greater than or equal to `self.len() + additional`. Does nothing if - /// capacity is already sufficient. + /// greater than or equal to `self.len() + additional` if it returns + /// `Ok(())`. Does nothing if capacity is already sufficient. This method + /// preserves the contents even if an error occurs. /// /// # Errors /// @@ -971,10 +1097,11 @@ impl<T, A: Allocator> Vec<T, A> { self.buf.try_reserve(self.len, additional) } - /// Tries to reserve the minimum capacity for exactly `additional` - /// elements to be inserted in the given `Vec<T>`. After calling - /// `try_reserve_exact`, capacity will be greater than or equal to - /// `self.len() + additional` if it returns `Ok(())`. + /// Tries to reserve the minimum capacity for at least `additional` + /// elements to be inserted in the given `Vec<T>`. Unlike [`try_reserve`], + /// this will not deliberately over-allocate to speculatively avoid frequent + /// allocations. After calling `try_reserve_exact`, capacity will be greater + /// than or equal to `self.len() + additional` if it returns `Ok(())`. /// Does nothing if the capacity is already sufficient. /// /// Note that the allocator may give the collection more space than it @@ -1066,7 +1193,8 @@ impl<T, A: Allocator> Vec<T, A> { /// Converts the vector into [`Box<[T]>`][owned slice]. /// - /// Note that this will drop any excess capacity. + /// If the vector has excess capacity, its items will be moved into a + /// newly-allocated buffer with exactly the right capacity. /// /// [owned slice]: Box /// @@ -1199,7 +1327,8 @@ impl<T, A: Allocator> Vec<T, A> { self } - /// Returns a raw pointer to the vector's buffer. + /// Returns a raw pointer to the vector's buffer, or a dangling raw pointer + /// valid for zero sized reads if the vector didn't allocate. /// /// The caller must ensure that the vector outlives the pointer this /// function returns, or else it will end up pointing to garbage. @@ -1236,7 +1365,8 @@ impl<T, A: Allocator> Vec<T, A> { ptr } - /// Returns an unsafe mutable pointer to the vector's buffer. + /// Returns an unsafe mutable pointer to the vector's buffer, or a dangling + /// raw pointer valid for zero sized reads if the vector didn't allocate. /// /// The caller must ensure that the vector outlives the pointer this /// function returns, or else it will end up pointing to garbage. @@ -1440,9 +1570,6 @@ impl<T, A: Allocator> Vec<T, A> { } let len = self.len(); - if index > len { - assert_failed(index, len); - } // space for the new element if len == self.buf.capacity() { @@ -1454,9 +1581,15 @@ impl<T, A: Allocator> Vec<T, A> { // The spot to put the new value { let p = self.as_mut_ptr().add(index); - // Shift everything over to make space. (Duplicating the - // `index`th element into two consecutive places.) - ptr::copy(p, p.offset(1), len - index); + if index < len { + // Shift everything over to make space. (Duplicating the + // `index`th element into two consecutive places.) + ptr::copy(p, p.add(1), len - index); + } else if index == len { + // No elements need shifting. + } else { + assert_failed(index, len); + } // Write it in, overwriting the first copy of the `index`th // element. ptr::write(p, element); @@ -1513,7 +1646,7 @@ impl<T, A: Allocator> Vec<T, A> { ret = ptr::read(ptr); // Shift everything down to fill in that spot. - ptr::copy(ptr.offset(1), ptr, len - index - 1); + ptr::copy(ptr.add(1), ptr, len - index - 1); } self.set_len(len - 1); ret @@ -1562,11 +1695,11 @@ impl<T, A: Allocator> Vec<T, A> { /// /// ``` /// let mut vec = vec![1, 2, 3, 4]; - /// vec.retain_mut(|x| if *x > 3 { - /// false - /// } else { + /// vec.retain_mut(|x| if *x <= 3 { /// *x += 1; /// true + /// } else { + /// false /// }); /// assert_eq!(vec, [2, 3, 4]); /// ``` @@ -1854,6 +1987,51 @@ impl<T, A: Allocator> Vec<T, A> { Ok(()) } + /// Appends an element if there is sufficient spare capacity, otherwise an error is returned + /// with the element. + /// + /// Unlike [`push`] this method will not reallocate when there's insufficient capacity. + /// The caller should use [`reserve`] or [`try_reserve`] to ensure that there is enough capacity. + /// + /// [`push`]: Vec::push + /// [`reserve`]: Vec::reserve + /// [`try_reserve`]: Vec::try_reserve + /// + /// # Examples + /// + /// A manual, panic-free alternative to [`FromIterator`]: + /// + /// ``` + /// #![feature(vec_push_within_capacity)] + /// + /// use std::collections::TryReserveError; + /// fn from_iter_fallible<T>(iter: impl Iterator<Item=T>) -> Result<Vec<T>, TryReserveError> { + /// let mut vec = Vec::new(); + /// for value in iter { + /// if let Err(value) = vec.push_within_capacity(value) { + /// vec.try_reserve(1)?; + /// // this cannot fail, the previous line either returned or added at least 1 free slot + /// let _ = vec.push_within_capacity(value); + /// } + /// } + /// Ok(vec) + /// } + /// assert_eq!(from_iter_fallible(0..100), Ok(Vec::from_iter(0..100))); + /// ``` + #[inline] + #[unstable(feature = "vec_push_within_capacity", issue = "100486")] + pub fn push_within_capacity(&mut self, value: T) -> Result<(), T> { + if self.len == self.buf.capacity() { + return Err(value); + } + unsafe { + let end = self.as_mut_ptr().add(self.len); + ptr::write(end, value); + self.len += 1; + } + Ok(()) + } + /// Removes the last element from a vector and returns it, or [`None`] if it /// is empty. /// @@ -1886,7 +2064,7 @@ impl<T, A: Allocator> Vec<T, A> { /// /// # Panics /// - /// Panics if the number of elements in the vector overflows a `usize`. + /// Panics if the new capacity exceeds `isize::MAX` bytes. /// /// # Examples /// @@ -1980,9 +2158,7 @@ impl<T, A: Allocator> Vec<T, A> { unsafe { // set self.vec length's to start, to be safe in case Drain is leaked self.set_len(start); - // Use the borrow in the IterMut to indicate borrowing behavior of the - // whole Drain iterator (like &mut T). - let range_slice = slice::from_raw_parts_mut(self.as_mut_ptr().add(start), end - start); + let range_slice = slice::from_raw_parts(self.as_ptr().add(start), end - start); Drain { tail_start: end, tail_len: len - end, @@ -2145,7 +2321,7 @@ impl<T, A: Allocator> Vec<T, A> { { let len = self.len(); if new_len > len { - self.extend_with(new_len - len, ExtendFunc(f)); + self.extend_trusted(iter::repeat_with(f).take(new_len - len)); } else { self.truncate(new_len); } @@ -2174,7 +2350,6 @@ impl<T, A: Allocator> Vec<T, A> { /// static_ref[0] += 1; /// assert_eq!(static_ref, &[2, 2, 3]); /// ``` - #[cfg(not(no_global_oom_handling))] #[stable(feature = "vec_leak", since = "1.47.0")] #[inline] pub fn leak<'a>(self) -> &'a mut [T] @@ -2469,7 +2644,7 @@ impl<T: Clone, A: Allocator> Vec<T, A> { self.reserve(range.len()); // SAFETY: - // - `slice::range` guarantees that the given range is valid for indexing self + // - `slice::range` guarantees that the given range is valid for indexing self unsafe { self.spec_extend_from_within(range); } @@ -2501,7 +2676,7 @@ impl<T, A: Allocator, const N: usize> Vec<[T; N], A> { #[unstable(feature = "slice_flatten", issue = "95629")] pub fn into_flattened(self) -> Vec<T, A> { let (ptr, len, cap, alloc) = self.into_raw_parts_with_alloc(); - let (new_len, new_cap) = if mem::size_of::<T>() == 0 { + let (new_len, new_cap) = if T::IS_ZST { (len.checked_mul(N).expect("vec len overflow"), usize::MAX) } else { // SAFETY: @@ -2537,16 +2712,6 @@ impl<T: Clone> ExtendWith<T> for ExtendElement<T> { } } -struct ExtendFunc<F>(F); -impl<T, F: FnMut() -> T> ExtendWith<T> for ExtendFunc<F> { - fn next(&mut self) -> T { - (self.0)() - } - fn last(mut self) -> T { - (self.0)() - } -} - impl<T, A: Allocator> Vec<T, A> { #[cfg(not(no_global_oom_handling))] /// Extend the vector by `n` values, using the given generator. @@ -2563,7 +2728,7 @@ impl<T, A: Allocator> Vec<T, A> { // Write all elements except the last one for _ in 1..n { ptr::write(ptr, value.next()); - ptr = ptr.offset(1); + ptr = ptr.add(1); // Increment the length in every step in case next() panics local_len.increment_len(1); } @@ -2592,7 +2757,7 @@ impl<T, A: Allocator> Vec<T, A> { // Write all elements except the last one for _ in 1..n { ptr::write(ptr, value.next()); - ptr = ptr.offset(1); + ptr = ptr.add(1); // Increment the length in every step in case next() panics local_len.increment_len(1); } @@ -2664,7 +2829,7 @@ impl<T: Clone, A: Allocator> ExtendFromWithinSpec for Vec<T, A> { let (this, spare, len) = unsafe { self.split_at_spare_mut_with_len() }; // SAFETY: - // - caller guaratees that src is a valid index + // - caller guarantees that src is a valid index let to_clone = unsafe { this.get_unchecked(src) }; iter::zip(to_clone, spare) @@ -2683,13 +2848,13 @@ impl<T: Copy, A: Allocator> ExtendFromWithinSpec for Vec<T, A> { let (init, spare) = self.split_at_spare_mut(); // SAFETY: - // - caller guaratees that `src` is a valid index + // - caller guarantees that `src` is a valid index let source = unsafe { init.get_unchecked(src) }; // SAFETY: // - Both pointers are created from unique slice references (`&mut [_]`) // so they are valid and do not overlap. - // - Elements are :Copy so it's OK to to copy them, without doing + // - Elements are :Copy so it's OK to copy them, without doing // anything with the original values // - `count` is equal to the len of `source`, so source is valid for // `count` reads @@ -2712,6 +2877,7 @@ impl<T: Copy, A: Allocator> ExtendFromWithinSpec for Vec<T, A> { impl<T, A: Allocator> ops::Deref for Vec<T, A> { type Target = [T]; + #[inline] fn deref(&self) -> &[T] { unsafe { slice::from_raw_parts(self.as_ptr(), self.len) } } @@ -2719,6 +2885,7 @@ impl<T, A: Allocator> ops::Deref for Vec<T, A> { #[stable(feature = "rust1", since = "1.0.0")] impl<T, A: Allocator> ops::DerefMut for Vec<T, A> { + #[inline] fn deref_mut(&mut self) -> &mut [T] { unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) } } @@ -2764,7 +2931,7 @@ impl<T: Clone, A: Allocator + Clone> Clone for Vec<T, A> { // HACK(japaric): with cfg(test) the inherent `[T]::to_vec` method, which is // required for this method definition, is not available. Instead use the - // `slice::to_vec` function which is only available with cfg(test) + // `slice::to_vec` function which is only available with cfg(test) // NB see the slice::hack module in slice.rs for more information #[cfg(test)] fn clone(&self) -> Self { @@ -2845,19 +3012,22 @@ impl<T, A: Allocator> IntoIterator for Vec<T, A> { /// /// ``` /// let v = vec!["a".to_string(), "b".to_string()]; - /// for s in v.into_iter() { - /// // s has type String, not &String - /// println!("{s}"); - /// } + /// let mut v_iter = v.into_iter(); + /// + /// let first_element: Option<String> = v_iter.next(); + /// + /// assert_eq!(first_element, Some("a".to_string())); + /// assert_eq!(v_iter.next(), Some("b".to_string())); + /// assert_eq!(v_iter.next(), None); /// ``` #[inline] - fn into_iter(self) -> IntoIter<T, A> { + fn into_iter(self) -> Self::IntoIter { unsafe { let mut me = ManuallyDrop::new(self); let alloc = ManuallyDrop::new(ptr::read(me.allocator())); let begin = me.as_mut_ptr(); - let end = if mem::size_of::<T>() == 0 { - arith_offset(begin as *const i8, me.len() as isize) as *const T + let end = if T::IS_ZST { + begin.wrapping_byte_add(me.len()) } else { begin.add(me.len()) as *const T }; @@ -2879,7 +3049,7 @@ impl<'a, T, A: Allocator> IntoIterator for &'a Vec<T, A> { type Item = &'a T; type IntoIter = slice::Iter<'a, T>; - fn into_iter(self) -> slice::Iter<'a, T> { + fn into_iter(self) -> Self::IntoIter { self.iter() } } @@ -2889,7 +3059,7 @@ impl<'a, T, A: Allocator> IntoIterator for &'a mut Vec<T, A> { type Item = &'a mut T; type IntoIter = slice::IterMut<'a, T>; - fn into_iter(self) -> slice::IterMut<'a, T> { + fn into_iter(self) -> Self::IntoIter { self.iter_mut() } } @@ -2969,6 +3139,69 @@ impl<T, A: Allocator> Vec<T, A> { Ok(()) } + // specific extend for `TrustedLen` iterators, called both by the specializations + // and internal places where resolving specialization makes compilation slower + #[cfg(not(no_global_oom_handling))] + fn extend_trusted(&mut self, iterator: impl iter::TrustedLen<Item = T>) { + let (low, high) = iterator.size_hint(); + if let Some(additional) = high { + debug_assert_eq!( + low, + additional, + "TrustedLen iterator's size hint is not exact: {:?}", + (low, high) + ); + self.reserve(additional); + unsafe { + let ptr = self.as_mut_ptr(); + let mut local_len = SetLenOnDrop::new(&mut self.len); + iterator.for_each(move |element| { + ptr::write(ptr.add(local_len.current_len()), element); + // Since the loop executes user code which can panic we have to update + // the length every step to correctly drop what we've written. + // NB can't overflow since we would have had to alloc the address space + local_len.increment_len(1); + }); + } + } else { + // Per TrustedLen contract a `None` upper bound means that the iterator length + // truly exceeds usize::MAX, which would eventually lead to a capacity overflow anyway. + // Since the other branch already panics eagerly (via `reserve()`) we do the same here. + // This avoids additional codegen for a fallback code path which would eventually + // panic anyway. + panic!("capacity overflow"); + } + } + + // specific extend for `TrustedLen` iterators, called both by the specializations + // and internal places where resolving specialization makes compilation slower + fn try_extend_trusted(&mut self, iterator: impl iter::TrustedLen<Item = T>) -> Result<(), TryReserveError> { + let (low, high) = iterator.size_hint(); + if let Some(additional) = high { + debug_assert_eq!( + low, + additional, + "TrustedLen iterator's size hint is not exact: {:?}", + (low, high) + ); + self.try_reserve(additional)?; + unsafe { + let ptr = self.as_mut_ptr(); + let mut local_len = SetLenOnDrop::new(&mut self.len); + iterator.for_each(move |element| { + ptr::write(ptr.add(local_len.current_len()), element); + // Since the loop executes user code which can panic we have to update + // the length every step to correctly drop what we've written. + // NB can't overflow since we would have had to alloc the address space + local_len.increment_len(1); + }); + } + Ok(()) + } else { + Err(TryReserveErrorKind::CapacityOverflow.into()) + } + } + /// Creates a splicing iterator that replaces the specified range in the vector /// with the given `replace_with` iterator and yields the removed items. /// `replace_with` does not need to be the same length as `range`. @@ -3135,6 +3368,8 @@ unsafe impl<#[may_dangle] T, A: Allocator> Drop for Vec<T, A> { #[rustc_const_unstable(feature = "const_default_impls", issue = "87864")] impl<T> const Default for Vec<T> { /// Creates an empty `Vec<T>`. + /// + /// The vector will not allocate until elements are pushed onto it. fn default() -> Vec<T> { Vec::new() } @@ -3227,12 +3462,15 @@ impl<T, const N: usize> From<[T; N]> for Vec<T> { /// ``` #[cfg(not(test))] fn from(s: [T; N]) -> Vec<T> { - <[T]>::into_vec(box s) + <[T]>::into_vec( + #[rustc_box] + Box::new(s), + ) } #[cfg(test)] fn from(s: [T; N]) -> Vec<T> { - crate::slice::into_vec(box s) + crate::slice::into_vec(Box::new(s)) } } @@ -3261,7 +3499,7 @@ where } } -// note: test pulls in libstd, which causes errors here +// note: test pulls in std, which causes errors here #[cfg(not(test))] #[stable(feature = "vec_from_box", since = "1.18.0")] impl<T, A: Allocator> From<Box<[T], A>> for Vec<T, A> { @@ -3279,7 +3517,7 @@ impl<T, A: Allocator> From<Box<[T], A>> for Vec<T, A> { } } -// note: test pulls in libstd, which causes errors here +// note: test pulls in std, which causes errors here #[cfg(not(no_global_oom_handling))] #[cfg(not(test))] #[stable(feature = "box_from_vec", since = "1.20.0")] @@ -3294,6 +3532,14 @@ impl<T, A: Allocator> From<Vec<T, A>> for Box<[T], A> { /// ``` /// assert_eq!(Box::from(vec![1, 2, 3]), vec![1, 2, 3].into_boxed_slice()); /// ``` + /// + /// Any excess capacity is removed: + /// ``` + /// let mut vec = Vec::with_capacity(10); + /// vec.extend([1, 2, 3]); + /// + /// assert_eq!(Box::from(vec), vec![1, 2, 3].into_boxed_slice()); + /// ``` fn from(v: Vec<T, A>) -> Self { v.into_boxed_slice() } diff --git a/rust/alloc/vec/set_len_on_drop.rs b/rust/alloc/vec/set_len_on_drop.rs index 448bf5076a0b..d3c7297b80ec 100644 --- a/rust/alloc/vec/set_len_on_drop.rs +++ b/rust/alloc/vec/set_len_on_drop.rs @@ -20,6 +20,11 @@ impl<'a> SetLenOnDrop<'a> { pub(super) fn increment_len(&mut self, increment: usize) { self.local_len += increment; } + + #[inline] + pub(super) fn current_len(&self) -> usize { + self.local_len + } } impl Drop for SetLenOnDrop<'_> { diff --git a/rust/alloc/vec/spec_extend.rs b/rust/alloc/vec/spec_extend.rs index 5ce2d00991bc..a6a735201e59 100644 --- a/rust/alloc/vec/spec_extend.rs +++ b/rust/alloc/vec/spec_extend.rs @@ -1,12 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT use crate::alloc::Allocator; -use crate::collections::{TryReserveError, TryReserveErrorKind}; +use crate::collections::TryReserveError; use core::iter::TrustedLen; -use core::ptr::{self}; use core::slice::{self}; -use super::{IntoIter, SetLenOnDrop, Vec}; +use super::{IntoIter, Vec}; // Specialization trait used for Vec::extend #[cfg(not(no_global_oom_handling))] @@ -44,36 +43,7 @@ where I: TrustedLen<Item = T>, { default fn spec_extend(&mut self, iterator: I) { - // This is the case for a TrustedLen iterator. - let (low, high) = iterator.size_hint(); - if let Some(additional) = high { - debug_assert_eq!( - low, - additional, - "TrustedLen iterator's size hint is not exact: {:?}", - (low, high) - ); - self.reserve(additional); - unsafe { - let mut ptr = self.as_mut_ptr().add(self.len()); - let mut local_len = SetLenOnDrop::new(&mut self.len); - iterator.for_each(move |element| { - ptr::write(ptr, element); - ptr = ptr.offset(1); - // Since the loop executes user code which can panic we have to bump the pointer - // after each step. - // NB can't overflow since we would have had to alloc the address space - local_len.increment_len(1); - }); - } - } else { - // Per TrustedLen contract a `None` upper bound means that the iterator length - // truly exceeds usize::MAX, which would eventually lead to a capacity overflow anyway. - // Since the other branch already panics eagerly (via `reserve()`) we do the same here. - // This avoids additional codegen for a fallback code path which would eventually - // panic anyway. - panic!("capacity overflow"); - } + self.extend_trusted(iterator) } } @@ -82,32 +52,7 @@ where I: TrustedLen<Item = T>, { default fn try_spec_extend(&mut self, iterator: I) -> Result<(), TryReserveError> { - // This is the case for a TrustedLen iterator. - let (low, high) = iterator.size_hint(); - if let Some(additional) = high { - debug_assert_eq!( - low, - additional, - "TrustedLen iterator's size hint is not exact: {:?}", - (low, high) - ); - self.try_reserve(additional)?; - unsafe { - let mut ptr = self.as_mut_ptr().add(self.len()); - let mut local_len = SetLenOnDrop::new(&mut self.len); - iterator.for_each(move |element| { - ptr::write(ptr, element); - ptr = ptr.offset(1); - // Since the loop executes user code which can panic we have to bump the pointer - // after each step. - // NB can't overflow since we would have had to alloc the address space - local_len.increment_len(1); - }); - } - Ok(()) - } else { - Err(TryReserveErrorKind::CapacityOverflow.into()) - } + self.try_extend_trusted(iterator) } } diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 50e7a76d5455..3e601ce2548d 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -6,6 +6,7 @@ * Sorted alphabetically. */ +#include <linux/errname.h> #include <linux/slab.h> #include <linux/refcount.h> #include <linux/wait.h> diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs index 7b246454e009..9bcbea04dac3 100644 --- a/rust/bindings/lib.rs +++ b/rust/bindings/lib.rs @@ -9,7 +9,6 @@ //! using this crate. #![no_std] -#![feature(core_ffi_c)] // See <https://github.com/rust-lang/rust-bindgen/issues/1651>. #![cfg_attr(test, allow(deref_nullptr))] #![cfg_attr(test, allow(unaligned_references))] diff --git a/rust/helpers.c b/rust/helpers.c index 81e80261d597..bb594da56137 100644 --- a/rust/helpers.c +++ b/rust/helpers.c @@ -21,6 +21,7 @@ #include <linux/bug.h> #include <linux/build_bug.h> #include <linux/err.h> +#include <linux/errname.h> #include <linux/refcount.h> #include <linux/mutex.h> #include <linux/spinlock.h> @@ -110,6 +111,12 @@ long rust_helper_PTR_ERR(__force const void *ptr) } EXPORT_SYMBOL_GPL(rust_helper_PTR_ERR); +const char *rust_helper_errname(int err) +{ + return errname(err); +} +EXPORT_SYMBOL_GPL(rust_helper_errname); + struct task_struct *rust_helper_get_current(void) { return current; diff --git a/rust/kernel/build_assert.rs b/rust/kernel/build_assert.rs index 659542393c09..9e37120bc69c 100644 --- a/rust/kernel/build_assert.rs +++ b/rust/kernel/build_assert.rs @@ -67,6 +67,8 @@ macro_rules! build_error { /// assert!(n > 1); // Run-time check /// } /// ``` +/// +/// [`static_assert!`]: crate::static_assert! #[macro_export] macro_rules! build_assert { ($cond:expr $(,)?) => {{ diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs index 5f4114b30b94..05fcab6abfe6 100644 --- a/rust/kernel/error.rs +++ b/rust/kernel/error.rs @@ -4,16 +4,20 @@ //! //! C header: [`include/uapi/asm-generic/errno-base.h`](../../../include/uapi/asm-generic/errno-base.h) +use crate::str::CStr; + use alloc::{ alloc::{AllocError, LayoutError}, collections::TryReserveError, }; use core::convert::From; +use core::fmt; use core::num::TryFromIntError; use core::str::Utf8Error; /// Contains the C-compatible error codes. +#[rustfmt::skip] pub mod code { macro_rules! declare_err { ($err:tt $(,)? $($doc:expr),+) => { @@ -58,6 +62,25 @@ pub mod code { declare_err!(EPIPE, "Broken pipe."); declare_err!(EDOM, "Math argument out of domain of func."); declare_err!(ERANGE, "Math result not representable."); + declare_err!(ERESTARTSYS, "Restart the system call."); + declare_err!(ERESTARTNOINTR, "System call was interrupted by a signal and will be restarted."); + declare_err!(ERESTARTNOHAND, "Restart if no handler."); + declare_err!(ENOIOCTLCMD, "No ioctl command."); + declare_err!(ERESTART_RESTARTBLOCK, "Restart by calling sys_restart_syscall."); + declare_err!(EPROBE_DEFER, "Driver requests probe retry."); + declare_err!(EOPENSTALE, "Open found a stale dentry."); + declare_err!(ENOPARAM, "Parameter not supported."); + declare_err!(EBADHANDLE, "Illegal NFS file handle."); + declare_err!(ENOTSYNC, "Update synchronization mismatch."); + declare_err!(EBADCOOKIE, "Cookie is stale."); + declare_err!(ENOTSUPP, "Operation is not supported."); + declare_err!(ETOOSMALL, "Buffer or request is too small."); + declare_err!(ESERVERFAULT, "An untranslatable error occurred."); + declare_err!(EBADTYPE, "Type not supported by server."); + declare_err!(EJUKEBOX, "Request initiated, but will not complete before timeout."); + declare_err!(EIOCBQUEUED, "iocb queued, will get completion event."); + declare_err!(ERECALLCONFLICT, "Conflict with recalled state."); + declare_err!(ENOGRACE, "NFS file lock reclaim refused."); } /// Generic integer kernel error. @@ -113,6 +136,42 @@ impl Error { // SAFETY: self.0 is a valid error due to its invariant. unsafe { bindings::ERR_PTR(self.0.into()) as *mut _ } } + + /// Returns a string representing the error, if one exists. + #[cfg(not(testlib))] + pub fn name(&self) -> Option<&'static CStr> { + // SAFETY: Just an FFI call, there are no extra safety requirements. + let ptr = unsafe { bindings::errname(-self.0) }; + if ptr.is_null() { + None + } else { + // SAFETY: The string returned by `errname` is static and `NUL`-terminated. + Some(unsafe { CStr::from_char_ptr(ptr) }) + } + } + + /// Returns a string representing the error, if one exists. + /// + /// When `testlib` is configured, this always returns `None` to avoid the dependency on a + /// kernel function so that tests that use this (e.g., by calling [`Result::unwrap`]) can still + /// run in userspace. + #[cfg(testlib)] + pub fn name(&self) -> Option<&'static CStr> { + None + } +} + +impl fmt::Debug for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.name() { + // Print out number if no name can be found. + None => f.debug_tuple("Error").field(&-self.0).finish(), + // SAFETY: These strings are ASCII-only. + Some(name) => f + .debug_tuple(unsafe { core::str::from_utf8_unchecked(name) }) + .finish(), + } + } } impl From<AllocError> for Error { @@ -177,7 +236,7 @@ impl From<core::convert::Infallible> for Error { /// Note that even if a function does not return anything when it succeeds, /// it should still be modeled as returning a `Result` rather than /// just an [`Error`]. -pub type Result<T = ()> = core::result::Result<T, Error>; +pub type Result<T = (), E = Error> = core::result::Result<T, E>; /// Converts an integer as returned by a C kernel function to an error if it's negative, and /// `Ok(())` otherwise. diff --git a/rust/kernel/init.rs b/rust/kernel/init.rs index 4ebfb08dab11..b4332a4ec1f4 100644 --- a/rust/kernel/init.rs +++ b/rust/kernel/init.rs @@ -197,6 +197,7 @@ //! [`Opaque`]: kernel::types::Opaque //! [`Opaque::ffi_init`]: kernel::types::Opaque::ffi_init //! [`pin_data`]: ::macros::pin_data +//! [`pin_init!`]: crate::pin_init! use crate::{ error::{self, Error}, @@ -255,6 +256,8 @@ pub mod macros; /// A normal `let` binding with optional type annotation. The expression is expected to implement /// [`PinInit`]/[`Init`] with the error type [`Infallible`]. If you want to use a different error /// type, then use [`stack_try_pin_init!`]. +/// +/// [`stack_try_pin_init!`]: crate::stack_try_pin_init! #[macro_export] macro_rules! stack_pin_init { (let $var:ident $(: $t:ty)? = $val:expr) => { @@ -804,6 +807,8 @@ macro_rules! try_pin_init { /// /// This initializer is for initializing data in-place that might later be moved. If you want to /// pin-initialize, use [`pin_init!`]. +/// +/// [`try_init!`]: crate::try_init! // For a detailed example of how this macro works, see the module documentation of the hidden // module `__internal` inside of `init/__internal.rs`. #[macro_export] diff --git a/rust/kernel/init/macros.rs b/rust/kernel/init/macros.rs index 541cfad1d8be..00aa4e956c0a 100644 --- a/rust/kernel/init/macros.rs +++ b/rust/kernel/init/macros.rs @@ -16,8 +16,9 @@ //! //! We will look at the following example: //! -//! ```rust +//! ```rust,ignore //! # use kernel::init::*; +//! # use core::pin::Pin; //! #[pin_data] //! #[repr(C)] //! struct Bar<T> { @@ -71,11 +72,12 @@ //! //! Here is the definition of `Bar` from our example: //! -//! ```rust +//! ```rust,ignore //! # use kernel::init::*; //! #[pin_data] //! #[repr(C)] //! struct Bar<T> { +//! #[pin] //! t: T, //! pub x: usize, //! } @@ -83,7 +85,7 @@ //! //! This expands to the following code: //! -//! ```rust +//! ```rust,ignore //! // Firstly the normal definition of the struct, attributes are preserved: //! #[repr(C)] //! struct Bar<T> { @@ -116,20 +118,22 @@ //! unsafe fn t<E>( //! self, //! slot: *mut T, -//! init: impl ::kernel::init::Init<T, E>, +//! // Since `t` is `#[pin]`, this is `PinInit`. +//! init: impl ::kernel::init::PinInit<T, E>, //! ) -> ::core::result::Result<(), E> { -//! unsafe { ::kernel::init::Init::__init(init, slot) } +//! unsafe { ::kernel::init::PinInit::__pinned_init(init, slot) } //! } //! pub unsafe fn x<E>( //! self, //! slot: *mut usize, +//! // Since `x` is not `#[pin]`, this is `Init`. //! init: impl ::kernel::init::Init<usize, E>, //! ) -> ::core::result::Result<(), E> { //! unsafe { ::kernel::init::Init::__init(init, slot) } //! } //! } //! // Implement the internal `HasPinData` trait that associates `Bar` with the pin-data struct -//! // that we constructed beforehand. +//! // that we constructed above. //! unsafe impl<T> ::kernel::init::__internal::HasPinData for Bar<T> { //! type PinData = __ThePinData<T>; //! unsafe fn __pin_data() -> Self::PinData { @@ -160,6 +164,8 @@ //! struct __Unpin<'__pin, T> { //! __phantom_pin: ::core::marker::PhantomData<fn(&'__pin ()) -> &'__pin ()>, //! __phantom: ::core::marker::PhantomData<fn(Bar<T>) -> Bar<T>>, +//! // Our only `#[pin]` field is `t`. +//! t: T, //! } //! #[doc(hidden)] //! impl<'__pin, T> @@ -193,7 +199,7 @@ //! //! Here is the impl on `Bar` defining the new function: //! -//! ```rust +//! ```rust,ignore //! impl<T> Bar<T> { //! fn new(t: T) -> impl PinInit<Self> { //! pin_init!(Self { t, x: 0 }) @@ -203,7 +209,7 @@ //! //! This expands to the following code: //! -//! ```rust +//! ```rust,ignore //! impl<T> Bar<T> { //! fn new(t: T) -> impl PinInit<Self> { //! { @@ -232,25 +238,31 @@ //! // that will refer to this struct instead of the one defined above. //! struct __InitOk; //! // This is the expansion of `t,`, which is syntactic sugar for `t: t,`. -//! unsafe { ::core::ptr::write(&raw mut (*slot).t, t) }; +//! unsafe { ::core::ptr::write(::core::addr_of_mut!((*slot).t), t) }; //! // Since initialization could fail later (not in this case, since the error -//! // type is `Infallible`) we will need to drop this field if it fails. This -//! // `DropGuard` will drop the field when it gets dropped and has not yet -//! // been forgotten. We make a reference to it, so users cannot `mem::forget` -//! // it from the initializer, since the name is the same as the field. +//! // type is `Infallible`) we will need to drop this field if there is an +//! // error later. This `DropGuard` will drop the field when it gets dropped +//! // and has not yet been forgotten. We make a reference to it, so users +//! // cannot `mem::forget` it from the initializer, since the name is the same +//! // as the field (including hygiene). //! let t = &unsafe { -//! ::kernel::init::__internal::DropGuard::new(&raw mut (*slot).t) +//! ::kernel::init::__internal::DropGuard::new( +//! ::core::addr_of_mut!((*slot).t), +//! ) //! }; //! // Expansion of `x: 0,`: //! // Since this can be an arbitrary expression we cannot place it inside of //! // the `unsafe` block, so we bind it here. //! let x = 0; -//! unsafe { ::core::ptr::write(&raw mut (*slot).x, x) }; +//! unsafe { ::core::ptr::write(::core::addr_of_mut!((*slot).x), x) }; +//! // We again create a `DropGuard`. //! let x = &unsafe { -//! ::kernel::init::__internal::DropGuard::new(&raw mut (*slot).x) +//! ::kernel::init::__internal::DropGuard::new( +//! ::core::addr_of_mut!((*slot).x), +//! ) //! }; //! -//! // Here we use the type checker to ensuer that every field has been +//! // Here we use the type checker to ensure that every field has been //! // initialized exactly once, since this is `if false` it will never get //! // executed, but still type-checked. //! // Additionally we abuse `slot` to automatically infer the correct type for @@ -272,7 +284,7 @@ //! }; //! } //! // Since initialization has successfully completed, we can now forget the -//! // guards. +//! // guards. This is not `mem::forget`, since we only have `&DropGuard`. //! unsafe { ::kernel::init::__internal::DropGuard::forget(t) }; //! unsafe { ::kernel::init::__internal::DropGuard::forget(x) }; //! } @@ -280,7 +292,7 @@ //! // `__InitOk` that we need to return. //! Ok(__InitOk) //! }); -//! // Change the return type of the closure. +//! // Change the return type from `__InitOk` to `()`. //! let init = move |slot| -> ::core::result::Result<(), ::core::convert::Infallible> { //! init(slot).map(|__InitOk| ()) //! }; @@ -299,7 +311,7 @@ //! Since we already took a look at `#[pin_data]` on `Bar`, this section will only explain the //! differences/new things in the expansion of the `Foo` definition: //! -//! ```rust +//! ```rust,ignore //! #[pin_data(PinnedDrop)] //! struct Foo { //! a: usize, @@ -310,7 +322,7 @@ //! //! This expands to the following code: //! -//! ```rust +//! ```rust,ignore //! struct Foo { //! a: usize, //! b: Bar<u32>, @@ -330,8 +342,6 @@ //! unsafe fn b<E>( //! self, //! slot: *mut Bar<u32>, -//! // Note that this is `PinInit` instead of `Init`, this is because `b` is -//! // structurally pinned, as marked by the `#[pin]` attribute. //! init: impl ::kernel::init::PinInit<Bar<u32>, E>, //! ) -> ::core::result::Result<(), E> { //! unsafe { ::kernel::init::PinInit::__pinned_init(init, slot) } @@ -359,14 +369,13 @@ //! struct __Unpin<'__pin> { //! __phantom_pin: ::core::marker::PhantomData<fn(&'__pin ()) -> &'__pin ()>, //! __phantom: ::core::marker::PhantomData<fn(Foo) -> Foo>, -//! // Since this field is `#[pin]`, it is listed here. //! b: Bar<u32>, //! } //! #[doc(hidden)] //! impl<'__pin> ::core::marker::Unpin for Foo where __Unpin<'__pin>: ::core::marker::Unpin {} //! // Since we specified `PinnedDrop` as the argument to `#[pin_data]`, we expect `Foo` to //! // implement `PinnedDrop`. Thus we do not need to prevent `Drop` implementations like -//! // before, instead we implement it here and delegate to `PinnedDrop`. +//! // before, instead we implement `Drop` here and delegate to `PinnedDrop`. //! impl ::core::ops::Drop for Foo { //! fn drop(&mut self) { //! // Since we are getting dropped, no one else has a reference to `self` and thus we @@ -388,7 +397,7 @@ //! //! Here is the `PinnedDrop` impl for `Foo`: //! -//! ```rust +//! ```rust,ignore //! #[pinned_drop] //! impl PinnedDrop for Foo { //! fn drop(self: Pin<&mut Self>) { @@ -399,7 +408,7 @@ //! //! This expands to the following code: //! -//! ```rust +//! ```rust,ignore //! // `unsafe`, full path and the token parameter are added, everything else stays the same. //! unsafe impl ::kernel::init::PinnedDrop for Foo { //! fn drop(self: Pin<&mut Self>, _: ::kernel::init::__internal::OnlyCallFromDrop) { @@ -410,10 +419,10 @@ //! //! ## `pin_init!` on `Foo` //! -//! Since we already took a look at `pin_init!` on `Bar`, this section will only explain the -//! differences/new things in the expansion of `pin_init!` on `Foo`: +//! Since we already took a look at `pin_init!` on `Bar`, this section will only show the expansion +//! of `pin_init!` on `Foo`: //! -//! ```rust +//! ```rust,ignore //! let a = 42; //! let initializer = pin_init!(Foo { //! a, @@ -423,7 +432,7 @@ //! //! This expands to the following code: //! -//! ```rust +//! ```rust,ignore //! let a = 42; //! let initializer = { //! struct __InitOk; @@ -438,13 +447,15 @@ //! >(data, move |slot| { //! { //! struct __InitOk; -//! unsafe { ::core::ptr::write(&raw mut (*slot).a, a) }; -//! let a = &unsafe { ::kernel::init::__internal::DropGuard::new(&raw mut (*slot).a) }; +//! unsafe { ::core::ptr::write(::core::addr_of_mut!((*slot).a), a) }; +//! let a = &unsafe { +//! ::kernel::init::__internal::DropGuard::new(::core::addr_of_mut!((*slot).a)) +//! }; //! let b = Bar::new(36); -//! // Here we use `data` to access the correct field and require that `b` is of type -//! // `PinInit<Bar<u32>, Infallible>`. -//! unsafe { data.b(&raw mut (*slot).b, b)? }; -//! let b = &unsafe { ::kernel::init::__internal::DropGuard::new(&raw mut (*slot).b) }; +//! unsafe { data.b(::core::addr_of_mut!((*slot).b), b)? }; +//! let b = &unsafe { +//! ::kernel::init::__internal::DropGuard::new(::core::addr_of_mut!((*slot).b)) +//! }; //! //! #[allow(unreachable_code, clippy::diverging_sub_expression)] //! if false { diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 676995d4e460..85b261209977 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -14,12 +14,8 @@ #![no_std] #![feature(allocator_api)] #![feature(coerce_unsized)] -#![feature(core_ffi_c)] #![feature(dispatch_from_dyn)] -#![feature(explicit_generic_args_with_impl_trait)] -#![feature(generic_associated_types)] #![feature(new_uninit)] -#![feature(pin_macro)] #![feature(receiver_trait)] #![feature(unsize)] diff --git a/rust/kernel/std_vendor.rs b/rust/kernel/std_vendor.rs index b3e68b24a8c6..388d6a5147a2 100644 --- a/rust/kernel/std_vendor.rs +++ b/rust/kernel/std_vendor.rs @@ -137,6 +137,8 @@ /// [`std::dbg`]: https://doc.rust-lang.org/std/macro.dbg.html /// [`eprintln`]: https://doc.rust-lang.org/std/macro.eprintln.html /// [`printk`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html +/// [`pr_info`]: crate::pr_info! +/// [`pr_debug`]: crate::pr_debug! #[macro_export] macro_rules! dbg { // NOTE: We cannot use `concat!` to make a static string as a format argument diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index cd3d2a6cf1fc..c9dd3bf59e34 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -2,6 +2,7 @@ //! String representations. +use alloc::alloc::AllocError; use alloc::vec::Vec; use core::fmt::{self, Write}; use core::ops::{self, Deref, Index}; @@ -199,6 +200,12 @@ impl CStr { pub unsafe fn as_str_unchecked(&self) -> &str { unsafe { core::str::from_utf8_unchecked(self.as_bytes()) } } + + /// Convert this [`CStr`] into a [`CString`] by allocating memory and + /// copying over the string data. + pub fn to_cstring(&self) -> Result<CString, AllocError> { + CString::try_from(self) + } } impl fmt::Display for CStr { @@ -584,6 +591,21 @@ impl Deref for CString { } } +impl<'a> TryFrom<&'a CStr> for CString { + type Error = AllocError; + + fn try_from(cstr: &'a CStr) -> Result<CString, AllocError> { + let mut buf = Vec::new(); + + buf.try_extend_from_slice(cstr.as_bytes_with_nul()) + .map_err(|_| AllocError)?; + + // INVARIANT: The `CStr` and `CString` types have the same invariants for + // the string data, and we copied it over without changes. + Ok(CString { buf }) + } +} + /// A convenience alias for [`core::format_args`]. #[macro_export] macro_rules! fmt { diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs index e6d206242465..a89843cacaad 100644 --- a/rust/kernel/sync/arc.rs +++ b/rust/kernel/sync/arc.rs @@ -146,13 +146,15 @@ impl<T: ?Sized + Unsize<U>, U: ?Sized> core::ops::DispatchFromDyn<Arc<U>> for Ar // SAFETY: It is safe to send `Arc<T>` to another thread when the underlying `T` is `Sync` because // it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, it needs -// `T` to be `Send` because any thread that has an `Arc<T>` may ultimately access `T` directly, for -// example, when the reference count reaches zero and `T` is dropped. +// `T` to be `Send` because any thread that has an `Arc<T>` may ultimately access `T` using a +// mutable reference when the reference count reaches zero and `T` is dropped. unsafe impl<T: ?Sized + Sync + Send> Send for Arc<T> {} -// SAFETY: It is safe to send `&Arc<T>` to another thread when the underlying `T` is `Sync` for the -// same reason as above. `T` needs to be `Send` as well because a thread can clone an `&Arc<T>` -// into an `Arc<T>`, which may lead to `T` being accessed by the same reasoning as above. +// SAFETY: It is safe to send `&Arc<T>` to another thread when the underlying `T` is `Sync` +// because it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, +// it needs `T` to be `Send` because any thread that has a `&Arc<T>` may clone it and get an +// `Arc<T>` on that thread, so the thread may ultimately access `T` using a mutable reference when +// the reference count reaches zero and `T` is dropped. unsafe impl<T: ?Sized + Sync + Send> Sync for Arc<T> {} impl<T> Arc<T> { @@ -185,7 +187,7 @@ impl<T> Arc<T> { /// Use the given initializer to in-place initialize a `T`. /// - /// This is equivalent to [`pin_init`], since an [`Arc`] is always pinned. + /// This is equivalent to [`Arc<T>::pin_init`], since an [`Arc`] is always pinned. #[inline] pub fn init<E>(init: impl Init<T, E>) -> error::Result<Self> where @@ -221,6 +223,11 @@ impl<T: ?Sized> Arc<T> { // reference can be created. unsafe { ArcBorrow::new(self.ptr) } } + + /// Compare whether two [`Arc`] pointers reference the same underlying object. + pub fn ptr_eq(this: &Self, other: &Self) -> bool { + core::ptr::eq(this.ptr.as_ptr(), other.ptr.as_ptr()) + } } impl<T: 'static> ForeignOwnable for Arc<T> { @@ -259,6 +266,12 @@ impl<T: ?Sized> Deref for Arc<T> { } } +impl<T: ?Sized> AsRef<T> for Arc<T> { + fn as_ref(&self) -> &T { + self.deref() + } +} + impl<T: ?Sized> Clone for Arc<T> { fn clone(&self) -> Self { // INVARIANT: C `refcount_inc` saturates the refcount, so it cannot overflow to zero. diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 526d29a0ae27..7eda15e5f1b3 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -64,8 +64,14 @@ macro_rules! current { #[repr(transparent)] pub struct Task(pub(crate) Opaque<bindings::task_struct>); -// SAFETY: It's OK to access `Task` through references from other threads because we're either -// accessing properties that don't change (e.g., `pid`, `group_leader`) or that are properly +// SAFETY: By design, the only way to access a `Task` is via the `current` function or via an +// `ARef<Task>` obtained through the `AlwaysRefCounted` impl. This means that the only situation in +// which a `Task` can be accessed mutably is when the refcount drops to zero and the destructor +// runs. It is safe for that to happen on any thread, so it is ok for this type to be `Send`. +unsafe impl Send for Task {} + +// SAFETY: It's OK to access `Task` through shared references from other threads because we're +// either accessing properties that don't change (e.g., `pid`, `group_leader`) or that are properly // synchronised by C code (e.g., `signal_pending`). unsafe impl Sync for Task {} diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs index 29db59d6119a..1e5380b16ed5 100644 --- a/rust/kernel/types.rs +++ b/rust/kernel/types.rs @@ -321,6 +321,19 @@ pub struct ARef<T: AlwaysRefCounted> { _p: PhantomData<T>, } +// SAFETY: It is safe to send `ARef<T>` to another thread when the underlying `T` is `Sync` because +// it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, it needs +// `T` to be `Send` because any thread that has an `ARef<T>` may ultimately access `T` using a +// mutable reference, for example, when the reference count reaches zero and `T` is dropped. +unsafe impl<T: AlwaysRefCounted + Sync + Send> Send for ARef<T> {} + +// SAFETY: It is safe to send `&ARef<T>` to another thread when the underlying `T` is `Sync` +// because it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, +// it needs `T` to be `Send` because any thread that has a `&ARef<T>` may clone it and get an +// `ARef<T>` on that thread, so the thread may ultimately access `T` using a mutable reference, for +// example, when the reference count reaches zero and `T` is dropped. +unsafe impl<T: AlwaysRefCounted + Sync + Send> Sync for ARef<T> {} + impl<T: AlwaysRefCounted> ARef<T> { /// Creates a new instance of [`ARef`]. /// diff --git a/rust/macros/helpers.rs b/rust/macros/helpers.rs index b2bdd4d8c958..afb0f2e3a36a 100644 --- a/rust/macros/helpers.rs +++ b/rust/macros/helpers.rs @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -use proc_macro::{token_stream, Group, TokenTree}; +use proc_macro::{token_stream, Group, Punct, Spacing, TokenStream, TokenTree}; pub(crate) fn try_ident(it: &mut token_stream::IntoIter) -> Option<String> { if let Some(TokenTree::Ident(ident)) = it.next() { @@ -69,3 +69,87 @@ pub(crate) fn expect_end(it: &mut token_stream::IntoIter) { panic!("Expected end"); } } + +pub(crate) struct Generics { + pub(crate) impl_generics: Vec<TokenTree>, + pub(crate) ty_generics: Vec<TokenTree>, +} + +/// Parses the given `TokenStream` into `Generics` and the rest. +/// +/// The generics are not present in the rest, but a where clause might remain. +pub(crate) fn parse_generics(input: TokenStream) -> (Generics, Vec<TokenTree>) { + // `impl_generics`, the declared generics with their bounds. + let mut impl_generics = vec![]; + // Only the names of the generics, without any bounds. + let mut ty_generics = vec![]; + // Tokens not related to the generics e.g. the `where` token and definition. + let mut rest = vec![]; + // The current level of `<`. + let mut nesting = 0; + let mut toks = input.into_iter(); + // If we are at the beginning of a generic parameter. + let mut at_start = true; + for tt in &mut toks { + match tt.clone() { + TokenTree::Punct(p) if p.as_char() == '<' => { + if nesting >= 1 { + // This is inside of the generics and part of some bound. + impl_generics.push(tt); + } + nesting += 1; + } + TokenTree::Punct(p) if p.as_char() == '>' => { + // This is a parsing error, so we just end it here. + if nesting == 0 { + break; + } else { + nesting -= 1; + if nesting >= 1 { + // We are still inside of the generics and part of some bound. + impl_generics.push(tt); + } + if nesting == 0 { + break; + } + } + } + tt => { + if nesting == 1 { + // Here depending on the token, it might be a generic variable name. + match &tt { + // Ignore const. + TokenTree::Ident(i) if i.to_string() == "const" => {} + TokenTree::Ident(_) if at_start => { + ty_generics.push(tt.clone()); + // We also already push the `,` token, this makes it easier to append + // generics. + ty_generics.push(TokenTree::Punct(Punct::new(',', Spacing::Alone))); + at_start = false; + } + TokenTree::Punct(p) if p.as_char() == ',' => at_start = true, + // Lifetimes begin with `'`. + TokenTree::Punct(p) if p.as_char() == '\'' && at_start => { + ty_generics.push(tt.clone()); + } + _ => {} + } + } + if nesting >= 1 { + impl_generics.push(tt); + } else if nesting == 0 { + // If we haven't entered the generics yet, we still want to keep these tokens. + rest.push(tt); + } + } + } + } + rest.extend(toks); + ( + Generics { + impl_generics, + ty_generics, + }, + rest, + ) +} diff --git a/rust/macros/pin_data.rs b/rust/macros/pin_data.rs index 954149d77181..6d58cfda9872 100644 --- a/rust/macros/pin_data.rs +++ b/rust/macros/pin_data.rs @@ -1,79 +1,127 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT -use proc_macro::{Punct, Spacing, TokenStream, TokenTree}; +use crate::helpers::{parse_generics, Generics}; +use proc_macro::{Group, Punct, Spacing, TokenStream, TokenTree}; pub(crate) fn pin_data(args: TokenStream, input: TokenStream) -> TokenStream { // This proc-macro only does some pre-parsing and then delegates the actual parsing to // `kernel::__pin_data!`. - // - // In here we only collect the generics, since parsing them in declarative macros is very - // elaborate. We also do not need to analyse their structure, we only need to collect them. - // `impl_generics`, the declared generics with their bounds. - let mut impl_generics = vec![]; - // Only the names of the generics, without any bounds. - let mut ty_generics = vec![]; - // Tokens not related to the generics e.g. the `impl` token. - let mut rest = vec![]; - // The current level of `<`. - let mut nesting = 0; - let mut toks = input.into_iter(); - // If we are at the beginning of a generic parameter. - let mut at_start = true; - for tt in &mut toks { - match tt.clone() { - TokenTree::Punct(p) if p.as_char() == '<' => { - if nesting >= 1 { - impl_generics.push(tt); - } - nesting += 1; - } - TokenTree::Punct(p) if p.as_char() == '>' => { - if nesting == 0 { - break; - } else { - nesting -= 1; - if nesting >= 1 { - impl_generics.push(tt); - } - if nesting == 0 { - break; - } + let ( + Generics { + impl_generics, + ty_generics, + }, + rest, + ) = parse_generics(input); + // The struct definition might contain the `Self` type. Since `__pin_data!` will define a new + // type with the same generics and bounds, this poses a problem, since `Self` will refer to the + // new type as opposed to this struct definition. Therefore we have to replace `Self` with the + // concrete name. + + // Errors that occur when replacing `Self` with `struct_name`. + let mut errs = TokenStream::new(); + // The name of the struct with ty_generics. + let struct_name = rest + .iter() + .skip_while(|tt| !matches!(tt, TokenTree::Ident(i) if i.to_string() == "struct")) + .nth(1) + .and_then(|tt| match tt { + TokenTree::Ident(_) => { + let tt = tt.clone(); + let mut res = vec![tt]; + if !ty_generics.is_empty() { + // We add this, so it is maximally compatible with e.g. `Self::CONST` which + // will be replaced by `StructName::<$generics>::CONST`. + res.push(TokenTree::Punct(Punct::new(':', Spacing::Joint))); + res.push(TokenTree::Punct(Punct::new(':', Spacing::Alone))); + res.push(TokenTree::Punct(Punct::new('<', Spacing::Alone))); + res.extend(ty_generics.iter().cloned()); + res.push(TokenTree::Punct(Punct::new('>', Spacing::Alone))); } + Some(res) } - tt => { - if nesting == 1 { - match &tt { - TokenTree::Ident(i) if i.to_string() == "const" => {} - TokenTree::Ident(_) if at_start => { - ty_generics.push(tt.clone()); - ty_generics.push(TokenTree::Punct(Punct::new(',', Spacing::Alone))); - at_start = false; - } - TokenTree::Punct(p) if p.as_char() == ',' => at_start = true, - TokenTree::Punct(p) if p.as_char() == '\'' && at_start => { - ty_generics.push(tt.clone()); - } - _ => {} - } - } - if nesting >= 1 { - impl_generics.push(tt); - } else if nesting == 0 { - rest.push(tt); - } + _ => None, + }) + .unwrap_or_else(|| { + // If we did not find the name of the struct then we will use `Self` as the replacement + // and add a compile error to ensure it does not compile. + errs.extend( + "::core::compile_error!(\"Could not locate type name.\");" + .parse::<TokenStream>() + .unwrap(), + ); + "Self".parse::<TokenStream>().unwrap().into_iter().collect() + }); + let impl_generics = impl_generics + .into_iter() + .flat_map(|tt| replace_self_and_deny_type_defs(&struct_name, tt, &mut errs)) + .collect::<Vec<_>>(); + let mut rest = rest + .into_iter() + .flat_map(|tt| { + // We ignore top level `struct` tokens, since they would emit a compile error. + if matches!(&tt, TokenTree::Ident(i) if i.to_string() == "struct") { + vec![tt] + } else { + replace_self_and_deny_type_defs(&struct_name, tt, &mut errs) } - } - } - rest.extend(toks); + }) + .collect::<Vec<_>>(); // This should be the body of the struct `{...}`. let last = rest.pop(); - quote!(::kernel::__pin_data! { + let mut quoted = quote!(::kernel::__pin_data! { parse_input: @args(#args), @sig(#(#rest)*), @impl_generics(#(#impl_generics)*), @ty_generics(#(#ty_generics)*), @body(#last), - }) + }); + quoted.extend(errs); + quoted +} + +/// Replaces `Self` with `struct_name` and errors on `enum`, `trait`, `struct` `union` and `impl` +/// keywords. +/// +/// The error is appended to `errs` to allow normal parsing to continue. +fn replace_self_and_deny_type_defs( + struct_name: &Vec<TokenTree>, + tt: TokenTree, + errs: &mut TokenStream, +) -> Vec<TokenTree> { + match tt { + TokenTree::Ident(ref i) + if i.to_string() == "enum" + || i.to_string() == "trait" + || i.to_string() == "struct" + || i.to_string() == "union" + || i.to_string() == "impl" => + { + errs.extend( + format!( + "::core::compile_error!(\"Cannot use `{i}` inside of struct definition with \ + `#[pin_data]`.\");" + ) + .parse::<TokenStream>() + .unwrap() + .into_iter() + .map(|mut tok| { + tok.set_span(tt.span()); + tok + }), + ); + vec![tt] + } + TokenTree::Ident(i) if i.to_string() == "Self" => struct_name.clone(), + TokenTree::Literal(_) | TokenTree::Punct(_) | TokenTree::Ident(_) => vec![tt], + TokenTree::Group(g) => vec![TokenTree::Group(Group::new( + g.delimiter(), + g.stream() + .into_iter() + .flat_map(|tt| replace_self_and_deny_type_defs(struct_name, tt, errs)) + .collect(), + ))], + } } diff --git a/rust/macros/quote.rs b/rust/macros/quote.rs index c8e08b3c1e4c..dddbb4e6f4cb 100644 --- a/rust/macros/quote.rs +++ b/rust/macros/quote.rs @@ -39,12 +39,14 @@ impl ToTokens for TokenStream { /// [`quote_spanned!`](https://docs.rs/quote/latest/quote/macro.quote_spanned.html) macro from the /// `quote` crate but provides only just enough functionality needed by the current `macros` crate. macro_rules! quote_spanned { - ($span:expr => $($tt:tt)*) => { - #[allow(clippy::vec_init_then_push)] - { - let mut tokens = ::std::vec::Vec::new(); - let span = $span; - quote_spanned!(@proc tokens span $($tt)*); + ($span:expr => $($tt:tt)*) => {{ + let mut tokens; + #[allow(clippy::vec_init_then_push)] + { + tokens = ::std::vec::Vec::new(); + let span = $span; + quote_spanned!(@proc tokens span $($tt)*); + } ::proc_macro::TokenStream::from_iter(tokens) }}; (@proc $v:ident $span:ident) => {}; diff --git a/rust/uapi/lib.rs b/rust/uapi/lib.rs index 29f69f3a52de..0caad902ba40 100644 --- a/rust/uapi/lib.rs +++ b/rust/uapi/lib.rs @@ -8,7 +8,6 @@ //! userspace APIs. #![no_std] -#![feature(core_ffi_c)] // See <https://github.com/rust-lang/rust-bindgen/issues/1651>. #![cfg_attr(test, allow(deref_nullptr))] #![cfg_attr(test, allow(unaligned_references))] diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 9f94fc83f086..78175231c969 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -277,7 +277,7 @@ $(obj)/%.lst: $(src)/%.c FORCE # Compile Rust sources (.rs) # --------------------------------------------------------------------------- -rust_allowed_features := core_ffi_c,explicit_generic_args_with_impl_trait,new_uninit,pin_macro +rust_allowed_features := new_uninit rust_common_cmd = \ RUST_MODFILE=$(modfile) $(RUSTC_OR_CLIPPY) $(rust_flags) \ diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index 471300ba176c..50a92c4e9984 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -48,12 +48,12 @@ if IS_BUILTIN(CONFIG_COMMON_CLK): LX_GDBPARSED(CLK_GET_RATE_NOCACHE) /* linux/fs.h */ -LX_VALUE(SB_RDONLY) -LX_VALUE(SB_SYNCHRONOUS) -LX_VALUE(SB_MANDLOCK) -LX_VALUE(SB_DIRSYNC) -LX_VALUE(SB_NOATIME) -LX_VALUE(SB_NODIRATIME) +LX_GDBPARSED(SB_RDONLY) +LX_GDBPARSED(SB_SYNCHRONOUS) +LX_GDBPARSED(SB_MANDLOCK) +LX_GDBPARSED(SB_DIRSYNC) +LX_GDBPARSED(SB_NOATIME) +LX_GDBPARSED(SB_NODIRATIME) /* linux/htimer.h */ LX_GDBPARSED(hrtimer_resolution) diff --git a/scripts/gfp-translate b/scripts/gfp-translate index b2ce416d944b..6c9aed17cf56 100755 --- a/scripts/gfp-translate +++ b/scripts/gfp-translate @@ -63,11 +63,11 @@ fi # Extract GFP flags from the kernel source TMPFILE=`mktemp -t gfptranslate-XXXXXX` || exit 1 -grep -q ___GFP $SOURCE/include/linux/gfp.h +grep -q ___GFP $SOURCE/include/linux/gfp_types.h if [ $? -eq 0 ]; then - grep "^#define ___GFP" $SOURCE/include/linux/gfp.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE + grep "^#define ___GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE else - grep "^#define __GFP" $SOURCE/include/linux/gfp.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE + grep "^#define __GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE fi # Parse the flags diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh index 20d483ec6f5f..131be76d2130 100755 --- a/scripts/min-tool-version.sh +++ b/scripts/min-tool-version.sh @@ -27,7 +27,7 @@ llvm) fi ;; rustc) - echo 1.62.0 + echo 1.68.2 ;; bindgen) echo 0.56.0 diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index d4531d09984d..c12150f96b88 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1979,6 +1979,11 @@ static void add_header(struct buffer *b, struct module *mod) buf_printf(b, "#include <linux/vermagic.h>\n"); buf_printf(b, "#include <linux/compiler.h>\n"); buf_printf(b, "\n"); + buf_printf(b, "#ifdef CONFIG_UNWINDER_ORC\n"); + buf_printf(b, "#include <asm/orc_header.h>\n"); + buf_printf(b, "ORC_HEADER;\n"); + buf_printf(b, "#endif\n"); + buf_printf(b, "\n"); buf_printf(b, "BUILD_SALT;\n"); buf_printf(b, "BUILD_LTO_INFO;\n"); buf_printf(b, "\n"); diff --git a/scripts/orc_hash.sh b/scripts/orc_hash.sh new file mode 100644 index 000000000000..466611aa0053 --- /dev/null +++ b/scripts/orc_hash.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (c) Meta Platforms, Inc. and affiliates. + +set -e + +printf '%s' '#define ORC_HASH ' + +awk ' +/^#define ORC_(REG|TYPE)_/ { print } +/^struct orc_entry {$/ { p=1 } +p { print } +/^}/ { p=0 }' | + sha1sum | + cut -d " " -f 1 | + sed 's/\([0-9a-f]\{2\}\)/0x\1,/g' diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index d3662f4acadc..ce541b0ee1d3 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -202,19 +202,19 @@ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, allowed_algos); } -static int ima_get_verity_digest(struct integrity_iint_cache *iint, - struct ima_max_digest_data *hash) +static bool ima_get_verity_digest(struct integrity_iint_cache *iint, + struct ima_max_digest_data *hash) { - enum hash_algo verity_alg; - int ret; + enum hash_algo alg; + int digest_len; /* * On failure, 'measure' policy rules will result in a file data * hash containing 0's. */ - ret = fsverity_get_digest(iint->inode, hash->digest, &verity_alg); - if (ret) - return ret; + digest_len = fsverity_get_digest(iint->inode, hash->digest, NULL, &alg); + if (digest_len == 0) + return false; /* * Unlike in the case of actually calculating the file hash, in @@ -223,9 +223,9 @@ static int ima_get_verity_digest(struct integrity_iint_cache *iint, * mismatch between the verity algorithm and the xattr signature * algorithm, if one exists, will be detected later. */ - hash->hdr.algo = verity_alg; - hash->hdr.length = hash_digest_size[verity_alg]; - return 0; + hash->hdr.algo = alg; + hash->hdr.length = digest_len; + return true; } /* @@ -276,16 +276,9 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, memset(&hash.digest, 0, sizeof(hash.digest)); if (iint->flags & IMA_VERITY_REQUIRED) { - result = ima_get_verity_digest(iint, &hash); - switch (result) { - case 0: - break; - case -ENODATA: + if (!ima_get_verity_digest(iint, &hash)) { audit_cause = "no-verity-digest"; - break; - default: - audit_cause = "invalid-verity-digest"; - break; + result = -ENODATA; } } else if (buf) { result = ima_calc_buffer_hash(buf, size, &hash.hdr); diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 308ec7034cc9..dabfdecece26 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9527,6 +9527,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1427, "Asus Zenbook UX31E", ALC269VB_FIXUP_ASUS_ZENBOOK), SND_PCI_QUIRK(0x1043, 0x1473, "ASUS GU604V", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1483, "ASUS GU603V", ALC285_FIXUP_ASUS_HEADSET_MIC), + SND_PCI_QUIRK(0x1043, 0x1493, "ASUS GV601V", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1517, "Asus Zenbook UX31A", ALC269VB_FIXUP_ASUS_ZENBOOK_UX31A), SND_PCI_QUIRK(0x1043, 0x1662, "ASUS GV301QH", ALC294_FIXUP_ASUS_DUAL_SPK), SND_PCI_QUIRK(0x1043, 0x1683, "ASUS UM3402YAR", ALC287_FIXUP_CS35L41_I2C_2), @@ -9552,6 +9553,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1c23, "Asus X55U", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1043, 0x1c62, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS), + SND_PCI_QUIRK(0x1043, 0x1caf, "ASUS G634JYR/JZR", ALC285_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x1ccd, "ASUS X555UB", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x1d42, "ASUS Zephyrus G14 2022", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1d4e, "ASUS TM420", ALC256_FIXUP_ASUS_HPE), diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index 6faf4a43eaf5..144f082c63fd 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -1347,7 +1347,7 @@ static int sof_card_dai_links_create(struct device *dev, if ((SDW_PART_ID(adr_link->adr_d[i].adr) != SDW_PART_ID(adr_link->adr_d[j].adr)) || (SDW_MFG_ID(adr_link->adr_d[i].adr) != - SDW_MFG_ID(adr_link->adr_d[i].adr))) { + SDW_MFG_ID(adr_link->adr_d[j].adr))) { append_codec_type = true; goto out; } diff --git a/tools/testing/selftests/bpf/prog_tests/subprogs_extable.c b/tools/testing/selftests/bpf/prog_tests/subprogs_extable.c new file mode 100644 index 000000000000..3afd9f775f68 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/subprogs_extable.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "test_subprogs_extable.skel.h" + +void test_subprogs_extable(void) +{ + const int read_sz = 456; + struct test_subprogs_extable *skel; + int err; + + skel = test_subprogs_extable__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = test_subprogs_extable__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger tracepoint */ + ASSERT_OK(trigger_module_test_read(read_sz), "trigger_read"); + + ASSERT_NEQ(skel->bss->triggered, 0, "verify at least one program ran"); + + test_subprogs_extable__detach(skel); + +cleanup: + test_subprogs_extable__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_subprogs_extable.c b/tools/testing/selftests/bpf/progs/test_subprogs_extable.c new file mode 100644 index 000000000000..e2a21fbd4e44 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_subprogs_extable.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 8); + __type(key, __u32); + __type(value, __u64); +} test_array SEC(".maps"); + +unsigned int triggered; + +static __u64 test_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *data) +{ + return 1; +} + +SEC("fexit/bpf_testmod_return_ptr") +int BPF_PROG(handle_fexit_ret_subprogs, int arg, struct file *ret) +{ + *(volatile long *)ret; + *(volatile int *)&ret->f_mode; + bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); + triggered++; + return 0; +} + +SEC("fexit/bpf_testmod_return_ptr") +int BPF_PROG(handle_fexit_ret_subprogs2, int arg, struct file *ret) +{ + *(volatile long *)ret; + *(volatile int *)&ret->f_mode; + bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); + triggered++; + return 0; +} + +SEC("fexit/bpf_testmod_return_ptr") +int BPF_PROG(handle_fexit_ret_subprogs3, int arg, struct file *ret) +{ + *(volatile long *)ret; + *(volatile int *)&ret->f_mode; + bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); + triggered++; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 136e5530b72c..6115520154e3 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -371,4 +371,83 @@ __naked void and_then_at_fp_8(void) " ::: __clobber_all); } +SEC("xdp") +__description("32-bit spill of 64-bit reg should clear ID") +__failure __msg("math between ctx pointer and 4294967295 is not allowed") +__naked void spill_32bit_of_64bit_fail(void) +{ + asm volatile (" \ + r6 = r1; \ + /* Roll one bit to force the verifier to track both branches. */\ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x8; \ + /* Put a large number into r1. */ \ + r1 = 0xffffffff; \ + r1 <<= 32; \ + r1 += r0; \ + /* Assign an ID to r1. */ \ + r2 = r1; \ + /* 32-bit spill r1 to stack - should clear the ID! */\ + *(u32*)(r10 - 8) = r1; \ + /* 32-bit fill r2 from stack. */ \ + r2 = *(u32*)(r10 - 8); \ + /* Compare r2 with another register to trigger find_equal_scalars.\ + * Having one random bit is important here, otherwise the verifier cuts\ + * the corners. If the ID was mistakenly preserved on spill, this would\ + * cause the verifier to think that r1 is also equal to zero in one of\ + * the branches, and equal to eight on the other branch.\ + */ \ + r3 = 0; \ + if r2 != r3 goto l0_%=; \ +l0_%=: r1 >>= 32; \ + /* At this point, if the verifier thinks that r1 is 0, an out-of-bounds\ + * read will happen, because it actually contains 0xffffffff.\ + */ \ + r6 += r1; \ + r0 = *(u32*)(r6 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("xdp") +__description("16-bit spill of 32-bit reg should clear ID") +__failure __msg("dereference of modified ctx ptr R6 off=65535 disallowed") +__naked void spill_16bit_of_32bit_fail(void) +{ + asm volatile (" \ + r6 = r1; \ + /* Roll one bit to force the verifier to track both branches. */\ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x8; \ + /* Put a large number into r1. */ \ + w1 = 0xffff0000; \ + r1 += r0; \ + /* Assign an ID to r1. */ \ + r2 = r1; \ + /* 16-bit spill r1 to stack - should clear the ID! */\ + *(u16*)(r10 - 8) = r1; \ + /* 16-bit fill r2 from stack. */ \ + r2 = *(u16*)(r10 - 8); \ + /* Compare r2 with another register to trigger find_equal_scalars.\ + * Having one random bit is important here, otherwise the verifier cuts\ + * the corners. If the ID was mistakenly preserved on spill, this would\ + * cause the verifier to think that r1 is also equal to zero in one of\ + * the branches, and equal to eight on the other branch.\ + */ \ + r3 = 0; \ + if r2 != r3 goto l0_%=; \ +l0_%=: r1 >>= 16; \ + /* At this point, if the verifier thinks that r1 is 0, an out-of-bounds\ + * read will happen, because it actually contains 0xffff.\ + */ \ + r6 += r1; \ + r0 = *(u32*)(r6 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index d8bff2005dfc..5fd49ad0c696 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -249,7 +249,7 @@ /** * FIXTURE_SETUP() - Prepares the setup function for the fixture. - * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. + * *_metadata* is included so that EXPECT_*, ASSERT_* etc. work correctly. * * @fixture_name: fixture name * @@ -275,7 +275,7 @@ /** * FIXTURE_TEARDOWN() - * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. + * *_metadata* is included so that EXPECT_*, ASSERT_* etc. work correctly. * * @fixture_name: fixture name * @@ -388,7 +388,7 @@ if (setjmp(_metadata->env) == 0) { \ fixture_name##_setup(_metadata, &self, variant->data); \ /* Let setup failure terminate early. */ \ - if (!_metadata->passed) \ + if (!_metadata->passed || _metadata->skip) \ return; \ _metadata->setup_completed = true; \ fixture_name##_##test_name(_metadata, &self, variant->data); \ diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 23af4633f0f4..4f0c50c33ba7 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -5,12 +5,15 @@ LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h include local_config.mk +ifeq ($(ARCH),) + ifeq ($(CROSS_COMPILE),) uname_M := $(shell uname -m 2>/dev/null || echo not) else uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+') endif -MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +endif # Without this, failed build products remain, with up-to-date timestamps, # thus tricking Make (and you!) into believing that All Is Well, in subsequent @@ -65,7 +68,7 @@ TEST_GEN_PROGS += ksm_tests TEST_GEN_PROGS += ksm_functional_tests TEST_GEN_PROGS += mdwe_test -ifeq ($(MACHINE),x86_64) +ifeq ($(ARCH),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) @@ -87,13 +90,13 @@ TEST_GEN_PROGS += $(BINARIES_64) endif else -ifneq (,$(findstring $(MACHINE),ppc64)) +ifneq (,$(findstring $(ARCH),ppc64)) TEST_GEN_PROGS += protection_keys endif endif -ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) +ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) TEST_GEN_PROGS += va_high_addr_switch TEST_GEN_PROGS += virtual_address_range TEST_GEN_PROGS += write_to_hugetlbfs @@ -112,7 +115,7 @@ $(TEST_GEN_PROGS): vm_util.c $(OUTPUT)/uffd-stress: uffd-common.c $(OUTPUT)/uffd-unit-tests: uffd-common.c -ifeq ($(MACHINE),x86_64) +ifeq ($(ARCH),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 21ca91473c09..ee6880ac3e5e 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -92,6 +92,13 @@ NSC_CMD="ip netns exec ${NSC}" which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) +# Check if FIPS mode is enabled +if [ -f /proc/sys/crypto/fips_enabled ]; then + fips_enabled=`cat /proc/sys/crypto/fips_enabled` +else + fips_enabled=0 +fi + ################################################################################ # utilities @@ -1216,7 +1223,7 @@ ipv4_tcp_novrf() run_cmd nettest -d ${NSA_DEV} -r ${a} log_test_addr ${a} $? 1 "No server, device client, local conn" - ipv4_tcp_md5_novrf + [ "$fips_enabled" = "1" ] || ipv4_tcp_md5_novrf } ipv4_tcp_vrf() @@ -1270,9 +1277,11 @@ ipv4_tcp_vrf() log_test_addr ${a} $? 1 "Global server, local connection" # run MD5 tests - setup_vrf_dup - ipv4_tcp_md5 - cleanup_vrf_dup + if [ "$fips_enabled" = "0" ]; then + setup_vrf_dup + ipv4_tcp_md5 + cleanup_vrf_dup + fi # # enable VRF global server @@ -2772,7 +2781,7 @@ ipv6_tcp_novrf() log_test_addr ${a} $? 1 "No server, device client, local conn" done - ipv6_tcp_md5_novrf + [ "$fips_enabled" = "1" ] || ipv6_tcp_md5_novrf } ipv6_tcp_vrf() @@ -2842,9 +2851,11 @@ ipv6_tcp_vrf() log_test_addr ${a} $? 1 "Global server, local connection" # run MD5 tests - setup_vrf_dup - ipv6_tcp_md5 - cleanup_vrf_dup + if [ "$fips_enabled" = "0" ]; then + setup_vrf_dup + ipv6_tcp_md5 + cleanup_vrf_dup + fi # # enable VRF global server diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh index c5095da7f6bf..aec752a22e9e 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh @@ -93,12 +93,16 @@ cleanup() test_gretap() { + ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap" full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap" } test_ip6gretap() { + ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap" full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap" } diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh index 9ff22f28032d..0cf4c47a46f9 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh @@ -90,12 +90,16 @@ cleanup() test_gretap() { + ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \ + nud permanent dev br1 full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap" full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap" } test_ip6gretap() { + ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \ + nud permanent dev br1 full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap" full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap" } diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index e699548d4247..ff36844d14b4 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -25,6 +25,8 @@ #define TLS_PAYLOAD_MAX_LEN 16384 #define SOL_TLS 282 +static int fips_enabled; + struct tls_crypto_info_keys { union { struct tls12_crypto_info_aes_gcm_128 aes128; @@ -235,7 +237,7 @@ FIXTURE_VARIANT(tls) { uint16_t tls_version; uint16_t cipher_type; - bool nopad; + bool nopad, fips_non_compliant; }; FIXTURE_VARIANT_ADD(tls, 12_aes_gcm) @@ -254,24 +256,28 @@ FIXTURE_VARIANT_ADD(tls, 12_chacha) { .tls_version = TLS_1_2_VERSION, .cipher_type = TLS_CIPHER_CHACHA20_POLY1305, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 13_chacha) { .tls_version = TLS_1_3_VERSION, .cipher_type = TLS_CIPHER_CHACHA20_POLY1305, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 13_sm4_gcm) { .tls_version = TLS_1_3_VERSION, .cipher_type = TLS_CIPHER_SM4_GCM, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 13_sm4_ccm) { .tls_version = TLS_1_3_VERSION, .cipher_type = TLS_CIPHER_SM4_CCM, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 12_aes_ccm) @@ -311,6 +317,9 @@ FIXTURE_SETUP(tls) int one = 1; int ret; + if (fips_enabled && variant->fips_non_compliant) + SKIP(return, "Unsupported cipher in FIPS mode"); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12); @@ -1865,4 +1874,17 @@ TEST(prequeue) { close(cfd); } +static void __attribute__((constructor)) fips_check(void) { + int res; + FILE *f; + + f = fopen("/proc/sys/crypto/fips_enabled", "r"); + if (f) { + res = fscanf(f, "%d", &fips_enabled); + if (res != 1) + ksft_print_msg("ERROR: Couldn't read /proc/sys/crypto/fips_enabled\n"); + fclose(f); + } +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/vrf-xfrm-tests.sh b/tools/testing/selftests/net/vrf-xfrm-tests.sh index 184da81f554f..452638ae8aed 100755 --- a/tools/testing/selftests/net/vrf-xfrm-tests.sh +++ b/tools/testing/selftests/net/vrf-xfrm-tests.sh @@ -264,60 +264,60 @@ setup_xfrm() ip -netns host1 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_4} dst ${h2_4} ${devarg} ip -netns host2 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_4} dst ${h2_4} ip -netns host1 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_4} dst ${h1_4} ${devarg} ip -netns host2 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_4} dst ${h1_4} ip -6 -netns host1 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_6} dst ${h2_6} ${devarg} ip -6 -netns host2 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_6} dst ${h2_6} ip -6 -netns host1 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_6} dst ${h1_6} ${devarg} ip -6 -netns host2 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_6} dst ${h1_6} } diff --git a/tools/testing/selftests/user_events/dyn_test.c b/tools/testing/selftests/user_events/dyn_test.c index 8879a7b04c6a..d6979a48478f 100644 --- a/tools/testing/selftests/user_events/dyn_test.c +++ b/tools/testing/selftests/user_events/dyn_test.c @@ -16,42 +16,140 @@ #include "../kselftest_harness.h" -const char *dyn_file = "/sys/kernel/tracing/dynamic_events"; -const char *clear = "!u:__test_event"; +const char *abi_file = "/sys/kernel/tracing/user_events_data"; +const char *enable_file = "/sys/kernel/tracing/events/user_events/__test_event/enable"; -static int Append(const char *value) +static bool wait_for_delete(void) { - int fd = open(dyn_file, O_RDWR | O_APPEND); - int ret = write(fd, value, strlen(value)); + int i; + + for (i = 0; i < 1000; ++i) { + int fd = open(enable_file, O_RDONLY); + + if (fd == -1) + return true; + + close(fd); + usleep(1000); + } + + return false; +} + +static int reg_event(int fd, int *check, int bit, const char *value) +{ + struct user_reg reg = {0}; + + reg.size = sizeof(reg); + reg.name_args = (__u64)value; + reg.enable_bit = bit; + reg.enable_addr = (__u64)check; + reg.enable_size = sizeof(*check); + + if (ioctl(fd, DIAG_IOCSREG, ®) == -1) + return -1; + + return 0; +} + +static int unreg_event(int fd, int *check, int bit) +{ + struct user_unreg unreg = {0}; + + unreg.size = sizeof(unreg); + unreg.disable_bit = bit; + unreg.disable_addr = (__u64)check; + + return ioctl(fd, DIAG_IOCSUNREG, &unreg); +} + +static int parse(int *check, const char *value) +{ + int fd = open(abi_file, O_RDWR); + int ret; + + if (fd == -1) + return -1; + + /* Until we have persist flags via dynamic events, use the base name */ + if (value[0] != 'u' || value[1] != ':') { + close(fd); + return -1; + } + + ret = reg_event(fd, check, 31, value + 2); + + if (ret != -1) { + if (unreg_event(fd, check, 31) == -1) + printf("WARN: Couldn't unreg event\n"); + } close(fd); + return ret; } -#define CLEAR() \ +static int check_match(int *check, const char *first, const char *second, bool *match) +{ + int fd = open(abi_file, O_RDWR); + int ret = -1; + + if (fd == -1) + return -1; + + if (reg_event(fd, check, 31, first) == -1) + goto cleanup; + + if (reg_event(fd, check, 30, second) == -1) { + if (errno == EADDRINUSE) { + /* Name is in use, with different fields */ + *match = false; + ret = 0; + } + + goto cleanup; + } + + *match = true; + ret = 0; +cleanup: + unreg_event(fd, check, 31); + unreg_event(fd, check, 30); + + close(fd); + + wait_for_delete(); + + return ret; +} + +#define TEST_MATCH(x, y) \ do { \ - int ret = Append(clear); \ - if (ret == -1) \ - ASSERT_EQ(ENOENT, errno); \ + bool match; \ + ASSERT_NE(-1, check_match(&self->check, x, y, &match)); \ + ASSERT_EQ(true, match); \ } while (0) -#define TEST_PARSE(x) \ +#define TEST_NMATCH(x, y) \ do { \ - ASSERT_NE(-1, Append(x)); \ - CLEAR(); \ + bool match; \ + ASSERT_NE(-1, check_match(&self->check, x, y, &match)); \ + ASSERT_EQ(false, match); \ } while (0) -#define TEST_NPARSE(x) ASSERT_EQ(-1, Append(x)) +#define TEST_PARSE(x) ASSERT_NE(-1, parse(&self->check, x)) + +#define TEST_NPARSE(x) ASSERT_EQ(-1, parse(&self->check, x)) FIXTURE(user) { + int check; }; FIXTURE_SETUP(user) { - CLEAR(); } FIXTURE_TEARDOWN(user) { - CLEAR(); + wait_for_delete(); } TEST_F(user, basic_types) { @@ -95,33 +193,30 @@ TEST_F(user, size_types) { TEST_NPARSE("u:__test_event char a 20"); } -TEST_F(user, flags) { - /* Should work */ - TEST_PARSE("u:__test_event:BPF_ITER u32 a"); - /* Forward compat */ - TEST_PARSE("u:__test_event:BPF_ITER,FLAG_FUTURE u32 a"); -} - TEST_F(user, matching) { - /* Register */ - ASSERT_NE(-1, Append("u:__test_event struct custom a 20")); - /* Should not match */ - TEST_NPARSE("!u:__test_event struct custom b"); - /* Should match */ - TEST_PARSE("!u:__test_event struct custom a"); - /* Multi field reg */ - ASSERT_NE(-1, Append("u:__test_event u32 a; u32 b")); - /* Non matching cases */ - TEST_NPARSE("!u:__test_event u32 a"); - TEST_NPARSE("!u:__test_event u32 b"); - TEST_NPARSE("!u:__test_event u32 a; u32 "); - TEST_NPARSE("!u:__test_event u32 a; u32 a"); - /* Matching case */ - TEST_PARSE("!u:__test_event u32 a; u32 b"); - /* Register */ - ASSERT_NE(-1, Append("u:__test_event u32 a; u32 b")); - /* Ensure trailing semi-colon case */ - TEST_PARSE("!u:__test_event u32 a; u32 b;"); + /* Single name matches */ + TEST_MATCH("__test_event u32 a", + "__test_event u32 a"); + + /* Multiple names match */ + TEST_MATCH("__test_event u32 a; u32 b", + "__test_event u32 a; u32 b"); + + /* Multiple names match with dangling ; */ + TEST_MATCH("__test_event u32 a; u32 b", + "__test_event u32 a; u32 b;"); + + /* Single name doesn't match */ + TEST_NMATCH("__test_event u32 a", + "__test_event u32 b"); + + /* Multiple names don't match */ + TEST_NMATCH("__test_event u32 a; u32 b", + "__test_event u32 b; u32 a"); + + /* Types don't match */ + TEST_NMATCH("__test_event u64 a; u64 b", + "__test_event u32 a; u32 b"); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index 7c99cef94a65..eb6904d89f14 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -102,30 +102,56 @@ err: return -1; } +static bool wait_for_delete(void) +{ + int i; + + for (i = 0; i < 1000; ++i) { + int fd = open(enable_file, O_RDONLY); + + if (fd == -1) + return true; + + close(fd); + usleep(1000); + } + + return false; +} + static int clear(int *check) { struct user_unreg unreg = {0}; + int fd; unreg.size = sizeof(unreg); unreg.disable_bit = 31; unreg.disable_addr = (__u64)check; - int fd = open(data_file, O_RDWR); + fd = open(data_file, O_RDWR); if (fd == -1) return -1; if (ioctl(fd, DIAG_IOCSUNREG, &unreg) == -1) if (errno != ENOENT) - return -1; - - if (ioctl(fd, DIAG_IOCSDEL, "__test_event") == -1) - if (errno != ENOENT) - return -1; + goto fail; + + if (ioctl(fd, DIAG_IOCSDEL, "__test_event") == -1) { + if (errno == EBUSY) { + if (!wait_for_delete()) + goto fail; + } else if (errno != ENOENT) + goto fail; + } close(fd); return 0; +fail: + close(fd); + + return -1; } static int check_print_fmt(const char *event, const char *expected, int *check) @@ -155,9 +181,8 @@ static int check_print_fmt(const char *event, const char *expected, int *check) /* Register should work */ ret = ioctl(fd, DIAG_IOCSREG, ®); - close(fd); - if (ret != 0) { + close(fd); printf("Reg failed in fmt\n"); return ret; } @@ -165,6 +190,8 @@ static int check_print_fmt(const char *event, const char *expected, int *check) /* Ensure correct print_fmt */ ret = get_print_fmt(print_fmt, sizeof(print_fmt)); + close(fd); + if (ret != 0) return ret; @@ -228,6 +255,12 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); ASSERT_EQ(0, reg.write_index); + /* Multiple registers to same name but different args should fail */ + reg.enable_bit = 29; + reg.name_args = (__u64)"__test_event u32 field1;"; + ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSREG, ®)); + ASSERT_EQ(EADDRINUSE, errno); + /* Ensure disabled */ self->enable_fd = open(enable_file, O_RDWR); ASSERT_NE(-1, self->enable_fd); @@ -250,10 +283,10 @@ TEST_F(user, register_events) { unreg.disable_bit = 30; ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); - /* Delete should work only after close and unregister */ + /* Delete should have been auto-done after close and unregister */ close(self->data_fd); - self->data_fd = open(data_file, O_RDWR); - ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event")); + + ASSERT_EQ(true, wait_for_delete()); } TEST_F(user, write_events) { @@ -310,6 +343,39 @@ TEST_F(user, write_events) { ASSERT_EQ(EINVAL, errno); } +TEST_F(user, write_empty_events) { + struct user_reg reg = {0}; + struct iovec io[1]; + int before = 0, after = 0; + + reg.size = sizeof(reg); + reg.name_args = (__u64)"__test_event"; + reg.enable_bit = 31; + reg.enable_addr = (__u64)&self->check; + reg.enable_size = sizeof(self->check); + + io[0].iov_base = ®.write_index; + io[0].iov_len = sizeof(reg.write_index); + + /* Register should work */ + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); + ASSERT_EQ(0, reg.write_index); + ASSERT_EQ(0, self->check); + + /* Enable event */ + self->enable_fd = open(enable_file, O_RDWR); + ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) + + /* Event should now be enabled */ + ASSERT_EQ(1 << reg.enable_bit, self->check); + + /* Write should make it out to ftrace buffers */ + before = trace_bytes(); + ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 1)); + after = trace_bytes(); + ASSERT_GT(after, before); +} + TEST_F(user, write_fault) { struct user_reg reg = {0}; struct iovec io[2]; diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index a070258d4449..8b09be566fa2 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -81,6 +81,32 @@ static int get_offset(void) return offset; } +static int clear(int *check) +{ + struct user_unreg unreg = {0}; + + unreg.size = sizeof(unreg); + unreg.disable_bit = 31; + unreg.disable_addr = (__u64)check; + + int fd = open(data_file, O_RDWR); + + if (fd == -1) + return -1; + + if (ioctl(fd, DIAG_IOCSUNREG, &unreg) == -1) + if (errno != ENOENT) + return -1; + + if (ioctl(fd, DIAG_IOCSDEL, "__test_event") == -1) + if (errno != ENOENT) + return -1; + + close(fd); + + return 0; +} + FIXTURE(user) { int data_fd; int check; @@ -93,6 +119,9 @@ FIXTURE_SETUP(user) { FIXTURE_TEARDOWN(user) { close(self->data_fd); + + if (clear(&self->check) != 0) + printf("WARNING: Clear didn't work!\n"); } TEST_F(user, perf_write) { @@ -160,6 +189,59 @@ TEST_F(user, perf_write) { ASSERT_EQ(0, self->check); } +TEST_F(user, perf_empty_events) { + struct perf_event_attr pe = {0}; + struct user_reg reg = {0}; + struct perf_event_mmap_page *perf_page; + int page_size = sysconf(_SC_PAGESIZE); + int id, fd; + __u32 *val; + + reg.size = sizeof(reg); + reg.name_args = (__u64)"__test_event"; + reg.enable_bit = 31; + reg.enable_addr = (__u64)&self->check; + reg.enable_size = sizeof(self->check); + + /* Register should work */ + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); + ASSERT_EQ(0, reg.write_index); + ASSERT_EQ(0, self->check); + + /* Id should be there */ + id = get_id(); + ASSERT_NE(-1, id); + + pe.type = PERF_TYPE_TRACEPOINT; + pe.size = sizeof(pe); + pe.config = id; + pe.sample_type = PERF_SAMPLE_RAW; + pe.sample_period = 1; + pe.wakeup_events = 1; + + /* Tracepoint attach should work */ + fd = perf_event_open(&pe, 0, -1, -1, 0); + ASSERT_NE(-1, fd); + + perf_page = mmap(NULL, page_size * 2, PROT_READ, MAP_SHARED, fd, 0); + ASSERT_NE(MAP_FAILED, perf_page); + + /* Status should be updated */ + ASSERT_EQ(1 << reg.enable_bit, self->check); + + /* Ensure write shows up at correct offset */ + ASSERT_NE(-1, write(self->data_fd, ®.write_index, + sizeof(reg.write_index))); + val = (void *)(((char *)perf_page) + perf_page->data_offset); + ASSERT_EQ(PERF_RECORD_SAMPLE, *val); + + munmap(perf_page, page_size * 2); + close(fd); + + /* Status should be updated */ + ASSERT_EQ(0, self->check); +} + int main(int argc, char **argv) { return test_harness_run(argc, argv); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 479802a892d4..65f94f592ff8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -686,6 +686,24 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn return __kvm_handle_hva_range(kvm, &range); } + +static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + /* + * Skipping invalid memslots is correct if and only change_pte() is + * surrounded by invalidate_range_{start,end}(), which is currently + * guaranteed by the primary MMU. If that ever changes, KVM needs to + * unmap the memslot instead of skipping the memslot to ensure that KVM + * doesn't hold references to the old PFN. + */ + WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); + + if (range->slot->flags & KVM_MEMSLOT_INVALID) + return false; + + return kvm_set_spte_gfn(kvm, range); +} + static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address, @@ -707,7 +725,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, if (!READ_ONCE(kvm->mmu_invalidate_in_progress)) return; - kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); + kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn); } void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, |