899 files changed, 17735 insertions, 8834 deletions
@@ -384,6 +384,7 @@ Li Yang <[email protected]> <[email protected]> Li Yang <[email protected]> <[email protected]> Lior David <[email protected]> <[email protected]> Lorenzo Pieralisi <[email protected]> <[email protected]> +Lorenzo Stoakes <[email protected]> <[email protected]> Luca Ceresoli <[email protected]> <[email protected]> Lukasz Luba <[email protected]> <[email protected]> Luo Jie <[email protected]> <[email protected]> @@ -689,6 +690,7 @@ Vivien Didelot <[email protected]> <[email protected]> Vlad Dogaru <[email protected]> <[email protected]> Vladimir Davydov <[email protected]> <[email protected]> Vladimir Davydov <[email protected]> <[email protected]> +Weiwen Hu <[email protected]> <[email protected]> WeiXiong Liao <[email protected]> <[email protected]> Wen Gong <[email protected]> <[email protected]> Wesley Cheng <[email protected]> <[email protected]> @@ -1214,6 +1214,10 @@ D: UDF filesystem S: (ask for current address) S: USA +N: Larry Finger +D: Maintainer of wireless drivers, too many to list here + N: Jürgen Fischer D: Author of Adaptec AHA-152x SCSI driver @@ -3146,9 +3150,11 @@ S: Triftstra=DFe 55 S: 13353 Berlin S: Germany -N: Gustavo Pimental +N: Gustavo Pimentel D: PCI driver for Synopsys DesignWare +D: Synopsys DesignWare eDMA driver +D: Synopsys DesignWare xData traffic generator N: Emanuel Pirker diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 831f19a32e08..cea8856f798d 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -21,6 +21,59 @@ Description: device is offset from the internal allocation unit's natural alignment. +What: /sys/block/<disk>/atomic_write_max_bytes +Date: February 2024 +Contact: Himanshu Madhani <[email protected]> +Description: + [RO] This parameter specifies the maximum atomic write + size reported by the device. This parameter is relevant + for merging of writes, where a merged atomic write + operation must not exceed this number of bytes. + This parameter may be greater than the value in + atomic_write_unit_max_bytes as + atomic_write_unit_max_bytes will be rounded down to a + power-of-two and atomic_write_unit_max_bytes may also be + limited by some other queue limits, such as max_segments. + This parameter - along with atomic_write_unit_min_bytes + and atomic_write_unit_max_bytes - will not be larger than + max_hw_sectors_kb, but may be larger than max_sectors_kb. + + +What: /sys/block/<disk>/atomic_write_unit_min_bytes +Date: February 2024 +Contact: Himanshu Madhani <[email protected]> +Description: + [RO] This parameter specifies the smallest block which can + be written atomically with an atomic write operation. All + atomic write operations must begin at a + atomic_write_unit_min boundary and must be multiples of + atomic_write_unit_min. This value must be a power-of-two. + + +What: /sys/block/<disk>/atomic_write_unit_max_bytes +Date: February 2024 +Contact: Himanshu Madhani <[email protected]> +Description: + [RO] This parameter defines the largest block which can be + written atomically with an atomic write operation. This + value must be a multiple of atomic_write_unit_min and must + be a power-of-two. This value will not be larger than + atomic_write_max_bytes. + + +What: /sys/block/<disk>/atomic_write_boundary_bytes +Date: February 2024 +Contact: Himanshu Madhani <[email protected]> +Description: + [RO] A device may need to internally split an atomic write I/O + which straddles a given logical block address boundary. 
This + parameter specifies the size in bytes of the atomic boundary if + one is reported by the device. This value must be a + power-of-two and at least the size as in + atomic_write_unit_max_bytes. + Any attempt to merge atomic write I/Os must not result in a + merged I/O which crosses this boundary (if any). + What: /sys/block/<disk>/diskseq Date: February 2021 diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 5750f125361b..728b1e690c64 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -149,9 +149,9 @@ This case is handled by calls to the strongly ordered ``atomic_add_return()`` read-modify-write atomic operation that is invoked within ``rcu_dynticks_eqs_enter()`` at idle-entry time and within ``rcu_dynticks_eqs_exit()`` at idle-exit time. -The grace-period kthread invokes ``rcu_dynticks_snap()`` and -``rcu_dynticks_in_eqs_since()`` (both of which invoke -an ``atomic_add_return()`` of zero) to detect idle CPUs. +The grace-period kthread invokes first ``ct_dynticks_cpu_acquire()`` +(preceded by a full memory barrier) and ``rcu_dynticks_in_eqs_since()`` +(both of which rely on acquire semantics) to detect idle CPUs. +-----------------------------------------------------------------------+ | **Quick Quiz**: | diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index cccafdaa1f84..f511476b4550 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -2357,6 +2357,7 @@ section. #. `Sched Flavor (Historical)`_ #. `Sleepable RCU`_ #. `Tasks RCU`_ +#. `Tasks Trace RCU`_ Bottom-Half Flavor (Historical) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2610,6 +2611,16 @@ critical sections that are delimited by voluntary context switches, that is, calls to schedule(), cond_resched(), and synchronize_rcu_tasks(). In addition, transitions to and from userspace execution also delimit tasks-RCU read-side critical sections. +Idle tasks are ignored by Tasks RCU, and Tasks Rude RCU may be used to +interact with them. + +Note well that involuntary context switches are *not* Tasks-RCU quiescent +states. After all, in preemptible kernels, a task executing code in a +trampoline might be preempted. In this case, the Tasks-RCU grace period +clearly cannot end until that task resumes and its execution leaves that +trampoline. This means, among other things, that cond_resched() does +not provide a Tasks RCU quiescent state. (Instead, use rcu_softirq_qs() +from softirq or rcu_tasks_classic_qs() otherwise.) The tasks-RCU API is quite compact, consisting only of call_rcu_tasks(), synchronize_rcu_tasks(), and @@ -2632,6 +2643,11 @@ moniker. And this operation is considered to be quite rude by real-time workloads that don't want their ``nohz_full`` CPUs receiving IPIs and by battery-powered systems that don't want their idle CPUs to be awakened. +Once kernel entry/exit and deep-idle functions have been properly tagged +``noinstr``, Tasks RCU can start paying attention to idle tasks (except +those that are idle from RCU's perspective) and then Tasks Rude RCU can +be removed from the kernel. + The tasks-rude-RCU API is also reader-marking-free and thus quite compact, consisting of call_rcu_tasks_rude(), synchronize_rcu_tasks_rude(), and rcu_barrier_tasks_rude(). 
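To make the trampoline discussion above concrete, here is a hedged sketch of how module code might wait for a Tasks RCU grace period before freeing a trampoline that preempted tasks could still be executing in. The synchronize_rcu_tasks() API is the one named in the text (available with CONFIG_TASKS_RCU=y); my_unpatch_call_site() and my_trampoline are hypothetical stand-ins for the architecture-specific patching machinery, and the kfree() is purely illustrative.

.. code-block:: c

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    static void *my_trampoline;	/* executable stub that tasks may be running in */

    /* Hypothetical helper: point the patched call site back at the original code. */
    static void my_unpatch_call_site(void)
    {
            /* arch-specific text poking would go here */
    }

    static void my_remove_trampoline(void)
    {
            my_unpatch_call_site();		/* no new entries into the trampoline */

            /*
             * Wait for every task to pass through a voluntary context switch
             * or user-space execution.  A task preempted inside the trampoline
             * blocks this grace period, which is exactly the guarantee needed
             * before the memory can be reused.
             */
            synchronize_rcu_tasks();

            kfree(my_trampoline);		/* illustrative; real trampolines live in executable memory */
            my_trampoline = NULL;
    }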
diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 94838c65c7d9..d585a5490aee 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -250,21 +250,25 @@ rcu_assign_pointer() ^^^^^^^^^^^^^^^^^^^^ void rcu_assign_pointer(p, typeof(p) v); - Yes, rcu_assign_pointer() **is** implemented as a macro, though it - would be cool to be able to declare a function in this manner. - (Compiler experts will no doubt disagree.) + Yes, rcu_assign_pointer() **is** implemented as a macro, though + it would be cool to be able to declare a function in this manner. + (And there has been some discussion of adding overloaded functions + to the C language, so who knows?) The updater uses this spatial macro to assign a new value to an RCU-protected pointer, in order to safely communicate the change in value from the updater to the reader. This is a spatial (as opposed to temporal) macro. It does not evaluate to an rvalue, - but it does execute any memory-barrier instructions required - for a given CPU architecture. Its ordering properties are that - of a store-release operation. - - Perhaps just as important, it serves to document (1) which - pointers are protected by RCU and (2) the point at which a - given structure becomes accessible to other CPUs. That said, + but it does provide any compiler directives and memory-barrier + instructions required for a given compile or CPU architecture. + Its ordering properties are that of a store-release operation, + that is, any prior loads and stores required to initialize the + structure are ordered before the store that publishes the pointer + to that structure. + + Perhaps just as important, rcu_assign_pointer() serves to document + (1) which pointers are protected by RCU and (2) the point at which + a given structure becomes accessible to other CPUs. That said, rcu_assign_pointer() is most frequently used indirectly, via the _rcu list-manipulation primitives such as list_add_rcu(). @@ -283,7 +287,11 @@ rcu_dereference() executes any needed memory-barrier instructions for a given CPU architecture. Currently, only Alpha needs memory barriers within rcu_dereference() -- on other CPUs, it compiles to a - volatile load. + volatile load. However, no mainstream C compilers respect + address dependencies, so rcu_dereference() uses volatile casts, + which, in combination with the coding guidelines listed in + rcu_dereference.rst, prevent current compilers from breaking + these dependencies. Common coding practice uses rcu_dereference() to copy an RCU-protected pointer to a local variable, then dereferences diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst index 6acebd9e72c8..0f9f9a7b1f6c 100644 --- a/Documentation/admin-guide/cgroup-v1/pids.rst +++ b/Documentation/admin-guide/cgroup-v1/pids.rst @@ -36,7 +36,8 @@ superset of parent/child/pids.current. The pids.events file contains event counters: - - max: Number of times fork failed because limit was hit. + - max: Number of times fork failed in the cgroup because limit was hit in + self or ancestors. Example ------- diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8fbb0519d556..05862f06ed26 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -239,6 +239,13 @@ cgroup v2 currently supports the following mount options. will not be tracked by the memory controller (even if cgroup v2 is remounted later on). 
+ pids_localevents + The option restores v1-like behavior of pids.events:max, that is only + local (inside cgroup proper) fork failures are counted. Without this + option pids.events.max represents any pids.max enforcemnt across + cgroup's subtree. + + Organizing Processes and Threads -------------------------------- @@ -2205,12 +2212,18 @@ PID Interface Files descendants has ever reached. pids.events - A read-only flat-keyed file which exists on non-root cgroups. The - following entries are defined. Unless specified otherwise, a value - change in this file generates a file modified event. + A read-only flat-keyed file which exists on non-root cgroups. Unless + specified otherwise, a value change in this file generates a file + modified event. The following entries are defined. max - Number of times fork failed because limit was hit. + The number of times the cgroup's total number of processes hit the pids.max + limit (see also pids_localevents). + + pids.events.local + Similar to pids.events but the fields in the file are local + to the cgroup i.e. not hierarchical. The file modified event + generated on this file reflects only the local events. Organisational operations are not blocked by cgroup policies, so it is possible to have pids.current > pids.max. This can be done by either @@ -2346,8 +2359,12 @@ Cpuset Interface Files is always a subset of it. Users can manually set it to a value that is different from - "cpuset.cpus". The only constraint in setting it is that the - list of CPUs must be exclusive with respect to its sibling. + "cpuset.cpus". One constraint in setting it is that the list of + CPUs must be exclusive with respect to "cpuset.cpus.exclusive" + of its sibling. If "cpuset.cpus.exclusive" of a sibling cgroup + isn't set, its "cpuset.cpus" value, if set, cannot be a subset + of it to leave at least one CPU available when the exclusive + CPUs are taken away. For a parent cgroup, any one of its exclusive CPUs can only be distributed to at most one of its child cgroups. Having an @@ -2363,8 +2380,8 @@ Cpuset Interface Files cpuset-enabled cgroups. This file shows the effective set of exclusive CPUs that - can be used to create a partition root. The content of this - file will always be a subset of "cpuset.cpus" and its parent's + can be used to create a partition root. The content + of this file will always be a subset of its parent's "cpuset.cpus.exclusive.effective" if its parent is not the root cgroup. It will also be a subset of "cpuset.cpus.exclusive" if it is set. If "cpuset.cpus.exclusive" is not set, it is @@ -2625,6 +2642,15 @@ Miscellaneous controller provides 3 interface files. If two misc resources (res_ res_a 3 res_b 0 + misc.peak + A read-only flat-keyed file shown in all cgroups. It shows the + historical maximum usage of the resources in the cgroup and its + children.:: + + $ cat misc.peak + res_a 10 + res_b 8 + misc.max A read-write flat-keyed file shown in the non root cgroups. Allowed maximum usage of the resources in the cgroup and its children.:: @@ -2654,6 +2680,11 @@ Miscellaneous controller provides 3 interface files. If two misc resources (res_ The number of times the cgroup's resource usage was about to go over the max boundary. + misc.events.local + Similar to misc.events but the fields in the file are local to the + cgroup i.e. not hierarchical. The file modified event generated on + this file reflects only the local events. 
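As a hedged illustration of the hierarchical/local split described above, the short user-space program below reads the "max" counter from both pids.events and pids.events.local of one cgroup. The cgroup path is only an example, and the program assumes the single flat-keyed "max <count>" line documented for these files.

.. code-block:: c

    #include <stdio.h>

    /* Read the "max" event counter from a flat-keyed cgroup file. */
    static long read_max_counter(const char *path)
    {
            long count = -1;
            FILE *f = fopen(path, "r");

            if (!f)
                    return -1;
            if (fscanf(f, "max %ld", &count) != 1)
                    count = -1;
            fclose(f);
            return count;
    }

    int main(void)
    {
            /* Example path; point this at an existing cgroup on your system. */
            const char *cg = "/sys/fs/cgroup/mygroup";
            char path[256];

            snprintf(path, sizeof(path), "%s/pids.events", cg);
            printf("hierarchical max events: %ld\n", read_max_counter(path));

            snprintf(path, sizeof(path), "%s/pids.events.local", cg);
            printf("local max events:        %ld\n", read_max_counter(path));
            return 0;
    }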
+ Migration and Ownership ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/admin-guide/cifs/usage.rst b/Documentation/admin-guide/cifs/usage.rst index aa8290a29dc8..fd4b56c0996f 100644 --- a/Documentation/admin-guide/cifs/usage.rst +++ b/Documentation/admin-guide/cifs/usage.rst @@ -723,40 +723,26 @@ Configuration pseudo-files: ======================= ======================================================= SecurityFlags Flags which control security negotiation and also packet signing. Authentication (may/must) - flags (e.g. for NTLM and/or NTLMv2) may be combined with + flags (e.g. for NTLMv2) may be combined with the signing flags. Specifying two different password hashing mechanisms (as "must use") on the other hand does not make much sense. Default flags are:: - 0x07007 - - (NTLM, NTLMv2 and packet signing allowed). The maximum - allowable flags if you want to allow mounts to servers - using weaker password hashes is 0x37037 (lanman, - plaintext, ntlm, ntlmv2, signing allowed). Some - SecurityFlags require the corresponding menuconfig - options to be enabled. Enabling plaintext - authentication currently requires also enabling - lanman authentication in the security flags - because the cifs module only supports sending - laintext passwords using the older lanman dialect - form of the session setup SMB. (e.g. for authentication - using plain text passwords, set the SecurityFlags - to 0x30030):: + 0x00C5 + + (NTLMv2 and packet signing allowed). Some SecurityFlags + may require enabling a corresponding menuconfig option. may use packet signing 0x00001 must use packet signing 0x01001 - may use NTLM (most common password hash) 0x00002 - must use NTLM 0x02002 may use NTLMv2 0x00004 must use NTLMv2 0x04004 - may use Kerberos security 0x00008 - must use Kerberos 0x08008 - may use lanman (weak) password hash 0x00010 - must use lanman password hash 0x10010 - may use plaintext passwords 0x00020 - must use plaintext passwords 0x20020 - (reserved for future packet encryption) 0x00040 + may use Kerberos security (krb5) 0x00008 + must use Kerberos 0x08008 + may use NTLMSSP 0x00080 + must use NTLMSSP 0x80080 + seal (packet encryption) 0x00040 + must seal (not implemented yet) 0x40040 cifsFYI If set to non-zero value, additional debug information will be logged to the system error log. This field diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 27ec49af1bf2..fc01da74cf99 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5015,6 +5015,14 @@ the ->nocb_bypass queue. The definition of "too many" is supplied by this kernel boot parameter. + rcutree.nohz_full_patience_delay= [KNL] + On callback-offloaded (rcu_nocbs) CPUs, avoid + disturbing RCU unless the grace period has + reached the specified age in milliseconds. + Defaults to zero. Large values will be capped + at five seconds. All values will be rounded down + to the nearest value representable by jiffies. + rcutree.qhimark= [KNL] Set threshold of queued RCU callbacks beyond which batch limiting is disabled. 
diff --git a/Documentation/arch/riscv/cmodx.rst b/Documentation/arch/riscv/cmodx.rst index 1c0ca06b6c97..8c48bcff3df9 100644 --- a/Documentation/arch/riscv/cmodx.rst +++ b/Documentation/arch/riscv/cmodx.rst @@ -62,10 +62,10 @@ cmodx.c:: printf("Value before cmodx: %d\n", value); // Call prctl before first fence.i is called inside modify_instruction - prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX_ON, PR_RISCV_CTX_SW_FENCEI, PR_RISCV_SCOPE_PER_PROCESS); + prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX, PR_RISCV_CTX_SW_FENCEI_ON, PR_RISCV_SCOPE_PER_PROCESS); modify_instruction(); // Call prctl after final fence.i is called in process - prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX_OFF, PR_RISCV_CTX_SW_FENCEI, PR_RISCV_SCOPE_PER_PROCESS); + prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX, PR_RISCV_CTX_SW_FENCEI_OFF, PR_RISCV_SCOPE_PER_PROCESS); value = get_value(); printf("Value after cmodx: %d\n", value); diff --git a/Documentation/block/data-integrity.rst b/Documentation/block/data-integrity.rst index 6a760c0eb192..99905e880a0e 100644 --- a/Documentation/block/data-integrity.rst +++ b/Documentation/block/data-integrity.rst @@ -153,18 +153,11 @@ bio_free() will automatically free the bip. 4.2 Block Device ---------------- -Because the format of the protection data is tied to the physical -disk, each block device has been extended with a block integrity -profile (struct blk_integrity). This optional profile is registered -with the block layer using blk_integrity_register(). - -The profile contains callback functions for generating and verifying -the protection data, as well as getting and setting application tags. -The profile also contains a few constants to aid in completing, -merging and splitting the integrity metadata. +Block devices can set up the integrity information in the integrity +sub-struture of the queue_limits structure. Layered block devices will need to pick a profile that's appropriate -for all subdevices. blk_integrity_compare() can help with that. DM +for all subdevices. queue_limits_stack_integrity() can help with that. DM and MD linear, RAID0 and RAID1 are currently supported. RAID4/5/6 will require extra work due to the application tag. @@ -250,42 +243,6 @@ will require extra work due to the application tag. integrity upon completion. -5.4 Registering A Block Device As Capable Of Exchanging Integrity Metadata --------------------------------------------------------------------------- - - To enable integrity exchange on a block device the gendisk must be - registered as capable: - - `int blk_integrity_register(gendisk, blk_integrity);` - - The blk_integrity struct is a template and should contain the - following:: - - static struct blk_integrity my_profile = { - .name = "STANDARDSBODY-TYPE-VARIANT-CSUM", - .generate_fn = my_generate_fn, - .verify_fn = my_verify_fn, - .tuple_size = sizeof(struct my_tuple_size), - .tag_size = <tag bytes per hw sector>, - }; - - 'name' is a text string which will be visible in sysfs. This is - part of the userland API so chose it carefully and never change - it. The format is standards body-type-variant. - E.g. T10-DIF-TYPE1-IP or T13-EPP-0-CRC. - - 'generate_fn' generates appropriate integrity metadata (for WRITE). - - 'verify_fn' verifies that the data buffer matches the integrity - metadata. - - 'tuple_size' must be set to match the size of the integrity - metadata per sector. I.e. 8 for DIF and EPP. - - 'tag_size' must be set to identify how many bytes of tag space - are available per hardware sector. 
For DIF this is either 2 or - 0 depending on the value of the Control Mode Page ATO bit. - ---------------------------------------------------------------------- 2007-12-24 Martin K. Petersen <[email protected]> diff --git a/Documentation/block/writeback_cache_control.rst b/Documentation/block/writeback_cache_control.rst index b208488d0aae..c3707d071780 100644 --- a/Documentation/block/writeback_cache_control.rst +++ b/Documentation/block/writeback_cache_control.rst @@ -46,41 +46,50 @@ worry if the underlying devices need any explicit cache flushing and how the Forced Unit Access is implemented. The REQ_PREFLUSH and REQ_FUA flags may both be set on a single bio. +Feature settings for block drivers +---------------------------------- -Implementation details for bio based block drivers --------------------------------------------------------------- +For devices that do not support volatile write caches there is no driver +support required, the block layer completes empty REQ_PREFLUSH requests before +entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from +requests that have a payload. -These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit -directly below the submit_bio interface. For remapping drivers the REQ_FUA -bits need to be propagated to underlying devices, and a global flush needs -to be implemented for bios with the REQ_PREFLUSH bit set. For real device -drivers that do not have a volatile cache the REQ_PREFLUSH and REQ_FUA bits -on non-empty bios can simply be ignored, and REQ_PREFLUSH requests without -data can be completed successfully without doing any work. Drivers for -devices with volatile caches need to implement the support for these -flags themselves without any help from the block layer. +For devices with volatile write caches the driver needs to tell the block layer +that it supports flushing caches by setting the + BLK_FEAT_WRITE_CACHE -Implementation details for request_fn based block drivers ---------------------------------------------------------- +flag in the queue_limits feature field. For devices that also support the FUA +bit the block layer needs to be told to pass on the REQ_FUA bit by also setting +the -For devices that do not support volatile write caches there is no driver -support required, the block layer completes empty REQ_PREFLUSH requests before -entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from -requests that have a payload. For devices with volatile write caches the -driver needs to tell the block layer that it supports flushing caches by -doing:: + BLK_FEAT_FUA + +flag in the features field of the queue_limits structure. + +Implementation details for bio based block drivers +-------------------------------------------------- + +For bio based drivers the REQ_PREFLUSH and REQ_FUA bit are simply passed on to +the driver if the driver sets the BLK_FEAT_WRITE_CACHE flag and the driver +needs to handle them. + +*NOTE*: The REQ_FUA bit also gets passed on when the BLK_FEAT_FUA flags is +_not_ set. Any bio based driver that sets BLK_FEAT_WRITE_CACHE also needs to +handle REQ_FUA. - blk_queue_write_cache(sdkp->disk->queue, true, false); +For remapping drivers the REQ_FUA bits need to be propagated to underlying +devices, and a global flush needs to be implemented for bios with the +REQ_PREFLUSH bit set. -and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn. 
Note that -REQ_PREFLUSH requests with a payload are automatically turned into a sequence -of an empty REQ_OP_FLUSH request followed by the actual write by the block -layer. For devices that also support the FUA bit the block layer needs -to be told to pass through the REQ_FUA bit using:: +Implementation details for blk-mq drivers +----------------------------------------- - blk_queue_write_cache(sdkp->disk->queue, true, true); +When the BLK_FEAT_WRITE_CACHE flag is set, REQ_OP_WRITE | REQ_PREFLUSH requests +with a payload are automatically turned into a sequence of a REQ_OP_FLUSH +request followed by the actual write by the block layer. -and the driver must handle write requests that have the REQ_FUA bit set -in prep_fn/request_fn. If the FUA bit is not natively supported the block -layer turns it into an empty REQ_OP_FLUSH request after the actual write. +When the BLK_FEAT_FUA flags is set, the REQ_FUA bit is simply passed on for the +REQ_OP_WRITE request, else a REQ_OP_FLUSH request is sent by the block layer +after the completion of the write request for bio submissions with the REQ_FUA +bit set. diff --git a/Documentation/devicetree/bindings/cache/qcom,llcc.yaml b/Documentation/devicetree/bindings/cache/qcom,llcc.yaml index 07ccbda4a0ab..b9a9f2cf32a1 100644 --- a/Documentation/devicetree/bindings/cache/qcom,llcc.yaml +++ b/Documentation/devicetree/bindings/cache/qcom,llcc.yaml @@ -66,7 +66,6 @@ allOf: compatible: contains: enum: - - qcom,qdu1000-llcc - qcom,sc7180-llcc - qcom,sm6350-llcc then: @@ -104,6 +103,7 @@ allOf: compatible: contains: enum: + - qcom,qdu1000-llcc - qcom,sc8180x-llcc - qcom,sc8280xp-llcc - qcom,x1e80100-llcc diff --git a/Documentation/devicetree/bindings/timer/realtek,otto-timer.yaml b/Documentation/devicetree/bindings/timer/realtek,otto-timer.yaml new file mode 100644 index 000000000000..7b6ec2c69484 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/realtek,otto-timer.yaml @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/realtek,otto-timer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Realtek Otto SoCs Timer/Counter + +description: + Realtek SoCs support a number of timers/counters. These are used + as a per CPU clock event generator and an overall CPU clocksource. 
+ +maintainers: + - Chris Packham <[email protected]> + +properties: + $nodename: + pattern: "^timer@[0-9a-f]+$" + + compatible: + items: + - enum: + - realtek,rtl9302-timer + - const: realtek,otto-timer + + reg: + items: + - description: timer0 registers + - description: timer1 registers + - description: timer2 registers + - description: timer3 registers + - description: timer4 registers + + clocks: + maxItems: 1 + + interrupts: + items: + - description: timer0 interrupt + - description: timer1 interrupt + - description: timer2 interrupt + - description: timer3 interrupt + - description: timer4 interrupt + +required: + - compatible + - reg + - clocks + - interrupts + +additionalProperties: false + +examples: + - | + timer@3200 { + compatible = "realtek,rtl9302-timer", "realtek,otto-timer"; + reg = <0x3200 0x10>, <0x3210 0x10>, <0x3220 0x10>, + <0x3230 0x10>, <0x3240 0x10>; + + interrupt-parent = <&intc>; + interrupts = <7>, <8>, <9>, <10>, <11>; + clocks = <&lx_clk>; + }; diff --git a/Documentation/devicetree/bindings/timer/renesas,tmu.yaml b/Documentation/devicetree/bindings/timer/renesas,tmu.yaml index 360a5cf1ae9c..b6dd98d956f3 100644 --- a/Documentation/devicetree/bindings/timer/renesas,tmu.yaml +++ b/Documentation/devicetree/bindings/timer/renesas,tmu.yaml @@ -21,13 +21,24 @@ properties: compatible: items: - enum: + - renesas,tmu-r8a73a4 # R-Mobile APE6 - renesas,tmu-r8a7740 # R-Mobile A1 + - renesas,tmu-r8a7742 # RZ/G1H + - renesas,tmu-r8a7743 # RZ/G1M + - renesas,tmu-r8a7744 # RZ/G1N + - renesas,tmu-r8a7745 # RZ/G1E + - renesas,tmu-r8a77470 # RZ/G1C - renesas,tmu-r8a774a1 # RZ/G2M - renesas,tmu-r8a774b1 # RZ/G2N - renesas,tmu-r8a774c0 # RZ/G2E - renesas,tmu-r8a774e1 # RZ/G2H - renesas,tmu-r8a7778 # R-Car M1A - renesas,tmu-r8a7779 # R-Car H1 + - renesas,tmu-r8a7790 # R-Car H2 + - renesas,tmu-r8a7791 # R-Car M2-W + - renesas,tmu-r8a7792 # R-Car V2H + - renesas,tmu-r8a7793 # R-Car M2-N + - renesas,tmu-r8a7794 # R-Car E2 - renesas,tmu-r8a7795 # R-Car H3 - renesas,tmu-r8a7796 # R-Car M3-W - renesas,tmu-r8a77961 # R-Car M3-W+ @@ -94,6 +105,7 @@ if: compatible: contains: enum: + - renesas,tmu-r8a73a4 - renesas,tmu-r8a7740 - renesas,tmu-r8a7778 - renesas,tmu-r8a7779 diff --git a/Documentation/devicetree/bindings/timer/sifive,clint.yaml b/Documentation/devicetree/bindings/timer/sifive,clint.yaml index fced6f2d8ecb..b42d43d2de48 100644 --- a/Documentation/devicetree/bindings/timer/sifive,clint.yaml +++ b/Documentation/devicetree/bindings/timer/sifive,clint.yaml @@ -40,6 +40,7 @@ properties: - allwinner,sun20i-d1-clint - sophgo,cv1800b-clint - sophgo,cv1812h-clint + - sophgo,sg2002-clint - thead,th1520-clint - const: thead,c900-clint - items: diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst index 5149ecdc53c7..d732c42526df 100644 --- a/Documentation/driver-api/cxl/memory-devices.rst +++ b/Documentation/driver-api/cxl/memory-devices.rst @@ -328,6 +328,12 @@ CXL Memory Device .. kernel-doc:: drivers/cxl/mem.c :doc: cxl mem +.. kernel-doc:: drivers/cxl/cxlmem.h + :internal: + +.. kernel-doc:: drivers/cxl/core/memdev.c + :identifiers: + CXL Port -------- .. kernel-doc:: drivers/cxl/port.c @@ -341,6 +347,15 @@ CXL Core .. kernel-doc:: drivers/cxl/cxl.h :internal: +.. kernel-doc:: drivers/cxl/core/hdm.c + :doc: cxl core hdm + +.. kernel-doc:: drivers/cxl/core/hdm.c + :identifiers: + +.. kernel-doc:: drivers/cxl/core/cdat.c + :identifiers: + .. 
kernel-doc:: drivers/cxl/core/port.c :doc: cxl core diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 8f5c1ee02e2f..e8e496d23e1d 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -34,6 +34,7 @@ algorithms work. seq_file sharedsubtree idmappings + iomap/index automount-support diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst new file mode 100644 index 000000000000..f8ee3427bc1a --- /dev/null +++ b/Documentation/filesystems/iomap/design.rst @@ -0,0 +1,441 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. _iomap_design: + +.. + Dumb style notes to maintain the author's sanity: + Please try to start sentences on separate lines so that + sentence changes don't bleed colors in diff. + Heading decorations are documented in sphinx.rst. + +============== +Library Design +============== + +.. contents:: Table of Contents + :local: + +Introduction +============ + +iomap is a filesystem library for handling common file operations. +The library has two layers: + + 1. A lower layer that provides an iterator over ranges of file offsets. + This layer tries to obtain mappings of each file ranges to storage + from the filesystem, but the storage information is not necessarily + required. + + 2. An upper layer that acts upon the space mappings provided by the + lower layer iterator. + +The iteration can involve mappings of file's logical offset ranges to +physical extents, but the storage layer information is not necessarily +required, e.g. for walking cached file information. +The library exports various APIs for implementing file operations such +as: + + * Pagecache reads and writes + * Folio write faults to the pagecache + * Writeback of dirty folios + * Direct I/O reads and writes + * fsdax I/O reads, writes, loads, and stores + * FIEMAP + * lseek ``SEEK_DATA`` and ``SEEK_HOLE`` + * swapfile activation + +This origins of this library is the file I/O path that XFS once used; it +has now been extended to cover several other operations. + +Who Should Read This? +===================== + +The target audience for this document are filesystem, storage, and +pagecache programmers and code reviewers. + +If you are working on PCI, machine architectures, or device drivers, you +are most likely in the wrong place. + +How Is This Better? +=================== + +Unlike the classic Linux I/O model which breaks file I/O into small +units (generally memory pages or blocks) and looks up space mappings on +the basis of that unit, the iomap model asks the filesystem for the +largest space mappings that it can create for a given file operation and +initiates operations on that basis. +This strategy improves the filesystem's visibility into the size of the +operation being performed, which enables it to combat fragmentation with +larger space allocations when possible. +Larger space mappings improve runtime performance by amortizing the cost +of mapping function calls into the filesystem across a larger amount of +data. + +At a high level, an iomap operation `looks like this +<https://lore.kernel.org/all/[email protected]/>`_: + +1. For each byte in the operation range... + + 1. Obtain a space mapping via ``->iomap_begin`` + + 2. For each sub-unit of work... + + 1. Revalidate the mapping and go back to (1) above, if necessary. + So far only the pagecache operations need to do this. + + 2. Do the work + + 3. Increment operation cursor + + 4. 
Release the mapping via ``->iomap_end``, if necessary + +Each iomap operation will be covered in more detail below. +This library was covered previously by an `LWN article +<https://lwn.net/Articles/935934/>`_ and a `KernelNewbies page +<https://kernelnewbies.org/KernelProjects/iomap>`_. + +The goal of this document is to provide a brief discussion of the +design and capabilities of iomap, followed by a more detailed catalog +of the interfaces presented by iomap. +If you change iomap, please update this design document. + +File Range Iterator +=================== + +Definitions +----------- + + * **buffer head**: Shattered remnants of the old buffer cache. + + * ``fsblock``: The block size of a file, also known as ``i_blocksize``. + + * ``i_rwsem``: The VFS ``struct inode`` rwsemaphore. + Processes hold this in shared mode to read file state and contents. + Some filesystems may allow shared mode for writes. + Processes often hold this in exclusive mode to change file state and + contents. + + * ``invalidate_lock``: The pagecache ``struct address_space`` + rwsemaphore that protects against folio insertion and removal for + filesystems that support punching out folios below EOF. + Processes wishing to insert folios must hold this lock in shared + mode to prevent removal, though concurrent insertion is allowed. + Processes wishing to remove folios must hold this lock in exclusive + mode to prevent insertions. + Concurrent removals are not allowed. + + * ``dax_read_lock``: The RCU read lock that dax takes to prevent a + device pre-shutdown hook from returning before other threads have + released resources. + + * **filesystem mapping lock**: This synchronization primitive is + internal to the filesystem and must protect the file mapping data + from updates while a mapping is being sampled. + The filesystem author must determine how this coordination should + happen; it does not need to be an actual lock. + + * **iomap internal operation lock**: This is a general term for + synchronization primitives that iomap functions take while holding a + mapping. + A specific example would be taking the folio lock while reading or + writing the pagecache. + + * **pure overwrite**: A write operation that does not require any + metadata or zeroing operations to perform during either submission + or completion. + This implies that the filesystem must have already allocated space + on disk as ``IOMAP_MAPPED`` and the filesystem must not place any + constraints on IO alignment or size. + The only constraints on I/O alignment are device level (minimum I/O + size and alignment, typically sector size). + +``struct iomap`` +---------------- + +The filesystem communicates to the iomap iterator the mapping of +byte ranges of a file to byte ranges of a storage device with the +structure below: + +.. code-block:: c + + struct iomap { + u64 addr; + loff_t offset; + u64 length; + u16 type; + u16 flags; + struct block_device *bdev; + struct dax_device *dax_dev; + void *inline_data; + void *private; + const struct iomap_folio_ops *folio_ops; + u64 validity_cookie; + }; + +The fields are as follows: + + * ``offset`` and ``length`` describe the range of file offsets, in + bytes, covered by this mapping. + These fields must always be set by the filesystem. + + * ``type`` describes the type of the space mapping: + + * **IOMAP_HOLE**: No storage has been allocated. + This type must never be returned in response to an ``IOMAP_WRITE`` + operation because writes must allocate and map space, and return + the mapping.
+ The ``addr`` field must be set to ``IOMAP_NULL_ADDR``. + iomap does not support writing (whether via pagecache or direct + I/O) to a hole. + + * **IOMAP_DELALLOC**: A promise to allocate space at a later time + ("delayed allocation"). + If the filesystem returns IOMAP_F_NEW here and the write fails, the + ``->iomap_end`` function must delete the reservation. + The ``addr`` field must be set to ``IOMAP_NULL_ADDR``. + + * **IOMAP_MAPPED**: The file range maps to specific space on the + storage device. + The device is returned in ``bdev`` or ``dax_dev``. + The device address, in bytes, is returned via ``addr``. + + * **IOMAP_UNWRITTEN**: The file range maps to specific space on the + storage device, but the space has not yet been initialized. + The device is returned in ``bdev`` or ``dax_dev``. + The device address, in bytes, is returned via ``addr``. + Reads from this type of mapping will return zeroes to the caller. + For a write or writeback operation, the ioend should update the + mapping to MAPPED. + Refer to the sections about ioends for more details. + + * **IOMAP_INLINE**: The file range maps to the memory buffer + specified by ``inline_data``. + For write operation, the ``->iomap_end`` function presumably + handles persisting the data. + The ``addr`` field must be set to ``IOMAP_NULL_ADDR``. + + * ``flags`` describe the status of the space mapping. + These flags should be set by the filesystem in ``->iomap_begin``: + + * **IOMAP_F_NEW**: The space under the mapping is newly allocated. + Areas that will not be written to must be zeroed. + If a write fails and the mapping is a space reservation, the + reservation must be deleted. + + * **IOMAP_F_DIRTY**: The inode will have uncommitted metadata needed + to access any data written. + fdatasync is required to commit these changes to persistent + storage. + This needs to take into account metadata changes that *may* be made + at I/O completion, such as file size updates from direct I/O. + + * **IOMAP_F_SHARED**: The space under the mapping is shared. + Copy on write is necessary to avoid corrupting other file data. + + * **IOMAP_F_BUFFER_HEAD**: This mapping requires the use of buffer + heads for pagecache operations. + Do not add more uses of this. + + * **IOMAP_F_MERGED**: Multiple contiguous block mappings were + coalesced into this single mapping. + This is only useful for FIEMAP. + + * **IOMAP_F_XATTR**: The mapping is for extended attribute data, not + regular file data. + This is only useful for FIEMAP. + + * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can + be set by the filesystem for its own purposes. + + These flags can be set by iomap itself during file operations. + The filesystem should supply an ``->iomap_end`` function if it needs + to observe these flags: + + * **IOMAP_F_SIZE_CHANGED**: The file size has changed as a result of + using this mapping. + + * **IOMAP_F_STALE**: The mapping was found to be stale. + iomap will call ``->iomap_end`` on this mapping and then + ``->iomap_begin`` to obtain a new mapping. + + Currently, these flags are only set by pagecache operations. + + * ``addr`` describes the device address, in bytes. + + * ``bdev`` describes the block device for this mapping. + This only needs to be set for mapped or unwritten operations. + + * ``dax_dev`` describes the DAX device for this mapping. + This only needs to be set for mapped or unwritten operations, and + only for a fsdax operation. + + * ``inline_data`` points to a memory buffer for I/O involving + ``IOMAP_INLINE`` mappings. 
+ This value is ignored for all other mapping types. + + * ``private`` is a pointer to `filesystem-private information + <https://lore.kernel.org/all/[email protected]/>`_. + This value will be passed unchanged to ``->iomap_end``. + + * ``folio_ops`` will be covered in the section on pagecache operations. + + * ``validity_cookie`` is a magic freshness value set by the filesystem + that should be used to detect stale mappings. + For pagecache operations this is critical for correct operation + because page faults can occur, which implies that filesystem locks + should not be held between ``->iomap_begin`` and ``->iomap_end``. + Filesystems with completely static mappings need not set this value. + Only pagecache operations revalidate mappings; see the section about + ``iomap_valid`` for details. + +``struct iomap_ops`` +-------------------- + +Every iomap function requires the filesystem to pass an operations +structure to obtain a mapping and (optionally) to release the mapping: + +.. code-block:: c + + struct iomap_ops { + int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap, + struct iomap *srcmap); + + int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned flags, + struct iomap *iomap); + }; + +``->iomap_begin`` +~~~~~~~~~~~~~~~~~ + +iomap operations call ``->iomap_begin`` to obtain one file mapping for +the range of bytes specified by ``pos`` and ``length`` for the file +``inode``. +This mapping should be returned through the ``iomap`` pointer. +The mapping must cover at least the first byte of the supplied file +range, but it does not need to cover the entire requested range. + +Each iomap operation describes the requested operation through the +``flags`` argument. +The exact value of ``flags`` will be documented in the +operation-specific sections below. +These flags can, at least in principle, apply generally to iomap +operations: + + * ``IOMAP_DIRECT`` is set when the caller wishes to issue file I/O to + block storage. + + * ``IOMAP_DAX`` is set when the caller wishes to issue file I/O to + memory-like storage. + + * ``IOMAP_NOWAIT`` is set when the caller wishes to perform a best + effort attempt to avoid any operation that would result in blocking + the submitting task. + This is similar in intent to ``O_NONBLOCK`` for network APIs - it is + intended for asynchronous applications to keep doing other work + instead of waiting for the specific unavailable filesystem resource + to become available. + Filesystems implementing ``IOMAP_NOWAIT`` semantics need to use + trylock algorithms. + They need to be able to satisfy the entire I/O request range with a + single iomap mapping. + They need to avoid reading or writing metadata synchronously. + They need to avoid blocking memory allocations. + They need to avoid waiting on transaction reservations to allow + modifications to take place. + They probably should not be allocating new space. + And so on. + If there is any doubt in the filesystem developer's mind as to + whether any specific ``IOMAP_NOWAIT`` operation may end up blocking, + then they should return ``-EAGAIN`` as early as possible rather than + start the operation and force the submitting task to block. + ``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or + ``RWF_NOWAIT``. 
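The trylock expectations listed for ``IOMAP_NOWAIT`` above can be summarized with a hedged sketch of an ``->iomap_begin`` implementation. Everything prefixed ``myfs_`` (the locking helpers, ``struct myfs_extent``, and the extent lookup) is invented for illustration and declared only to keep the sketch self-contained; the iomap fields, types, and flags are the ones documented in this file.

.. code-block:: c

    #include <linux/fs.h>
    #include <linux/iomap.h>

    /* Hypothetical extent record returned by the filesystem's mapping lookup. */
    struct myfs_extent {
            loff_t	file_off;	/* bytes */
            u64	disk_off;	/* bytes */
            u64	len;		/* bytes */
            bool	allocated;
    };

    /* Hypothetical filesystem internals, declared only so the sketch stands alone. */
    bool myfs_ilock_shared_nowait(struct inode *inode);
    void myfs_ilock_shared(struct inode *inode);
    void myfs_iunlock_shared(struct inode *inode);
    int myfs_lookup_extent(struct inode *inode, loff_t pos, loff_t len,
                           struct myfs_extent *ext);

    static int myfs_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
                                unsigned flags, struct iomap *iomap,
                                struct iomap *srcmap)
    {
            struct myfs_extent ext;
            int error;

            /* Never block the submitter: trylock or bail out with -EAGAIN. */
            if (flags & IOMAP_NOWAIT) {
                    if (!myfs_ilock_shared_nowait(inode))
                            return -EAGAIN;
            } else {
                    myfs_ilock_shared(inode);
            }

            /* Sample the mapping while holding the filesystem mapping lock. */
            error = myfs_lookup_extent(inode, pos, length, &ext);
            myfs_iunlock_shared(inode);
            if (error)
                    return error;

            iomap->bdev = inode->i_sb->s_bdev;
            iomap->offset = ext.file_off;
            iomap->length = ext.len;
            if (ext.allocated) {
                    iomap->type = IOMAP_MAPPED;
                    iomap->addr = ext.disk_off;
            } else {
                    iomap->type = IOMAP_HOLE;
                    iomap->addr = IOMAP_NULL_ADDR;
            }
            iomap->flags = 0;
            return 0;
    }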
+ +If it is necessary to read existing file contents from a `different +<https://lore.kernel.org/all/[email protected]/>`_ +device or address range on a device, the filesystem should return that +information via ``srcmap``. +Only pagecache and fsdax operations support reading from one mapping and +writing to another. + +``->iomap_end`` +~~~~~~~~~~~~~~~ + +After the operation completes, the ``->iomap_end`` function, if present, +is called to signal that iomap is finished with a mapping. +Typically, implementations will use this function to tear down any +context that were set up in ``->iomap_begin``. +For example, a write might wish to commit the reservations for the bytes +that were operated upon and unreserve any space that was not operated +upon. +``written`` might be zero if no bytes were touched. +``flags`` will contain the same value passed to ``->iomap_begin``. +iomap ops for reads are not likely to need to supply this function. + +Both functions should return a negative errno code on error, or zero on +success. + +Preparing for File Operations +============================= + +iomap only handles mapping and I/O. +Filesystems must still call out to the VFS to check input parameters +and file state before initiating an I/O operation. +It does not handle obtaining filesystem freeze protection, updating of +timestamps, stripping privileges, or access control. + +Locking Hierarchy +================= + +iomap requires that filesystems supply their own locking model. +There are three categories of synchronization primitives, as far as +iomap is concerned: + + * The **upper** level primitive is provided by the filesystem to + coordinate access to different iomap operations. + The exact primitive is specifc to the filesystem and operation, + but is often a VFS inode, pagecache invalidation, or folio lock. + For example, a filesystem might take ``i_rwsem`` before calling + ``iomap_file_buffered_write`` and ``iomap_file_unshare`` to prevent + these two file operations from clobbering each other. + Pagecache writeback may lock a folio to prevent other threads from + accessing the folio until writeback is underway. + + * The **lower** level primitive is taken by the filesystem in the + ``->iomap_begin`` and ``->iomap_end`` functions to coordinate + access to the file space mapping information. + The fields of the iomap object should be filled out while holding + this primitive. + The upper level synchronization primitive, if any, remains held + while acquiring the lower level synchronization primitive. + For example, XFS takes ``ILOCK_EXCL`` and ext4 takes ``i_data_sem`` + while sampling mappings. + Filesystems with immutable mapping information may not require + synchronization here. + + * The **operation** primitive is taken by an iomap operation to + coordinate access to its own internal data structures. + The upper level synchronization primitive, if any, remains held + while acquiring this primitive. + The lower level primitive is not held while acquiring this + primitive. + For example, pagecache write operations will obtain a file mapping, + then grab and lock a folio to copy new contents. + It may also lock an internal folio state object to update metadata. + +The exact locking requirements are specific to the filesystem; for +certain operations, some of these locks can be elided. +All further mention of locking are *recommendations*, not mandates. +Each filesystem author must figure out the locking for themself. + +Bugs and Limitations +==================== + + * No support for fscrypt. 
+ * No support for compression. + * No support for fsverity yet. + * Strong assumptions that IO should work the way it does on XFS. + * Does iomap *actually* work for non-regular file data? + +Patches welcome! diff --git a/Documentation/filesystems/iomap/index.rst b/Documentation/filesystems/iomap/index.rst new file mode 100644 index 000000000000..3c6a52440250 --- /dev/null +++ b/Documentation/filesystems/iomap/index.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +VFS iomap Documentation +======================= + +.. toctree:: + :maxdepth: 2 + :numbered: + + design + operations + porting diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst new file mode 100644 index 000000000000..8e6c721d2330 --- /dev/null +++ b/Documentation/filesystems/iomap/operations.rst @@ -0,0 +1,713 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. _iomap_operations: + +.. + Dumb style notes to maintain the author's sanity: + Please try to start sentences on separate lines so that + sentence changes don't bleed colors in diff. + Heading decorations are documented in sphinx.rst. + +========================= +Supported File Operations +========================= + +.. contents:: Table of Contents + :local: + +Below are a discussion of the high level file operations that iomap +implements. + +Buffered I/O +============ + +Buffered I/O is the default file I/O path in Linux. +File contents are cached in memory ("pagecache") to satisfy reads and +writes. +Dirty cache will be written back to disk at some point that can be +forced via ``fsync`` and variants. + +iomap implements nearly all the folio and pagecache management that +filesystems have to implement themselves under the legacy I/O model. +This means that the filesystem need not know the details of allocating, +mapping, managing uptodate and dirty state, or writeback of pagecache +folios. +Under the legacy I/O model, this was managed very inefficiently with +linked lists of buffer heads instead of the per-folio bitmaps that iomap +uses. +Unless the filesystem explicitly opts in to buffer heads, they will not +be used, which makes buffered I/O much more efficient, and the pagecache +maintainer much happier. + +``struct address_space_operations`` +----------------------------------- + +The following iomap functions can be referenced directly from the +address space operations structure: + + * ``iomap_dirty_folio`` + * ``iomap_release_folio`` + * ``iomap_invalidate_folio`` + * ``iomap_is_partially_uptodate`` + +The following address space operations can be wrapped easily: + + * ``read_folio`` + * ``readahead`` + * ``writepages`` + * ``bmap`` + * ``swap_activate`` + +``struct iomap_folio_ops`` +-------------------------- + +The ``->iomap_begin`` function for pagecache operations may set the +``struct iomap::folio_ops`` field to an ops structure to override +default behaviors of iomap: + +.. code-block:: c + + struct iomap_folio_ops { + struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos, + unsigned len); + void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied, + struct folio *folio); + bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); + }; + +iomap calls these functions: + + - ``get_folio``: Called to allocate and return an active reference to + a locked folio prior to starting a write. + If this function is not provided, iomap will call + ``iomap_get_folio``. 
+ This could be used to `set up per-folio filesystem state + <https://lore.kernel.org/all/[email protected]/>`_ + for a write. + + - ``put_folio``: Called to unlock and put a folio after a pagecache + operation completes. + If this function is not provided, iomap will ``folio_unlock`` and + ``folio_put`` on its own. + This could be used to `commit per-folio filesystem state + <https://lore.kernel.org/all/[email protected]/>`_ + that was set up by ``->get_folio``. + + - ``iomap_valid``: The filesystem may not hold locks between + ``->iomap_begin`` and ``->iomap_end`` because pagecache operations + can take folio locks, fault on userspace pages, initiate writeback + for memory reclamation, or engage in other time-consuming actions. + If a file's space mapping data are mutable, it is possible that the + mapping for a particular pagecache folio can `change in the time it + takes + <https://lore.kernel.org/all/[email protected]/>`_ + to allocate, install, and lock that folio. + + For the pagecache, races can happen if writeback doesn't take + ``i_rwsem`` or ``invalidate_lock`` and updates mapping information. + Races can also happen if the filesystem allows concurrent writes. + For such files, the mapping *must* be revalidated after the folio + lock has been taken so that iomap can manage the folio correctly. + + fsdax does not need this revalidation because there's no writeback + and no support for unwritten extents. + + Filesystems subject to this kind of race must provide a + ``->iomap_valid`` function to decide if the mapping is still valid. + If the mapping is not valid, the mapping will be sampled again. + + To support making the validity decision, the filesystem's + ``->iomap_begin`` function may set ``struct iomap::validity_cookie`` + at the same time that it populates the other iomap fields. + A simple validation cookie implementation is a sequence counter. + If the filesystem bumps the sequence counter every time it modifies + the inode's extent map, it can be placed in the ``struct + iomap::validity_cookie`` during ``->iomap_begin``. + If the value in the cookie is found to be different to the value + the filesystem holds when the mapping is passed back to + ``->iomap_valid``, then the iomap should be considered stale and the + validation failed. + +These ``struct kiocb`` flags are significant for buffered I/O with iomap: + + * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``. + +Internal per-Folio State +------------------------ + +If the fsblock size matches the size of a pagecache folio, it is assumed +that all disk I/O operations will operate on the entire folio. +The uptodate (memory contents are at least as new as what's on disk) and +dirty (memory contents are newer than what's on disk) status of the +folio are all that's needed for this case. + +If the fsblock size is less than the size of a pagecache folio, iomap +tracks the per-fsblock uptodate and dirty state itself. +This enables iomap to handle both "bs < ps" `filesystems +<https://lore.kernel.org/all/[email protected]/>`_ +and large folios in the pagecache. + +iomap internally tracks two state bits per fsblock: + + * ``uptodate``: iomap will try to keep folios fully up to date. + If there are read(ahead) errors, those fsblocks will not be marked + uptodate. + The folio itself will be marked uptodate when all fsblocks within the + folio are uptodate. + + * ``dirty``: iomap will set the per-block dirty state when programs + write to the file. + The folio itself will be marked dirty when any fsblock within the + folio is dirty.
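Returning to the ``->iomap_valid`` revalidation described earlier in this section, here is a hedged sketch of the sequence-counter cookie scheme. The ``struct myfs_inode`` container and its ``map_seq`` field are hypothetical; ``struct iomap_folio_ops``, ``->iomap_valid``, and ``validity_cookie`` are the real hooks shown above.

.. code-block:: c

    #include <linux/fs.h>
    #include <linux/iomap.h>

    /* Hypothetical incore inode with a counter bumped on every extent-map change. */
    struct myfs_inode {
            struct inode	vfs_inode;
            u32		map_seq;
    };

    #define MYFS_I(inode)	container_of(inode, struct myfs_inode, vfs_inode)

    /* Called from ->iomap_begin while the filesystem mapping lock is still held. */
    static void myfs_iomap_set_cookie(struct inode *inode, struct iomap *iomap)
    {
            iomap->validity_cookie = READ_ONCE(MYFS_I(inode)->map_seq);
    }

    /* Called with the folio locked; a changed counter means the mapping is stale. */
    static bool myfs_iomap_valid(struct inode *inode, const struct iomap *iomap)
    {
            return iomap->validity_cookie == READ_ONCE(MYFS_I(inode)->map_seq);
    }

    static const struct iomap_folio_ops myfs_folio_ops = {
            .iomap_valid	= myfs_iomap_valid,
    };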
+ +iomap also tracks the amount of read and write disk IOs that are in +flight. +This structure is much lighter weight than ``struct buffer_head`` +because there is only one per folio, and the per-fsblock overhead is two +bits vs. 104 bytes. + +Filesystems wishing to turn on large folios in the pagecache should call +``mapping_set_large_folios`` when initializing the incore inode. + +Buffered Readahead and Reads +---------------------------- + +The ``iomap_readahead`` function initiates readahead to the pagecache. +The ``iomap_read_folio`` function reads one folio's worth of data into +the pagecache. +The ``flags`` argument to ``->iomap_begin`` will be set to zero. +The pagecache takes whatever locks it needs before calling the +filesystem. + +Buffered Writes +--------------- + +The ``iomap_file_buffered_write`` function writes an ``iocb`` to the +pagecache. +``IOMAP_WRITE`` or ``IOMAP_WRITE`` | ``IOMAP_NOWAIT`` will be passed as +the ``flags`` argument to ``->iomap_begin``. +Callers commonly take ``i_rwsem`` in either shared or exclusive mode +before calling this function. + +mmap Write Faults +~~~~~~~~~~~~~~~~~ + +The ``iomap_page_mkwrite`` function handles a write fault to a folio in +the pagecache. +``IOMAP_WRITE | IOMAP_FAULT`` will be passed as the ``flags`` argument +to ``->iomap_begin``. +Callers commonly take the mmap ``invalidate_lock`` in shared or +exclusive mode before calling this function. + +Buffered Write Failures +~~~~~~~~~~~~~~~~~~~~~~~ + +After a short write to the pagecache, the areas not written will not +become marked dirty. +The filesystem must arrange to `cancel +<https://lore.kernel.org/all/[email protected]/>`_ +such `reservations +<https://lore.kernel.org/linux-xfs/[email protected]/>`_ +because writeback will not consume the reservation. +The ``iomap_file_buffered_write_punch_delalloc`` can be called from a +``->iomap_end`` function to find all the clean areas of the folios +caching a fresh (``IOMAP_F_NEW``) delalloc mapping. +It takes the ``invalidate_lock``. + +The filesystem must supply a function ``punch`` to be called for +each file range in this state. +This function must *only* remove delayed allocation reservations, in +case another thread racing with the current thread writes successfully +to the same region and triggers writeback to flush the dirty data out to +disk. + +Zeroing for File Operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Filesystems can call ``iomap_zero_range`` to perform zeroing of the +pagecache for non-truncation file operations that are not aligned to +the fsblock size. +``IOMAP_ZERO`` will be passed as the ``flags`` argument to +``->iomap_begin``. +Callers typically hold ``i_rwsem`` and ``invalidate_lock`` in exclusive +mode before calling this function. + +Unsharing Reflinked File Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Filesystems can call ``iomap_file_unshare`` to force a file sharing +storage with another file to preemptively copy the shared data to newly +allocate storage. +``IOMAP_WRITE | IOMAP_UNSHARE`` will be passed as the ``flags`` argument +to ``->iomap_begin``. +Callers typically hold ``i_rwsem`` and ``invalidate_lock`` in exclusive +mode before calling this function. + +Truncation +---------- + +Filesystems can call ``iomap_truncate_page`` to zero the bytes in the +pagecache from EOF to the end of the fsblock during a file truncation +operation. +``truncate_setsize`` or ``truncate_pagecache`` will take care of +everything after the EOF block. +``IOMAP_ZERO`` will be passed as the ``flags`` argument to +``->iomap_begin``. 
+Callers typically hold ``i_rwsem`` and ``invalidate_lock`` in exclusive +mode before calling this function. + +Pagecache Writeback +------------------- + +Filesystems can call ``iomap_writepages`` to respond to a request to +write dirty pagecache folios to disk. +The ``mapping`` and ``wbc`` parameters should be passed unchanged. +The ``wpc`` pointer should be allocated by the filesystem and must +be initialized to zero. + +The pagecache will lock each folio before trying to schedule it for +writeback. +It does not lock ``i_rwsem`` or ``invalidate_lock``. + +The dirty bit will be cleared for all folios run through the +``->map_blocks`` machinery described below even if the writeback fails. +This is to prevent dirty folio clots when storage devices fail; an +``-EIO`` is recorded for userspace to collect via ``fsync``. + +The ``ops`` structure must be specified and is as follows: + +``struct iomap_writeback_ops`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: c + + struct iomap_writeback_ops { + int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, + loff_t offset, unsigned len); + int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + void (*discard_folio)(struct folio *folio, loff_t pos); + }; + +The fields are as follows: + + - ``map_blocks``: Sets ``wpc->iomap`` to the space mapping of the file + range (in bytes) given by ``offset`` and ``len``. + iomap calls this function for each dirty fs block in each dirty folio, + though it will `reuse mappings + <https://lore.kernel.org/all/[email protected]/>`_ + for runs of contiguous dirty fsblocks within a folio. + Do not return ``IOMAP_INLINE`` mappings here; the ``->iomap_end`` + function must deal with persisting written data. + Do not return ``IOMAP_DELALLOC`` mappings here; iomap currently + requires mapping to allocated space. + Filesystems can skip a potentially expensive mapping lookup if the + mappings have not changed. + This revalidation must be open-coded by the filesystem; it is + unclear if ``iomap::validity_cookie`` can be reused for this + purpose. + This function must be supplied by the filesystem. + + - ``prepare_ioend``: Enables filesystems to transform the writeback + ioend or perform any other preparatory work before the writeback I/O + is submitted. + This might include pre-write space accounting updates, or installing + a custom ``->bi_end_io`` function for internal purposes, such as + deferring the ioend completion to a workqueue to run metadata update + transactions from process context. + This function is optional. + + - ``discard_folio``: iomap calls this function after ``->map_blocks`` + fails to schedule I/O for any part of a dirty folio. + The function should throw away any reservations that may have been + made for the write. + The folio will be marked clean and an ``-EIO`` recorded in the + pagecache. + Filesystems can use this callback to `remove + <https://lore.kernel.org/all/[email protected]/>`_ + delalloc reservations to avoid having delalloc reservations for + clean pagecache. + This function is optional. + +Pagecache Writeback Completion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To handle the bookkeeping that must happen after disk I/O for writeback +completes, iomap creates chains of ``struct iomap_ioend`` objects that +wrap the ``bio`` that is used to write pagecache data to disk. +By default, iomap finishes writeback ioends by clearing the writeback +bit on the folios attached to the ``ioend``. 
+If the write failed, it will also set the error bits on the folios and
+the address space.
+This can happen in interrupt or process context, depending on the
+storage device.
+
+Filesystems that need to update internal bookkeeping (e.g. unwritten
+extent conversions) should provide a ``->prepare_ioend`` function to
+set ``struct iomap_ioend::bio::bi_end_io`` to its own function.
+This function should call ``iomap_finish_ioends`` after finishing its
+own work (e.g. unwritten extent conversion).
+
+Some filesystems may wish to `amortize the cost of running metadata
+transactions
+<https://lore.kernel.org/all/[email protected]/>`_
+for post-writeback updates by batching them.
+They may also require transactions to run from process context, which
+implies punting batches to a workqueue.
+iomap ioends contain a ``list_head`` to enable batching.
+
+Given a batch of ioends, iomap has a few helpers to assist with
+amortization:
+
+ * ``iomap_sort_ioends``: Sort all the ioends in the list by file
+   offset.
+
+ * ``iomap_ioend_try_merge``: Given an ioend that is not in any list and
+   a separate list of sorted ioends, merge as many of the ioends from
+   the head of the list as possible into the given ioend.
+   ioends can only be merged if the file range and storage addresses are
+   contiguous; the unwritten and shared status are the same; and the
+   write I/O outcome is the same.
+   The merged ioends become their own list.
+
+ * ``iomap_finish_ioends``: Finish an ioend that possibly has other
+   ioends linked to it.
+
+Direct I/O
+==========
+
+In Linux, direct I/O is defined as file I/O that is issued directly to
+storage, bypassing the pagecache.
+The ``iomap_dio_rw`` function implements O_DIRECT (direct I/O) reads and
+writes for files.
+
+.. code-block:: c
+
+  ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+                       const struct iomap_ops *ops,
+                       const struct iomap_dio_ops *dops,
+                       unsigned int dio_flags, void *private,
+                       size_t done_before);
+
+The filesystem can provide the ``dops`` parameter if it needs to perform
+extra work before or after the I/O is issued to storage.
+The ``done_before`` parameter tells iomap how much of the request has
+already been transferred.
+It is used to continue a request asynchronously when `part of the
+request
+<https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c03098d4b9ad76bca2966a8769dcfe59f7f85103>`_
+has already been completed synchronously.
+
+The ``done_before`` parameter should be set if writes for the ``iocb``
+have been initiated prior to the call.
+The direction of the I/O is determined from the ``iocb`` passed in.
+
+The ``dio_flags`` argument can be set to any combination of the
+following values:
+
+ * ``IOMAP_DIO_FORCE_WAIT``: Wait for the I/O to complete even if the
+   kiocb is not synchronous.
+
+ * ``IOMAP_DIO_OVERWRITE_ONLY``: Perform a pure overwrite for this range
+   or fail with ``-EAGAIN``.
+   This can be used by filesystems with complex unaligned I/O
+   write paths to provide an optimised fast path for unaligned writes.
+   If a pure overwrite can be performed, then serialisation against
+   other I/Os to the same filesystem block(s) is unnecessary as there is
+   no risk of stale data exposure or data loss.
+   If a pure overwrite cannot be performed, then the filesystem can
+   perform the serialisation steps needed to provide exclusive access
+   to the unaligned I/O range so that it can perform allocation and
+   sub-block zeroing safely.
+ Filesystems can use this flag to try to reduce locking contention, + but a lot of `detailed checking + <https://lore.kernel.org/linux-ext4/[email protected]/>`_ + is required to do it `correctly + <https://lore.kernel.org/linux-ext4/[email protected]/>`_. + + * ``IOMAP_DIO_PARTIAL``: If a page fault occurs, return whatever + progress has already been made. + The caller may deal with the page fault and retry the operation. + If the caller decides to retry the operation, it should pass the + accumulated return values of all previous calls as the + ``done_before`` parameter to the next call. + +These ``struct kiocb`` flags are significant for direct I/O with iomap: + + * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``. + + * ``IOCB_SYNC``: Ensure that the device has persisted data to disk + before completing the call. + In the case of pure overwrites, the I/O may be issued with FUA + enabled. + + * ``IOCB_HIPRI``: Poll for I/O completion instead of waiting for an + interrupt. + Only meaningful for asynchronous I/O, and only if the entire I/O can + be issued as a single ``struct bio``. + + * ``IOCB_DIO_CALLER_COMP``: Try to run I/O completion from the caller's + process context. + See ``linux/fs.h`` for more details. + +Filesystems should call ``iomap_dio_rw`` from ``->read_iter`` and +``->write_iter``, and set ``FMODE_CAN_ODIRECT`` in the ``->open`` +function for the file. +They should not set ``->direct_IO``, which is deprecated. + +If a filesystem wishes to perform its own work before direct I/O +completion, it should call ``__iomap_dio_rw``. +If its return value is not an error pointer or a NULL pointer, the +filesystem should pass the return value to ``iomap_dio_complete`` after +finishing its internal work. + +Return Values +------------- + +``iomap_dio_rw`` can return one of the following: + + * A non-negative number of bytes transferred. + + * ``-ENOTBLK``: Fall back to buffered I/O. + iomap itself will return this value if it cannot invalidate the page + cache before issuing the I/O to storage. + The ``->iomap_begin`` or ``->iomap_end`` functions may also return + this value. + + * ``-EIOCBQUEUED``: The asynchronous direct I/O request has been + queued and will be completed separately. + + * Any of the other negative error codes. + +Direct Reads +------------ + +A direct I/O read initiates a read I/O from the storage device to the +caller's buffer. +Dirty parts of the pagecache are flushed to storage before initiating +the read io. +The ``flags`` value for ``->iomap_begin`` will be ``IOMAP_DIRECT`` with +any combination of the following enhancements: + + * ``IOMAP_NOWAIT``, as defined previously. + +Callers commonly hold ``i_rwsem`` in shared mode before calling this +function. + +Direct Writes +------------- + +A direct I/O write initiates a write I/O to the storage device from the +caller's buffer. +Dirty parts of the pagecache are flushed to storage before initiating +the write io. +The pagecache is invalidated both before and after the write io. +The ``flags`` value for ``->iomap_begin`` will be ``IOMAP_DIRECT | +IOMAP_WRITE`` with any combination of the following enhancements: + + * ``IOMAP_NOWAIT``, as defined previously. + + * ``IOMAP_OVERWRITE_ONLY``: Allocating blocks and zeroing partial + blocks is not allowed. + The entire file range must map to a single written or unwritten + extent. + The file I/O range must be aligned to the filesystem block size + if the mapping is unwritten and the filesystem cannot handle zeroing + the unaligned regions without exposing stale contents. 
+ +Callers commonly hold ``i_rwsem`` in shared or exclusive mode before +calling this function. + +``struct iomap_dio_ops:`` +------------------------- +.. code-block:: c + + struct iomap_dio_ops { + void (*submit_io)(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset); + int (*end_io)(struct kiocb *iocb, ssize_t size, int error, + unsigned flags); + struct bio_set *bio_set; + }; + +The fields of this structure are as follows: + + - ``submit_io``: iomap calls this function when it has constructed a + ``struct bio`` object for the I/O requested, and wishes to submit it + to the block device. + If no function is provided, ``submit_bio`` will be called directly. + Filesystems that would like to perform additional work before (e.g. + data replication for btrfs) should implement this function. + + - ``end_io``: This is called after the ``struct bio`` completes. + This function should perform post-write conversions of unwritten + extent mappings, handle write failures, etc. + The ``flags`` argument may be set to a combination of the following: + + * ``IOMAP_DIO_UNWRITTEN``: The mapping was unwritten, so the ioend + should mark the extent as written. + + * ``IOMAP_DIO_COW``: Writing to the space in the mapping required a + copy on write operation, so the ioend should switch mappings. + + - ``bio_set``: This allows the filesystem to provide a custom bio_set + for allocating direct I/O bios. + This enables filesystems to `stash additional per-bio information + <https://lore.kernel.org/all/[email protected]/>`_ + for private use. + If this field is NULL, generic ``struct bio`` objects will be used. + +Filesystems that want to perform extra work after an I/O completion +should set a custom ``->bi_end_io`` function via ``->submit_io``. +Afterwards, the custom endio function must call +``iomap_dio_bio_end_io`` to finish the direct I/O. + +DAX I/O +======= + +Some storage devices can be directly mapped as memory. +These devices support a new access mode known as "fsdax" that allows +loads and stores through the CPU and memory controller. + +fsdax Reads +----------- + +A fsdax read performs a memcpy from storage device to the caller's +buffer. +The ``flags`` value for ``->iomap_begin`` will be ``IOMAP_DAX`` with any +combination of the following enhancements: + + * ``IOMAP_NOWAIT``, as defined previously. + +Callers commonly hold ``i_rwsem`` in shared mode before calling this +function. + +fsdax Writes +------------ + +A fsdax write initiates a memcpy to the storage device from the caller's +buffer. +The ``flags`` value for ``->iomap_begin`` will be ``IOMAP_DAX | +IOMAP_WRITE`` with any combination of the following enhancements: + + * ``IOMAP_NOWAIT``, as defined previously. + + * ``IOMAP_OVERWRITE_ONLY``: The caller requires a pure overwrite to be + performed from this mapping. + This requires the filesystem extent mapping to already exist as an + ``IOMAP_MAPPED`` type and span the entire range of the write I/O + request. + If the filesystem cannot map this request in a way that allows the + iomap infrastructure to perform a pure overwrite, it must fail the + mapping operation with ``-EAGAIN``. + +Callers commonly hold ``i_rwsem`` in exclusive mode before calling this +function. + +fsdax mmap Faults +~~~~~~~~~~~~~~~~~ + +The ``dax_iomap_fault`` function handles read and write faults to fsdax +storage. +For a read fault, ``IOMAP_DAX | IOMAP_FAULT`` will be passed as the +``flags`` argument to ``->iomap_begin``. 
+For a write fault, ``IOMAP_DAX | IOMAP_FAULT | IOMAP_WRITE`` will be +passed as the ``flags`` argument to ``->iomap_begin``. + +Callers commonly hold the same locks as they do to call their iomap +pagecache counterparts. + +fsdax Truncation, fallocate, and Unsharing +------------------------------------------ + +For fsdax files, the following functions are provided to replace their +iomap pagecache I/O counterparts. +The ``flags`` argument to ``->iomap_begin`` are the same as the +pagecache counterparts, with ``IOMAP_DAX`` added. + + * ``dax_file_unshare`` + * ``dax_zero_range`` + * ``dax_truncate_page`` + +Callers commonly hold the same locks as they do to call their iomap +pagecache counterparts. + +fsdax Deduplication +------------------- + +Filesystems implementing the ``FIDEDUPERANGE`` ioctl must call the +``dax_remap_file_range_prep`` function with their own iomap read ops. + +Seeking Files +============= + +iomap implements the two iterating whence modes of the ``llseek`` system +call. + +SEEK_DATA +--------- + +The ``iomap_seek_data`` function implements the SEEK_DATA "whence" value +for llseek. +``IOMAP_REPORT`` will be passed as the ``flags`` argument to +``->iomap_begin``. + +For unwritten mappings, the pagecache will be searched. +Regions of the pagecache with a folio mapped and uptodate fsblocks +within those folios will be reported as data areas. + +Callers commonly hold ``i_rwsem`` in shared mode before calling this +function. + +SEEK_HOLE +--------- + +The ``iomap_seek_hole`` function implements the SEEK_HOLE "whence" value +for llseek. +``IOMAP_REPORT`` will be passed as the ``flags`` argument to +``->iomap_begin``. + +For unwritten mappings, the pagecache will be searched. +Regions of the pagecache with no folio mapped, or a !uptodate fsblock +within a folio will be reported as sparse hole areas. + +Callers commonly hold ``i_rwsem`` in shared mode before calling this +function. + +Swap File Activation +==================== + +The ``iomap_swapfile_activate`` function finds all the base-page aligned +regions in a file and sets them up as swap space. +The file will be ``fsync()``'d before activation. +``IOMAP_REPORT`` will be passed as the ``flags`` argument to +``->iomap_begin``. +All mappings must be mapped or unwritten; cannot be dirty or shared, and +cannot span multiple block devices. +Callers must hold ``i_rwsem`` in exclusive mode; this is already +provided by ``swapon``. + +File Space Mapping Reporting +============================ + +iomap implements two of the file space mapping system calls. + +FS_IOC_FIEMAP +------------- + +The ``iomap_fiemap`` function exports file extent mappings to userspace +in the format specified by the ``FS_IOC_FIEMAP`` ioctl. +``IOMAP_REPORT`` will be passed as the ``flags`` argument to +``->iomap_begin``. +Callers commonly hold ``i_rwsem`` in shared mode before calling this +function. + +FIBMAP (deprecated) +------------------- + +``iomap_bmap`` implements FIBMAP. +The calling conventions are the same as for FIEMAP. +This function is only provided to maintain compatibility for filesystems +that implemented FIBMAP prior to conversion. +This ioctl is deprecated; do **not** add a FIBMAP implementation to +filesystems that do not have it. +Callers should probably hold ``i_rwsem`` in shared mode before calling +this function, but this is unclear. 
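+
+To make the reporting interfaces above concrete, here is a sketch of a
+minimal read-only ``->iomap_begin`` wired up to ``iomap_fiemap``.
+All of the ``myfs_*`` names, including the ``myfs_get_extent`` helper
+and its extent structure, are assumptions invented for this example;
+only the iomap types and ``iomap_fiemap`` itself are real interfaces.
+
+.. code-block:: c
+
+   #include <linux/fiemap.h>
+   #include <linux/iomap.h>
+
+   struct myfs_extent {                 /* assumed filesystem-internal type */
+           loff_t  file_offset;
+           loff_t  len;
+           u64     disk_offset;         /* byte address on the block device */
+           bool    is_hole;
+           bool    is_unwritten;
+   };
+
+   /* Assumed helper: look up the extent (or hole) covering pos. */
+   int myfs_get_extent(struct inode *inode, loff_t pos, loff_t length,
+                       struct myfs_extent *ext);
+
+   static int myfs_read_iomap_begin(struct inode *inode, loff_t pos,
+                   loff_t length, unsigned int flags, struct iomap *iomap,
+                   struct iomap *srcmap)
+   {
+           struct myfs_extent ext;
+           int error;
+
+           error = myfs_get_extent(inode, pos, length, &ext);
+           if (error)
+                   return error;
+
+           iomap->bdev = inode->i_sb->s_bdev;
+           iomap->offset = ext.file_offset;
+           iomap->length = ext.len;
+           if (ext.is_hole) {
+                   iomap->type = IOMAP_HOLE;
+                   iomap->addr = IOMAP_NULL_ADDR;
+           } else {
+                   iomap->type = ext.is_unwritten ? IOMAP_UNWRITTEN :
+                                                    IOMAP_MAPPED;
+                   iomap->addr = ext.disk_offset;  /* bytes, not sectors */
+           }
+           return 0;
+   }
+
+   static const struct iomap_ops myfs_read_iomap_ops = {
+           .iomap_begin    = myfs_read_iomap_begin,
+   };
+
+   /* ->fiemap implementation built on the read-only ops above. */
+   static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
+                   u64 start, u64 len)
+   {
+           return iomap_fiemap(inode, fi, start, len, &myfs_read_iomap_ops);
+   }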
diff --git a/Documentation/filesystems/iomap/porting.rst b/Documentation/filesystems/iomap/porting.rst
new file mode 100644
index 000000000000..3d49a32c0fff
--- /dev/null
+++ b/Documentation/filesystems/iomap/porting.rst
@@ -0,0 +1,120 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. _iomap_porting:
+
+..
+  Dumb style notes to maintain the author's sanity:
+  Please try to start sentences on separate lines so that
+  sentence changes don't bleed colors in diff.
+  Heading decorations are documented in sphinx.rst.
+
+=======================
+Porting Your Filesystem
+=======================
+
+.. contents:: Table of Contents
+   :local:
+
+Why Convert?
+============
+
+There are several reasons to convert a filesystem to iomap:
+
+ 1. The classic Linux I/O path is not terribly efficient.
+    Pagecache operations lock a single base page at a time and then call
+    into the filesystem to return a mapping for only that page.
+    Direct I/O operations build I/O requests a single file block at a
+    time.
+    This worked well enough for direct/indirect-mapped filesystems such
+    as ext2, but is very inefficient for extent-based filesystems such
+    as XFS.
+
+ 2. Large folios are only supported via iomap; there are no plans to
+    convert the old buffer_head path to use them.
+
+ 3. Direct access to storage on memory-like devices (fsdax) is only
+    supported via iomap.
+
+ 4. Lower maintenance overhead for individual filesystem maintainers.
+    iomap handles common pagecache-related operations itself, such as
+    allocating, instantiating, locking, and unlocking of folios.
+    No ->write_begin(), ->write_end() or direct_IO
+    address_space_operations are required to be implemented by
+    filesystems using iomap.
+
+How Do I Convert a Filesystem?
+==============================
+
+First, add ``#include <linux/iomap.h>`` to your source code and add
+``select FS_IOMAP`` to your filesystem's Kconfig option.
+Build the kernel, then run fstests with the ``-g all`` option across a
+wide variety of your filesystem's supported configurations to build a
+baseline of which tests pass and which ones fail.
+
+The recommended approach is first to implement ``->iomap_begin`` (and
+``->iomap_end`` if necessary) to allow iomap to obtain a read-only
+mapping of a file range.
+In most cases, this is a relatively trivial conversion of the existing
+``get_block()`` function for read-only mappings.
+``FS_IOC_FIEMAP`` is a good first target because it is trivial to
+implement support for it and then to determine that the extent map
+iteration is correct from userspace.
+If FIEMAP is returning the correct information, it's a good sign that
+other read-only mapping operations will do the right thing.
+
+Next, modify the filesystem's ``get_block(create = false)``
+implementation to use the new ``->iomap_begin`` implementation to map
+file space for selected read operations.
+Hide behind a debugging knob the ability to switch on the iomap mapping
+functions for selected call paths.
+It is necessary to write some code to fill out the bufferhead-based
+mapping information from the ``iomap`` structure (see the sketch
+below), but the new functions can be tested without needing to
+implement any iomap APIs.
+
+Once the read-only functions are working like this, convert each
+high-level file operation one by one to use iomap native APIs instead of
+going through ``get_block()``.
+Done one at a time, regressions should be self-evident.
+You *do* have a regression test baseline for fstests, right?
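+
+As referenced above, here is a sketch of the kind of ``get_block()``
+bridge that fills bufferhead mapping state from an ``iomap``.
+The ``myfs_*`` names are assumptions for illustration, and the read-only
+mapping function is whatever ``->iomap_begin`` the filesystem grew in
+the previous step; this is not a drop-in implementation.
+
+.. code-block:: c
+
+   #include <linux/buffer_head.h>
+   #include <linux/iomap.h>
+
+   /* Assumed: the filesystem's new read-only mapping function. */
+   int myfs_read_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+                             unsigned int flags, struct iomap *iomap,
+                             struct iomap *srcmap);
+
+   /* Legacy get_block() bridged onto the new iomap lookup (reads only). */
+   static int myfs_get_block(struct inode *inode, sector_t iblock,
+                   struct buffer_head *bh, int create)
+   {
+           loff_t pos = (loff_t)iblock << inode->i_blkbits;
+           struct iomap iomap = { };
+           struct iomap srcmap = { };
+           int error;
+
+           if (WARN_ON_ONCE(create))       /* read-only bridge only */
+                   return -EOPNOTSUPP;
+
+           error = myfs_read_iomap_begin(inode, pos, bh->b_size, 0,
+                                         &iomap, &srcmap);
+           if (error)
+                   return error;
+
+           if (iomap.type == IOMAP_MAPPED) {
+                   /* Convert the byte-based mapping into a block-based bh. */
+                   map_bh(bh, inode->i_sb,
+                          (iomap.addr + pos - iomap.offset) >> inode->i_blkbits);
+                   bh->b_size = min_t(loff_t, bh->b_size,
+                                      iomap.offset + iomap.length - pos);
+           }
+           /* Holes and unwritten extents stay unmapped for the legacy path. */
+           return 0;
+   }
+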
+It is suggested to convert swap file activation, ``SEEK_DATA``, and
+``SEEK_HOLE`` before tackling the I/O paths.
+A likely complexity at this point will be converting the buffered read
+I/O path because of bufferheads.
+The buffered read I/O path doesn't need to be converted yet, though the
+direct I/O read path should be converted in this phase.
+
+At this point, you should look over your ``->iomap_begin`` function.
+If it switches between large blocks of code based on dispatching of the
+``flags`` argument, you should consider breaking it up into
+per-operation iomap ops with smaller, more cohesive functions.
+XFS is a good example of this.
+
+The next thing to do is implement ``get_block(create == true)``
+functionality in the ``->iomap_begin``/``->iomap_end`` methods.
+It is strongly recommended to create separate mapping functions and
+iomap ops for write operations.
+Then convert the direct I/O write path to iomap, and start running fsx
+with DIO enabled in earnest on the filesystem.
+This will flush out lots of data integrity corner case bugs that the new
+write mapping implementation introduces.
+
+Now, convert any remaining file operations to call the iomap functions.
+This will get the entire filesystem using the new mapping functions, and
+they should largely be debugged and working correctly after this step.
+
+Most likely at this point, the buffered read and write paths will still
+need to be converted.
+The mapping functions should all work correctly, so all that needs to be
+done is rewriting all the code that interfaces with bufferheads to
+interface with iomap and folios.
+It is much easier first to get regular file I/O (without any fancy
+features like fscrypt, fsverity, compression, or data=journaling)
+converted to use iomap.
+Some of those fancy features (fscrypt and compression) aren't
+implemented yet in iomap.
+For unjournalled filesystems that use the pagecache for symbolic links
+and directories, you might also try converting their handling to iomap.
+
+The rest is left as an exercise for the reader, as it will be different
+for every filesystem.
+If you encounter problems, email the people and lists in
+``get_maintainers.pl`` for help.
diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst
index 9aaf6ef75eb5..317934c9e8fc 100644
--- a/Documentation/filesystems/mount_api.rst
+++ b/Documentation/filesystems/mount_api.rst
@@ -645,6 +645,8 @@ The members are as follows:
 	fs_param_is_blockdev	Blockdev path		* Needs lookup
 	fs_param_is_path	Path			* Needs lookup
 	fs_param_is_fd		File descriptor		result->int_32
+	fs_param_is_uid		User ID (u32)           result->uid
+	fs_param_is_gid		Group ID (u32)          result->gid
 	=======================	=======================	=====================
 
     Note that if the value is of fs_param_is_bool type, fs_parse() will try
@@ -678,6 +680,8 @@ The members are as follows:
 	fsparam_bdev()		fs_param_is_blockdev
 	fsparam_path()		fs_param_is_path
 	fsparam_fd()		fs_param_is_fd
+	fsparam_uid()		fs_param_is_uid
+	fsparam_gid()		fs_param_is_gid
 	=======================	===============================================
 
     all of which take two arguments, name string and option number - for
@@ -784,8 +788,9 @@ process the parameters it is given.
     option number (which it returns).
 
     If successful, and if the parameter type indicates the result is a
-    boolean, integer or enum type, the value is converted by this function and
-    the result stored in result->{boolean,int_32,uint_32,uint_64}.
+ boolean, integer, enum, uid, or gid type, the value is converted by this + function and the result stored in + result->{boolean,int_32,uint_32,uint_64,uid,gid}. If a match isn't initially made, the key is prefixed with "no" and no value is present then an attempt will be made to look up the key with the diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst index 9232cd7da301..5d0b68f752c0 100644 --- a/Documentation/networking/devlink/devlink-region.rst +++ b/Documentation/networking/devlink/devlink-region.rst @@ -49,7 +49,7 @@ example usage $ devlink region show [ DEV/REGION ] $ devlink region del DEV/REGION snapshot SNAPSHOT_ID $ devlink region dump DEV/REGION [ snapshot SNAPSHOT_ID ] - $ devlink region read DEV/REGION [ snapshot SNAPSHOT_ID ] address ADDRESS length length + $ devlink region read DEV/REGION [ snapshot SNAPSHOT_ID ] address ADDRESS length LENGTH # Show all of the exposed regions with region sizes: $ devlink region show diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index a141e8e65c5d..9a97030c6c8d 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -186,6 +186,7 @@ Code Seq# Include File Comments 'Q' all linux/soundcard.h 'R' 00-1F linux/random.h conflict! 'R' 01 linux/rfkill.h conflict! +'R' 20-2F linux/trace_mmap.h 'R' C0-DF net/bluetooth/rfcomm.h 'R' E0 uapi/linux/fsl_mc.h 'S' all linux/cdrom.h conflict! diff --git a/MAINTAINERS b/MAINTAINERS index 3c4fdf74a3f9..9cae268966e7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -846,12 +846,6 @@ ALPS PS/2 TOUCHPAD DRIVER R: Pali Rohár <[email protected]> F: drivers/input/mouse/alps.* -ALTERA I2C CONTROLLER DRIVER -M: Thor Thayer <[email protected]> -S: Maintained -F: Documentation/devicetree/bindings/i2c/i2c-altera.txt -F: drivers/i2c/busses/i2c-altera.c - ALTERA MAILBOX DRIVER M: Mun Yew Tham <[email protected]> S: Maintained @@ -871,21 +865,6 @@ L: [email protected] S: Maintained F: drivers/gpio/gpio-altera.c -ALTERA SYSTEM MANAGER DRIVER -M: Thor Thayer <[email protected]> -S: Maintained -F: drivers/mfd/altera-sysmgr.c -F: include/linux/mfd/altera-sysmgr.h - -ALTERA SYSTEM RESOURCE DRIVER FOR ARRIA10 DEVKIT -M: Thor Thayer <[email protected]> -S: Maintained -F: drivers/gpio/gpio-altera-a10sr.c -F: drivers/mfd/altera-a10sr.c -F: drivers/reset/reset-a10sr.c -F: include/dt-bindings/reset/altr,rst-mgr-a10sr.h -F: include/linux/mfd/altera-a10sr.h - ALTERA TRIPLE SPEED ETHERNET DRIVER M: Joyce Ooi <[email protected]> @@ -2892,7 +2871,7 @@ F: drivers/edac/altera_edac.[ch] ARM/SPREADTRUM SoC SUPPORT M: Orson Zhai <[email protected]> M: Baolin Wang <[email protected]> -M: Chunyan Zhang <[email protected]> +R: Chunyan Zhang <[email protected]> S: Maintained F: arch/arm64/boot/dts/sprd N: sprd @@ -3601,10 +3580,9 @@ W: https://wireless.wiki.kernel.org/en/users/Drivers/b43 F: drivers/net/wireless/broadcom/b43/ B43LEGACY WIRELESS DRIVER -M: Larry Finger <[email protected]> -S: Maintained +S: Orphan W: https://wireless.wiki.kernel.org/en/users/Drivers/b43 F: drivers/net/wireless/broadcom/b43legacy/ @@ -3781,6 +3759,20 @@ F: include/linux/blk* F: kernel/trace/blktrace.c F: lib/sbitmap.c +BLOCK LAYER DEVICE DRIVER API [RUST] +M: Andreas Hindborg <[email protected]> +R: Boqun Feng <[email protected]> +S: Supported +W: https://rust-for-linux.com +B: https://github.com/Rust-for-Linux/linux/issues +C: 
https://rust-for-linux.zulipchat.com/#narrow/stream/Block +T: git https://github.com/Rust-for-Linux/linux.git rust-block-next +F: drivers/block/rnull.rs +F: rust/kernel/block.rs +F: rust/kernel/block/ + BLOCK2MTD DRIVER M: Joern Engel <[email protected]> @@ -5536,6 +5528,7 @@ CONTROL GROUP (CGROUP) M: Tejun Heo <[email protected]> M: Zefan Li <[email protected]> M: Johannes Weiner <[email protected]> +M: Michal Koutný <[email protected]> S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git @@ -6239,9 +6232,8 @@ S: Maintained F: drivers/usb/dwc3/ DESIGNWARE XDATA IP DRIVER -M: Gustavo Pimentel <[email protected]> -S: Maintained +S: Orphan F: Documentation/misc-devices/dw-xdata-pcie.rst F: drivers/misc/dw-xdata-pcie.c @@ -8483,6 +8475,7 @@ R: Darrick J. Wong <[email protected]> S: Supported +F: Documentation/filesystems/iomap/* F: fs/iomap/ F: include/linux/iomap.h @@ -8834,6 +8827,7 @@ F: drivers/spi/spi-fsl-qspi.c FREESCALE QUICC ENGINE LIBRARY M: Qiang Zhao <[email protected]> +M: Christophe Leroy <[email protected]> S: Maintained F: drivers/soc/fsl/qe/ @@ -8883,9 +8877,10 @@ S: Maintained F: drivers/tty/serial/ucc_uart.c FREESCALE SOC DRIVERS +M: Christophe Leroy <[email protected]> L: [email protected] (moderated for non-subscribers) -S: Orphan +S: Maintained F: Documentation/devicetree/bindings/misc/fsl,dpaa2-console.yaml F: Documentation/devicetree/bindings/soc/fsl/ F: drivers/soc/fsl/ @@ -11571,7 +11566,7 @@ F: include/linux/iosys-map.h IO_URING M: Jens Axboe <[email protected]> -R: Pavel Begunkov <[email protected]> +M: Pavel Begunkov <[email protected]> S: Maintained T: git git://git.kernel.dk/linux-block @@ -14474,7 +14469,7 @@ MEMORY MAPPING M: Andrew Morton <[email protected]> R: Liam R. Howlett <[email protected]> R: Vlastimil Babka <[email protected]> -R: Lorenzo Stoakes <[email protected]> +R: Lorenzo Stoakes <[email protected]> S: Maintained W: http://www.linux-mm.org @@ -16447,7 +16442,7 @@ F: arch/arm/boot/dts/ti/omap/am335x-nano.dts OMAP1 SUPPORT M: Aaro Koskinen <[email protected]> M: Janusz Krzysztofik <[email protected]> -M: Tony Lindgren <[email protected]> +R: Tony Lindgren <[email protected]> S: Maintained Q: http://patchwork.kernel.org/project/linux-omap/list/ @@ -16459,10 +16454,13 @@ F: include/linux/platform_data/ams-delta-fiq.h F: include/linux/platform_data/i2c-omap.h OMAP2+ SUPPORT +M: Aaro Koskinen <[email protected]> +M: Andreas Kemnade <[email protected]> +M: Kevin Hilman <[email protected]> +M: Roger Quadros <[email protected]> M: Tony Lindgren <[email protected]> S: Maintained -W: http://www.muru.com/linux/omap/ W: http://linux.omap.com/ Q: http://patchwork.kernel.org/project/linux-omap/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git @@ -18374,7 +18372,7 @@ M: Jeff Johnson <[email protected]> S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/ath12k -T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ath/ath.git F: drivers/net/wireless/ath/ath12k/ N: ath12k @@ -18384,7 +18382,7 @@ M: Jeff Johnson <[email protected]> S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/ath10k -T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ath/ath.git F: drivers/net/wireless/ath/ath10k/ N: ath10k @@ -18395,7 +18393,7 @@ L: [email protected] S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/ath11k B: 
https://wireless.wiki.kernel.org/en/users/Drivers/ath11k/bugreport -T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ath/ath.git F: drivers/net/wireless/ath/ath11k/ N: ath11k @@ -18404,7 +18402,7 @@ M: Toke Høiland-Jørgensen <[email protected]> S: Maintained W: https://wireless.wiki.kernel.org/en/users/Drivers/ath9k -T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ath/ath.git F: Documentation/devicetree/bindings/net/wireless/qca,ath9k.yaml F: drivers/net/wireless/ath/ath9k/ @@ -18866,6 +18864,7 @@ M: Neeraj Upadhyay <[email protected]> (kernel/rcu/tasks.h) M: Joel Fernandes <[email protected]> M: Josh Triplett <[email protected]> M: Boqun Feng <[email protected]> +M: Uladzislau Rezki <[email protected]> R: Steven Rostedt <[email protected]> R: Mathieu Desnoyers <[email protected]> R: Lai Jiangshan <[email protected]> @@ -19315,7 +19314,7 @@ F: drivers/perf/riscv_pmu_legacy.c F: drivers/perf/riscv_pmu_sbi.c RISC-V THEAD SoC SUPPORT -M: Jisheng Zhang <[email protected]> +M: Drew Fustini <[email protected]> M: Guo Ren <[email protected]> M: Fu Wei <[email protected]> @@ -19509,7 +19508,6 @@ F: drivers/net/wireless/realtek/rtl818x/rtl8180/ RTL8187 WIRELESS DRIVER M: Hin-Tak Leung <[email protected]> -M: Larry Finger <[email protected]> S: Maintained T: git https://github.com/pkshih/rtw.git @@ -21247,7 +21245,6 @@ W: http://wiki.laptop.org/go/DCON F: drivers/staging/olpc_dcon/ STAGING - REALTEK RTL8712U DRIVERS -M: Larry Finger <[email protected]> M: Florian Schilhabel <[email protected]>. S: Odd Fixes F: drivers/staging/rtl8712/ @@ -23863,8 +23860,8 @@ S: Maintained F: drivers/vhost/scsi.c VIRTIO I2C DRIVER -M: Conghui Chen <[email protected]> M: Viresh Kumar <[email protected]> +R: "Chen, Jian Jun" <[email protected]> S: Maintained @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = NAME = Baby Opossum Posse # *DOCUMENTATION* diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ee5115252aac..a867a7d967aa 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -34,6 +34,7 @@ config ARM select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 + select ARCH_NEED_CMPXCHG_1_EMU if CPU_V6 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_CFI_CLANG select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE diff --git a/arch/arm/boot/dts/rockchip/rk3066a.dtsi b/arch/arm/boot/dts/rockchip/rk3066a.dtsi index 30139f21de64..15cbd94d7ec0 100644 --- a/arch/arm/boot/dts/rockchip/rk3066a.dtsi +++ b/arch/arm/boot/dts/rockchip/rk3066a.dtsi @@ -128,6 +128,7 @@ pinctrl-0 = <&hdmii2c_xfer>, <&hdmi_hpd>; power-domains = <&power RK3066_PD_VIO>; rockchip,grf = <&grf>; + #sound-dai-cells = <0>; status = "disabled"; ports { diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h index 44667bdb4707..9beb64d30586 100644 --- a/arch/arm/include/asm/cmpxchg.h +++ b/arch/arm/include/asm/cmpxchg.h @@ -5,6 +5,7 @@ #include <linux/irqflags.h> #include <linux/prefetch.h> #include <asm/barrier.h> +#include <linux/cmpxchg-emu.h> #if defined(CONFIG_CPU_SA1100) || defined(CONFIG_CPU_SA110) /* @@ -162,7 +163,11 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, prefetchw((const void *)ptr); switch (size) { -#ifndef CONFIG_CPU_V6 /* min ARCH >= ARMv6K */ +#ifdef CONFIG_CPU_V6 /* ARCH == ARMv6 */ + case 1: + oldval = 
cmpxchg_emu_u8((volatile u8 *)ptr, old, new); + break; +#else /* min ARCH > ARMv6 */ case 1: do { asm volatile("@ __cmpxchg1\n" diff --git a/arch/arm/mach-davinci/pm.c b/arch/arm/mach-davinci/pm.c index 8aa39db095d7..2c5155bd376b 100644 --- a/arch/arm/mach-davinci/pm.c +++ b/arch/arm/mach-davinci/pm.c @@ -61,7 +61,7 @@ static void davinci_pm_suspend(void) /* Configure sleep count in deep sleep register */ val = __raw_readl(pm_config.deepsleep_reg); - val &= ~DEEPSLEEP_SLEEPCOUNT_MASK, + val &= ~DEEPSLEEP_SLEEPCOUNT_MASK; val |= pm_config.sleepcount; __raw_writel(val, pm_config.deepsleep_reg); diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5d91259ee7b5..b6e8920364de 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1649,6 +1649,7 @@ config RODATA_FULL_DEFAULT_ENABLED config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + depends on !KCSAN help Enabling this option prevents the kernel from accessing user-space memory directly by pointing TTBR0_EL1 to a reserved diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h64-remix-mini-pc.dts b/arch/arm64/boot/dts/allwinner/sun50i-h64-remix-mini-pc.dts index c204dd43c726..ce90327e1b2e 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h64-remix-mini-pc.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h64-remix-mini-pc.dts @@ -191,7 +191,7 @@ compatible = "x-powers,axp803"; reg = <0x3a3>; interrupt-parent = <&r_intc>; - interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_LOW>; + interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_LOW>; x-powers,drive-vbus-en; vin1-supply = <®_vcc5v>; diff --git a/arch/arm64/boot/dts/qcom/qdu1000.dtsi b/arch/arm64/boot/dts/qcom/qdu1000.dtsi index f2a5e2e40461..f90f03fa6a24 100644 --- a/arch/arm64/boot/dts/qcom/qdu1000.dtsi +++ b/arch/arm64/boot/dts/qcom/qdu1000.dtsi @@ -1459,9 +1459,23 @@ system-cache-controller@19200000 { compatible = "qcom,qdu1000-llcc"; - reg = <0 0x19200000 0 0xd80000>, + reg = <0 0x19200000 0 0x80000>, + <0 0x19300000 0 0x80000>, + <0 0x19600000 0 0x80000>, + <0 0x19700000 0 0x80000>, + <0 0x19a00000 0 0x80000>, + <0 0x19b00000 0 0x80000>, + <0 0x19e00000 0 0x80000>, + <0 0x19f00000 0 0x80000>, <0 0x1a200000 0 0x80000>; reg-names = "llcc0_base", + "llcc1_base", + "llcc2_base", + "llcc3_base", + "llcc4_base", + "llcc5_base", + "llcc6_base", + "llcc7_base", "llcc_broadcast_base"; interrupts = <GIC_SPI 266 IRQ_TYPE_LEVEL_HIGH>; }; diff --git a/arch/arm64/boot/dts/qcom/sa8775p.dtsi b/arch/arm64/boot/dts/qcom/sa8775p.dtsi index 31de73594839..1b3dc0ece54d 100644 --- a/arch/arm64/boot/dts/qcom/sa8775p.dtsi +++ b/arch/arm64/boot/dts/qcom/sa8775p.dtsi @@ -3605,7 +3605,7 @@ interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_LOW)>, <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_LOW)>, <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_LOW)>, - <GIC_PPI 12 (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_LOW)>; + <GIC_PPI 10 (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_LOW)>; }; pcie0: pcie@1c00000 { diff --git a/arch/arm64/boot/dts/qcom/sc8180x.dtsi b/arch/arm64/boot/dts/qcom/sc8180x.dtsi index 067712310560..581a70c34fd2 100644 --- a/arch/arm64/boot/dts/qcom/sc8180x.dtsi +++ b/arch/arm64/boot/dts/qcom/sc8180x.dtsi @@ -2647,11 +2647,14 @@ system-cache-controller@9200000 { compatible = "qcom,sc8180x-llcc"; - reg = <0 0x09200000 0 0x50000>, <0 0x09280000 0 0x50000>, - <0 0x09300000 0 0x50000>, <0 0x09380000 0 0x50000>, - <0 0x09600000 0 0x50000>; + reg = <0 0x09200000 0 0x58000>, <0 0x09280000 0 0x58000>, + <0 0x09300000 0 0x58000>, <0 0x09380000 0 0x58000>, + <0 
0x09400000 0 0x58000>, <0 0x09480000 0 0x58000>, + <0 0x09500000 0 0x58000>, <0 0x09580000 0 0x58000>, + <0 0x09600000 0 0x58000>; reg-names = "llcc0_base", "llcc1_base", "llcc2_base", - "llcc3_base", "llcc_broadcast_base"; + "llcc3_base", "llcc4_base", "llcc5_base", + "llcc6_base", "llcc7_base", "llcc_broadcast_base"; interrupts = <GIC_SPI 582 IRQ_TYPE_LEVEL_HIGH>; }; diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts b/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts index 41215567b3ae..372b35fb844f 100644 --- a/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts +++ b/arch/arm64/boot/dts/qcom/sc8280xp-crd.dts @@ -977,8 +977,7 @@ reset-n-pins { pins = "gpio99"; function = "gpio"; - output-high; - drive-strength = <16>; + bias-disable; }; }; diff --git a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts index e937732abede..4bf99b6b6e5f 100644 --- a/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts +++ b/arch/arm64/boot/dts/qcom/sc8280xp-lenovo-thinkpad-x13s.dts @@ -655,15 +655,16 @@ status = "okay"; - /* FIXME: verify */ touchscreen@10 { - compatible = "hid-over-i2c"; + compatible = "elan,ekth5015m", "elan,ekth6915"; reg = <0x10>; - hid-descr-addr = <0x1>; interrupts-extended = <&tlmm 175 IRQ_TYPE_LEVEL_LOW>; - vdd-supply = <&vreg_misc_3p3>; - vddl-supply = <&vreg_s10b>; + reset-gpios = <&tlmm 99 (GPIO_ACTIVE_LOW | GPIO_OPEN_DRAIN)>; + no-reset-on-power-off; + + vcc33-supply = <&vreg_misc_3p3>; + vccio-supply = <&vreg_misc_3p3>; pinctrl-names = "default"; pinctrl-0 = <&ts0_default>; @@ -1496,8 +1497,8 @@ reset-n-pins { pins = "gpio99"; function = "gpio"; - output-high; - drive-strength = <16>; + drive-strength = <2>; + bias-disable; }; }; diff --git a/arch/arm64/boot/dts/qcom/sc8280xp.dtsi b/arch/arm64/boot/dts/qcom/sc8280xp.dtsi index 0549ba1fbeea..59f0a850671a 100644 --- a/arch/arm64/boot/dts/qcom/sc8280xp.dtsi +++ b/arch/arm64/boot/dts/qcom/sc8280xp.dtsi @@ -4623,6 +4623,8 @@ restart@c264000 { compatible = "qcom,pshold"; reg = <0 0x0c264000 0 0x4>; + /* TZ seems to block access */ + status = "reserved"; }; tsens1: thermal-sensor@c265000 { diff --git a/arch/arm64/boot/dts/qcom/sm6115.dtsi b/arch/arm64/boot/dts/qcom/sm6115.dtsi index aca0a87092e4..9ed062150aaf 100644 --- a/arch/arm64/boot/dts/qcom/sm6115.dtsi +++ b/arch/arm64/boot/dts/qcom/sm6115.dtsi @@ -1090,6 +1090,7 @@ power-domains = <&rpmpd SM6115_VDDCX>; operating-points-v2 = <&sdhc1_opp_table>; + iommus = <&apps_smmu 0x00c0 0x0>; interconnects = <&system_noc MASTER_SDCC_1 RPM_ALWAYS_TAG &bimc SLAVE_EBI_CH0 RPM_ALWAYS_TAG>, <&bimc MASTER_AMPSS_M0 RPM_ALWAYS_TAG diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts index c5c2895b37c7..be6b1e7d07ce 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts @@ -49,6 +49,15 @@ stdout-path = "serial0:115200n8"; }; + reserved-memory { + linux,cma { + compatible = "shared-dma-pool"; + size = <0x0 0x8000000>; + reusable; + linux,cma-default; + }; + }; + sound { compatible = "qcom,x1e80100-sndcard"; model = "X1E80100-CRD"; @@ -93,7 +102,7 @@ }; codec { - sound-dai = <&wcd938x 1>, <&swr2 0>, <&lpass_txmacro 0>; + sound-dai = <&wcd938x 1>, <&swr2 1>, <&lpass_txmacro 0>; }; platform { @@ -744,7 +753,7 @@ wcd_tx: codec@0,3 { compatible = "sdw20217010d00"; reg = <0 3>; - qcom,tx-port-mapping = <1 1 2 3>; + qcom,tx-port-mapping = <2 2 3 4>; }; }; diff --git a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts 
index 2061fbe7b75a..8f67c393b871 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts @@ -23,6 +23,15 @@ stdout-path = "serial0:115200n8"; }; + reserved-memory { + linux,cma { + compatible = "shared-dma-pool"; + size = <0x0 0x8000000>; + reusable; + linux,cma-default; + }; + }; + vph_pwr: vph-pwr-regulator { compatible = "regulator-fixed"; diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index 5f90a0b3c016..05e4d491ec18 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -2737,15 +2737,17 @@ device_type = "pci"; compatible = "qcom,pcie-x1e80100"; reg = <0 0x01bf8000 0 0x3000>, - <0 0x70000000 0 0xf1d>, - <0 0x70000f20 0 0xa8>, + <0 0x70000000 0 0xf20>, + <0 0x70000f40 0 0xa8>, <0 0x70001000 0 0x1000>, - <0 0x70100000 0 0x100000>; + <0 0x70100000 0 0x100000>, + <0 0x01bfb000 0 0x1000>; reg-names = "parf", "dbi", "elbi", "atu", - "config"; + "config", + "mhi"; #address-cells = <3>; #size-cells = <2>; ranges = <0x01000000 0 0x00000000 0 0x70200000 0 0x100000>, diff --git a/arch/arm64/boot/dts/rockchip/rk3308-rock-pi-s.dts b/arch/arm64/boot/dts/rockchip/rk3308-rock-pi-s.dts index b47fe02c33fb..079101cddd65 100644 --- a/arch/arm64/boot/dts/rockchip/rk3308-rock-pi-s.dts +++ b/arch/arm64/boot/dts/rockchip/rk3308-rock-pi-s.dts @@ -5,6 +5,8 @@ */ /dts-v1/; + +#include <dt-bindings/leds/common.h> #include "rk3308.dtsi" / { @@ -24,17 +26,21 @@ leds { compatible = "gpio-leds"; pinctrl-names = "default"; - pinctrl-0 = <&green_led_gio>, <&heartbeat_led_gpio>; + pinctrl-0 = <&green_led>, <&heartbeat_led>; green-led { + color = <LED_COLOR_ID_GREEN>; default-state = "on"; + function = LED_FUNCTION_POWER; gpios = <&gpio0 RK_PA6 GPIO_ACTIVE_HIGH>; label = "rockpis:green:power"; linux,default-trigger = "default-on"; }; blue-led { + color = <LED_COLOR_ID_BLUE>; default-state = "on"; + function = LED_FUNCTION_HEARTBEAT; gpios = <&gpio0 RK_PA5 GPIO_ACTIVE_HIGH>; label = "rockpis:blue:user"; linux,default-trigger = "heartbeat"; @@ -126,10 +132,12 @@ }; &emmc { - bus-width = <4>; cap-mmc-highspeed; - mmc-hs200-1_8v; + cap-sd-highspeed; + no-sdio; non-removable; + pinctrl-names = "default"; + pinctrl-0 = <&emmc_bus8 &emmc_clk &emmc_cmd>; vmmc-supply = <&vcc_io>; status = "okay"; }; @@ -214,11 +222,11 @@ pinctrl-0 = <&rtc_32k>; leds { - green_led_gio: green-led-gpio { + green_led: green-led { rockchip,pins = <0 RK_PA6 RK_FUNC_GPIO &pcfg_pull_none>; }; - heartbeat_led_gpio: heartbeat-led-gpio { + heartbeat_led: heartbeat-led { rockchip,pins = <0 RK_PA5 RK_FUNC_GPIO &pcfg_pull_none>; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3308.dtsi b/arch/arm64/boot/dts/rockchip/rk3308.dtsi index 962ea893999b..c00da150a22f 100644 --- a/arch/arm64/boot/dts/rockchip/rk3308.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3308.dtsi @@ -811,7 +811,7 @@ clocks = <&cru SCLK_I2S2_8CH_TX_OUT>, <&cru SCLK_I2S2_8CH_RX_OUT>, <&cru PCLK_ACODEC>; - reset-names = "codec-reset"; + reset-names = "codec"; resets = <&cru SRST_ACODEC_P>; #sound-dai-cells = <0>; status = "disabled"; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts index f09d60bbe6c4..a608a219543e 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-rock-pi-e.dts @@ -241,8 +241,8 @@ rk805: pmic@18 { compatible = "rockchip,rk805"; reg = <0x18>; - interrupt-parent = <&gpio2>; - interrupts = <6 IRQ_TYPE_LEVEL_LOW>; + 
interrupt-parent = <&gpio0>; + interrupts = <2 IRQ_TYPE_LEVEL_LOW>; #clock-cells = <1>; clock-output-names = "xin32k", "rk805-clkout2"; gpio-controller; diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi index 734f87db4d11..73618df7a889 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi @@ -793,6 +793,7 @@ dma-names = "tx"; pinctrl-names = "default"; pinctrl-0 = <&spdif_tx>; + #sound-dai-cells = <0>; status = "disabled"; }; @@ -804,6 +805,7 @@ clocks = <&cru SCLK_I2S_2CH>, <&cru HCLK_I2S_2CH>; dmas = <&dmac_bus 6>, <&dmac_bus 7>; dma-names = "tx", "rx"; + #sound-dai-cells = <0>; status = "disabled"; }; @@ -817,6 +819,7 @@ dma-names = "tx", "rx"; pinctrl-names = "default"; pinctrl-0 = <&i2s_8ch_bus>; + #sound-dai-cells = <0>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi index 789fd0dcc88b..3cd63d1e8f15 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi @@ -450,7 +450,7 @@ ap_i2c_audio: &i2c8 { dlg,btn-cfg = <50>; dlg,mic-det-thr = <500>; dlg,jack-ins-deb = <20>; - dlg,jack-det-rate = "32ms_64ms"; + dlg,jack-det-rate = "32_64"; dlg,jack-rem-deb = <1>; dlg,a-d-btn-thr = <0xa>; diff --git a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts index 26322a358d91..b908ce006c26 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-quartz64-b.dts @@ -289,7 +289,7 @@ regulator-name = "vdd_gpu"; regulator-always-on; regulator-boot-on; - regulator-min-microvolt = <900000>; + regulator-min-microvolt = <500000>; regulator-max-microvolt = <1350000>; regulator-ramp-delay = <6001>; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5-plus.dts b/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5-plus.dts index 1a604429fb26..e74871491ef5 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5-plus.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-orangepi-5-plus.dts @@ -444,6 +444,7 @@ &sdmmc { bus-width = <4>; cap-sd-highspeed; + cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>; disable-wp; max-frequency = <150000000>; no-sdio; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-quartzpro64.dts b/arch/arm64/boot/dts/rockchip/rk3588-quartzpro64.dts index b4f22d95ac0e..e80caa36f8e4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-quartzpro64.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-quartzpro64.dts @@ -435,6 +435,7 @@ &sdmmc { bus-width = <4>; cap-sd-highspeed; + cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>; disable-wp; max-frequency = <150000000>; no-sdio; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts index b8e15b76a8a6..2e7512676b7e 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts @@ -383,6 +383,7 @@ bus-width = <4>; cap-mmc-highspeed; cap-sd-highspeed; + cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>; disable-wp; sd-uhs-sdr104; vmmc-supply = <&vcc_3v3_s3>; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi index aebe1fedd2d8..615094bb8ba3 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi @@ -344,6 +344,11 @@ }; }; +&pwm0 { + pinctrl-0 = <&pwm0m1_pins>; + pinctrl-names = "default"; +}; + &saradc { vref-supply = <&vcc_1v8_s0>; status 
= "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts index 3b2ec1d0c542..074c316a9a69 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588s-coolpi-4b.dts @@ -288,9 +288,9 @@ pinctrl-0 = <&i2c7m0_xfer>; status = "okay"; - es8316: audio-codec@11 { + es8316: audio-codec@10 { compatible = "everest,es8316"; - reg = <0x11>; + reg = <0x10>; assigned-clocks = <&cru I2S0_8CH_MCLKOUT>; assigned-clock-rates = <12288000>; clocks = <&cru I2S0_8CH_MCLKOUT>; diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-rock-5a.dts b/arch/arm64/boot/dts/rockchip/rk3588s-rock-5a.dts index 8e2a07612d17..3b9a349362db 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s-rock-5a.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588s-rock-5a.dts @@ -366,6 +366,7 @@ bus-width = <4>; cap-mmc-highspeed; cap-sd-highspeed; + cd-gpios = <&gpio0 RK_PA4 GPIO_ACTIVE_LOW>; disable-wp; max-frequency = <150000000>; no-sdio; @@ -393,6 +394,7 @@ pinctrl-0 = <&pmic_pins>, <&rk806_dvs1_null>, <&rk806_dvs2_null>, <&rk806_dvs3_null>; spi-max-frequency = <1000000>; + system-power-controller; vcc1-supply = <&vcc5v0_sys>; vcc2-supply = <&vcc5v0_sys>; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 57a9abe78ee4..2c7bf4da0b80 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1036,6 +1036,7 @@ CONFIG_SND_AUDIO_GRAPH_CARD2=m CONFIG_HID_MULTITOUCH=m CONFIG_I2C_HID_ACPI=m CONFIG_I2C_HID_OF=m +CONFIG_I2C_HID_OF_ELAN=m CONFIG_USB=y CONFIG_USB_OTG=y CONFIG_USB_XHCI_HCD=y diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h index 980d1dd8e1a3..b8a5861dc7b7 100644 --- a/arch/arm64/include/asm/asm-extable.h +++ b/arch/arm64/include/asm/asm-extable.h @@ -112,6 +112,9 @@ #define _ASM_EXTABLE_KACCESS_ERR(insn, fixup, err) \ _ASM_EXTABLE_KACCESS_ERR_ZERO(insn, fixup, err, wzr) +#define _ASM_EXTABLE_KACCESS(insn, fixup) \ + _ASM_EXTABLE_KACCESS_ERR_ZERO(insn, fixup, wzr, wzr) + #define _ASM_EXTABLE_LOAD_UNALIGNED_ZEROPAD(insn, fixup, data, addr) \ __DEFINE_ASM_GPR_NUMS \ __ASM_EXTABLE_RAW(#insn, #fixup, \ diff --git a/arch/arm64/include/asm/runtime-const.h b/arch/arm64/include/asm/runtime-const.h new file mode 100644 index 000000000000..be5915669d23 --- /dev/null +++ b/arch/arm64/include/asm/runtime-const.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RUNTIME_CONST_H +#define _ASM_RUNTIME_CONST_H + +#include <asm/cacheflush.h> + +/* Sigh. 
You can still run arm64 in BE mode */ +#include <asm/byteorder.h> + +#define runtime_const_ptr(sym) ({ \ + typeof(sym) __ret; \ + asm_inline("1:\t" \ + "movz %0, #0xcdef\n\t" \ + "movk %0, #0x89ab, lsl #16\n\t" \ + "movk %0, #0x4567, lsl #32\n\t" \ + "movk %0, #0x0123, lsl #48\n\t" \ + ".pushsection runtime_ptr_" #sym ",\"a\"\n\t" \ + ".long 1b - .\n\t" \ + ".popsection" \ + :"=r" (__ret)); \ + __ret; }) + +#define runtime_const_shift_right_32(val, sym) ({ \ + unsigned long __ret; \ + asm_inline("1:\t" \ + "lsr %w0,%w1,#12\n\t" \ + ".pushsection runtime_shift_" #sym ",\"a\"\n\t" \ + ".long 1b - .\n\t" \ + ".popsection" \ + :"=r" (__ret) \ + :"r" (0u+(val))); \ + __ret; }) + +#define runtime_const_init(type, sym) do { \ + extern s32 __start_runtime_##type##_##sym[]; \ + extern s32 __stop_runtime_##type##_##sym[]; \ + runtime_const_fixup(__runtime_fixup_##type, \ + (unsigned long)(sym), \ + __start_runtime_##type##_##sym, \ + __stop_runtime_##type##_##sym); \ +} while (0) + +/* 16-bit immediate for wide move (movz and movk) in bits 5..20 */ +static inline void __runtime_fixup_16(__le32 *p, unsigned int val) +{ + u32 insn = le32_to_cpu(*p); + insn &= 0xffe0001f; + insn |= (val & 0xffff) << 5; + *p = cpu_to_le32(insn); +} + +static inline void __runtime_fixup_caches(void *where, unsigned int insns) +{ + unsigned long va = (unsigned long)where; + caches_clean_inval_pou(va, va + 4*insns); +} + +static inline void __runtime_fixup_ptr(void *where, unsigned long val) +{ + __le32 *p = lm_alias(where); + __runtime_fixup_16(p, val); + __runtime_fixup_16(p+1, val >> 16); + __runtime_fixup_16(p+2, val >> 32); + __runtime_fixup_16(p+3, val >> 48); + __runtime_fixup_caches(where, 4); +} + +/* Immediate value is 6 bits starting at bit #16 */ +static inline void __runtime_fixup_shift(void *where, unsigned long val) +{ + __le32 *p = lm_alias(where); + u32 insn = le32_to_cpu(*p); + insn &= 0xffc0ffff; + insn |= (val & 63) << 16; + *p = cpu_to_le32(insn); + __runtime_fixup_caches(where, 1); +} + +static inline void runtime_const_fixup(void (*fn)(void *, unsigned long), + unsigned long val, s32 *start, s32 *end) +{ + while (start < end) { + fn(*start + (void *)start, val); + start++; + } +} + +#endif diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 14be5000c5a0..28f665e0975a 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -184,29 +184,40 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) * The "__xxx_error" versions set the third argument to -EFAULT if an error * occurs, and leave it unchanged on success. 
*/ -#define __get_mem_asm(load, reg, x, addr, err, type) \ +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +#define __get_mem_asm(load, reg, x, addr, label, type) \ + asm_goto_output( \ + "1: " load " " reg "0, [%1]\n" \ + _ASM_EXTABLE_##type##ACCESS_ERR(1b, %l2, %w0) \ + : "=r" (x) \ + : "r" (addr) : : label) +#else +#define __get_mem_asm(load, reg, x, addr, label, type) do { \ + int __gma_err = 0; \ asm volatile( \ "1: " load " " reg "1, [%2]\n" \ "2:\n" \ _ASM_EXTABLE_##type##ACCESS_ERR_ZERO(1b, 2b, %w0, %w1) \ - : "+r" (err), "=r" (x) \ - : "r" (addr)) + : "+r" (__gma_err), "=r" (x) \ + : "r" (addr)); \ + if (__gma_err) goto label; } while (0) +#endif -#define __raw_get_mem(ldr, x, ptr, err, type) \ +#define __raw_get_mem(ldr, x, ptr, label, type) \ do { \ unsigned long __gu_val; \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_mem_asm(ldr "b", "%w", __gu_val, (ptr), (err), type); \ + __get_mem_asm(ldr "b", "%w", __gu_val, (ptr), label, type); \ break; \ case 2: \ - __get_mem_asm(ldr "h", "%w", __gu_val, (ptr), (err), type); \ + __get_mem_asm(ldr "h", "%w", __gu_val, (ptr), label, type); \ break; \ case 4: \ - __get_mem_asm(ldr, "%w", __gu_val, (ptr), (err), type); \ + __get_mem_asm(ldr, "%w", __gu_val, (ptr), label, type); \ break; \ case 8: \ - __get_mem_asm(ldr, "%x", __gu_val, (ptr), (err), type); \ + __get_mem_asm(ldr, "%x", __gu_val, (ptr), label, type); \ break; \ default: \ BUILD_BUG(); \ @@ -219,27 +230,34 @@ do { \ * uaccess_ttbr0_disable(). As `x` and `ptr` could contain blocking functions, * we must evaluate these outside of the critical section. */ -#define __raw_get_user(x, ptr, err) \ +#define __raw_get_user(x, ptr, label) \ do { \ __typeof__(*(ptr)) __user *__rgu_ptr = (ptr); \ __typeof__(x) __rgu_val; \ __chk_user_ptr(ptr); \ - \ - uaccess_ttbr0_enable(); \ - __raw_get_mem("ldtr", __rgu_val, __rgu_ptr, err, U); \ - uaccess_ttbr0_disable(); \ - \ - (x) = __rgu_val; \ + do { \ + __label__ __rgu_failed; \ + uaccess_ttbr0_enable(); \ + __raw_get_mem("ldtr", __rgu_val, __rgu_ptr, __rgu_failed, U); \ + uaccess_ttbr0_disable(); \ + (x) = __rgu_val; \ + break; \ + __rgu_failed: \ + uaccess_ttbr0_disable(); \ + goto label; \ + } while (0); \ } while (0) #define __get_user_error(x, ptr, err) \ do { \ + __label__ __gu_failed; \ __typeof__(*(ptr)) __user *__p = (ptr); \ might_fault(); \ if (access_ok(__p, sizeof(*__p))) { \ __p = uaccess_mask_ptr(__p); \ - __raw_get_user((x), __p, (err)); \ + __raw_get_user((x), __p, __gu_failed); \ } else { \ + __gu_failed: \ (x) = (__force __typeof__(x))0; (err) = -EFAULT; \ } \ } while (0) @@ -262,40 +280,42 @@ do { \ do { \ __typeof__(dst) __gkn_dst = (dst); \ __typeof__(src) __gkn_src = (src); \ - int __gkn_err = 0; \ - \ - __mte_enable_tco_async(); \ - __raw_get_mem("ldr", *((type *)(__gkn_dst)), \ - (__force type *)(__gkn_src), __gkn_err, K); \ - __mte_disable_tco_async(); \ + do { \ + __label__ __gkn_label; \ \ - if (unlikely(__gkn_err)) \ + __mte_enable_tco_async(); \ + __raw_get_mem("ldr", *((type *)(__gkn_dst)), \ + (__force type *)(__gkn_src), __gkn_label, K); \ + __mte_disable_tco_async(); \ + break; \ + __gkn_label: \ + __mte_disable_tco_async(); \ goto err_label; \ + } while (0); \ } while (0) -#define __put_mem_asm(store, reg, x, addr, err, type) \ - asm volatile( \ - "1: " store " " reg "1, [%2]\n" \ +#define __put_mem_asm(store, reg, x, addr, label, type) \ + asm goto( \ + "1: " store " " reg "0, [%1]\n" \ "2:\n" \ - _ASM_EXTABLE_##type##ACCESS_ERR(1b, 2b, %w0) \ - : "+r" (err) \ - : "rZ" (x), "r" (addr)) + _ASM_EXTABLE_##type##ACCESS(1b, 
%l2) \ + : : "rZ" (x), "r" (addr) : : label) -#define __raw_put_mem(str, x, ptr, err, type) \ +#define __raw_put_mem(str, x, ptr, label, type) \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ switch (sizeof(*(ptr))) { \ case 1: \ - __put_mem_asm(str "b", "%w", __pu_val, (ptr), (err), type); \ + __put_mem_asm(str "b", "%w", __pu_val, (ptr), label, type); \ break; \ case 2: \ - __put_mem_asm(str "h", "%w", __pu_val, (ptr), (err), type); \ + __put_mem_asm(str "h", "%w", __pu_val, (ptr), label, type); \ break; \ case 4: \ - __put_mem_asm(str, "%w", __pu_val, (ptr), (err), type); \ + __put_mem_asm(str, "%w", __pu_val, (ptr), label, type); \ break; \ case 8: \ - __put_mem_asm(str, "%x", __pu_val, (ptr), (err), type); \ + __put_mem_asm(str, "%x", __pu_val, (ptr), label, type); \ break; \ default: \ BUILD_BUG(); \ @@ -307,25 +327,34 @@ do { \ * uaccess_ttbr0_disable(). As `x` and `ptr` could contain blocking functions, * we must evaluate these outside of the critical section. */ -#define __raw_put_user(x, ptr, err) \ +#define __raw_put_user(x, ptr, label) \ do { \ + __label__ __rpu_failed; \ __typeof__(*(ptr)) __user *__rpu_ptr = (ptr); \ __typeof__(*(ptr)) __rpu_val = (x); \ __chk_user_ptr(__rpu_ptr); \ \ - uaccess_ttbr0_enable(); \ - __raw_put_mem("sttr", __rpu_val, __rpu_ptr, err, U); \ - uaccess_ttbr0_disable(); \ + do { \ + uaccess_ttbr0_enable(); \ + __raw_put_mem("sttr", __rpu_val, __rpu_ptr, __rpu_failed, U); \ + uaccess_ttbr0_disable(); \ + break; \ + __rpu_failed: \ + uaccess_ttbr0_disable(); \ + goto label; \ + } while (0); \ } while (0) #define __put_user_error(x, ptr, err) \ do { \ + __label__ __pu_failed; \ __typeof__(*(ptr)) __user *__p = (ptr); \ might_fault(); \ if (access_ok(__p, sizeof(*__p))) { \ __p = uaccess_mask_ptr(__p); \ - __raw_put_user((x), __p, (err)); \ + __raw_put_user((x), __p, __pu_failed); \ } else { \ + __pu_failed: \ (err) = -EFAULT; \ } \ } while (0) @@ -348,15 +377,18 @@ do { \ do { \ __typeof__(dst) __pkn_dst = (dst); \ __typeof__(src) __pkn_src = (src); \ - int __pkn_err = 0; \ \ - __mte_enable_tco_async(); \ - __raw_put_mem("str", *((type *)(__pkn_src)), \ - (__force type *)(__pkn_dst), __pkn_err, K); \ - __mte_disable_tco_async(); \ - \ - if (unlikely(__pkn_err)) \ + do { \ + __label__ __pkn_err; \ + __mte_enable_tco_async(); \ + __raw_put_mem("str", *((type *)(__pkn_src)), \ + (__force type *)(__pkn_dst), __pkn_err, K); \ + __mte_disable_tco_async(); \ + break; \ + __pkn_err: \ + __mte_disable_tco_async(); \ goto err_label; \ + } while (0); \ } while(0) extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); @@ -381,6 +413,51 @@ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi __actu_ret; \ }) +static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) +{ + if (unlikely(!access_ok(ptr,len))) + return 0; + uaccess_ttbr0_enable(); + return 1; +} +#define user_access_begin(a,b) user_access_begin(a,b) +#define user_access_end() uaccess_ttbr0_disable() +#define unsafe_put_user(x, ptr, label) \ + __raw_put_mem("sttr", x, uaccess_mask_ptr(ptr), label, U) +#define unsafe_get_user(x, ptr, label) \ + __raw_get_mem("ldtr", x, uaccess_mask_ptr(ptr), label, U) + +/* + * KCSAN uses these to save and restore ttbr state. + * We do not support KCSAN with ARM64_SW_TTBR0_PAN, so + * they are no-ops. 
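The user_access_begin()/unsafe_get_user() helpers added above are meant to be consumed with one fault label per user-access window. A minimal sketch of that calling pattern, assuming ordinary kernel context and the generic <linux/uaccess.h> contract (the function name below is invented for illustration):

#include <linux/uaccess.h>

/* Open a user-access window, perform the unsafe accesses, and branch
 * to a single fault label on any failure.                            */
static long sum_two_user_longs(const unsigned long __user *p,
			       unsigned long *sum)
{
	unsigned long a, b;

	if (!user_access_begin(p, 2 * sizeof(*p)))
		return -EFAULT;
	unsafe_get_user(a, &p[0], efault);
	unsafe_get_user(b, &p[1], efault);
	user_access_end();

	*sum = a + b;
	return 0;

efault:
	user_access_end();
	return -EFAULT;
}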
+ */ +static inline unsigned long user_access_save(void) { return 0; } +static inline void user_access_restore(unsigned long enabled) { } + +/* + * We want the unsafe accessors to always be inlined and use + * the error labels - thus the macro games. + */ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(*(type *)(src),(type __user *)(dst),label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +#define unsafe_copy_to_user(_dst,_src,_len,label) \ +do { \ + char __user *__ucu_dst = (_dst); \ + const char *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ +} while (0) + #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h index 14251abee23c..824ca6987a51 100644 --- a/arch/arm64/include/asm/word-at-a-time.h +++ b/arch/arm64/include/asm/word-at-a-time.h @@ -27,20 +27,15 @@ static inline unsigned long has_zero(unsigned long a, unsigned long *bits, } #define prep_zero_mask(a, bits, c) (bits) +#define create_zero_mask(bits) (bits) +#define find_zero(bits) (__ffs(bits) >> 3) -static inline unsigned long create_zero_mask(unsigned long bits) +static inline unsigned long zero_bytemask(unsigned long bits) { bits = (bits - 1) & ~bits; return bits >> 7; } -static inline unsigned long find_zero(unsigned long mask) -{ - return fls64(mask) >> 3; -} - -#define zero_bytemask(mask) (mask) - #else /* __AARCH64EB__ */ #include <asm-generic/word-at-a-time.h> #endif diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index dcdcccd40891..6174671be7c1 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -582,12 +582,9 @@ subsys_initcall(register_mte_tcf_preferred_sysctl); size_t mte_probe_user_range(const char __user *uaddr, size_t size) { const char __user *end = uaddr + size; - int err = 0; char val; - __raw_get_user(val, uaddr, err); - if (err) - return size; + __raw_get_user(val, uaddr, efault); uaddr = PTR_ALIGN(uaddr, MTE_GRANULE_SIZE); while (uaddr < end) { @@ -595,12 +592,13 @@ size_t mte_probe_user_range(const char __user *uaddr, size_t size) * A read is sufficient for mte, the caller should have probed * for the pte write permission if required. 
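The little-endian word-at-a-time helpers reworked above locate the first zero byte with __ffs() and derive a byte mask from the same bit pattern. The standalone sketch below reproduces that arithmetic in userspace on a little-endian machine; the helper names mirror the kernel ones but the program itself is only illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ONES  0x0101010101010101ull
#define HIGHS 0x8080808080808080ull

static uint64_t has_zero(uint64_t a)
{
	return (a - ONES) & ~a & HIGHS;		/* 0x80 set in each zero byte */
}

static unsigned int find_zero(uint64_t bits)
{
	return __builtin_ctzll(bits) >> 3;	/* __ffs(bits) >> 3 */
}

static uint64_t zero_bytemask(uint64_t bits)
{
	bits = (bits - 1) & ~bits;	/* ones below the first 0x80 bit */
	return bits >> 7;		/* ...spread across whole bytes  */
}

int main(void)
{
	const char s[8] = "abc";	/* zero byte at offset 3 */
	uint64_t word, bits;

	memcpy(&word, s, sizeof(word));
	bits = has_zero(word);

	printf("first zero byte at %u, bytemask 0x%llx\n",
	       find_zero(bits), (unsigned long long)zero_bytemask(bits));
	return 0;
}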
*/ - __raw_get_user(val, uaddr, err); - if (err) - return end - uaddr; + __raw_get_user(val, uaddr, efault); uaddr += MTE_GRANULE_SIZE; } (void)val; return 0; + +efault: + return end - uaddr; } diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 755a22d4f840..55a8e310ea12 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -264,6 +264,9 @@ SECTIONS EXIT_DATA } + RUNTIME_CONST(shift, d_hash_shift) + RUNTIME_CONST(ptr, dentry_hashtable) + PERCPU_SECTION(L1_CACHE_BYTES) HYPERVISOR_PERCPU_SECTION diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 642fb80c5c4e..874fe9588773 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -71,7 +71,7 @@ static void nfhd_submit_bio(struct bio *bio) len = bvec.bv_len; len >>= 9; nfhd_read_write(dev->id, 0, dir, sec >> shift, len >> shift, - page_to_phys(bvec.bv_page) + bvec.bv_offset); + bvec_phys(&bvec)); sec += len; } bio_endio(bio); @@ -98,6 +98,7 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) { struct queue_limits lim = { .logical_block_size = bsize, + .features = BLK_FEAT_ROTATIONAL, }; struct nfhd_device *dev; int dev_id = id - NFHD_DEV_OFFSET; diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index d1030bc52564..d283d281d28e 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -849,6 +849,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) { struct eeh_dev *edev; struct pci_dev *pdev; + struct pci_bus *bus = NULL; if (pe->type & EEH_PE_PHB) return pe->phb->bus; @@ -859,9 +860,11 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) /* Retrieve the parent PCI bus of first (top) PCI device */ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); + pci_lock_rescan_remove(); pdev = eeh_dev_to_pci_dev(edev); if (pdev) - return pdev->bus; + bus = pdev->bus; + pci_unlock_rescan_remove(); - return NULL; + return bus; } diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 4690c219bfa4..63432a33ec49 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -647,8 +647,9 @@ __after_prom_start: * Note: This process overwrites the OF exception vectors. */ LOAD_REG_IMMEDIATE(r3, PAGE_OFFSET) - mr. r4,r26 /* In some cases the loader may */ - beq 9f /* have already put us at zero */ + mr r4,r26 /* Load the virtual source address into r4 */ + cmpld r3,r4 /* Check if source == dest */ + beq 9f /* If so skip the copy */ li r6,0x100 /* Start offset, the first 0x100 */ /* bytes were copied earlier. */ diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 85050be08a23..72b12bc10f90 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -27,6 +27,7 @@ #include <asm/paca.h> #include <asm/mmu.h> #include <asm/sections.h> /* _end */ +#include <asm/setup.h> #include <asm/smp.h> #include <asm/hw_breakpoint.h> #include <asm/svm.h> @@ -317,6 +318,16 @@ void default_machine_kexec(struct kimage *image) if (!kdump_in_progress()) kexec_prepare_cpus(); +#ifdef CONFIG_PPC_PSERIES + /* + * This must be done after other CPUs have shut down, otherwise they + * could execute the 'scv' instruction, which is not supported with + * reloc disabled (see configure_exceptions()). + */ + if (firmware_has_feature(FW_FEATURE_SET_MODE)) + pseries_disable_reloc_on_exc(); +#endif + printk("kexec: Starting switchover sequence.\n"); /* switch to a staticly allocated stack. Based on irq stack code. 
diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c index 096d09ed89f6..431be156ca9b 100644 --- a/arch/powerpc/platforms/pseries/kexec.c +++ b/arch/powerpc/platforms/pseries/kexec.c @@ -61,11 +61,3 @@ void pseries_kexec_cpu_down(int crash_shutdown, int secondary) } else xics_kexec_teardown_cpu(secondary); } - -void pseries_machine_kexec(struct kimage *image) -{ - if (firmware_has_feature(FW_FEATURE_SET_MODE)) - pseries_disable_reloc_on_exc(); - - default_machine_kexec(image); -} diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index bba4ad192b0f..3968a6970fa8 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -38,7 +38,6 @@ static inline void smp_init_pseries(void) { } #endif extern void pseries_kexec_cpu_down(int crash_shutdown, int secondary); -void pseries_machine_kexec(struct kimage *image); extern void pSeries_final_fixup(void); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 284a6fa04b0c..b10a25325238 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -343,8 +343,8 @@ static int alloc_dispatch_log_kmem_cache(void) { void (*ctor)(void *) = get_dtl_cache_ctor(); - dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, - DISPATCH_LOG_BYTES, 0, ctor); + dtl_cache = kmem_cache_create_usercopy("dtl", DISPATCH_LOG_BYTES, + DISPATCH_LOG_BYTES, 0, 0, DISPATCH_LOG_BYTES, ctor); if (!dtl_cache) { pr_warn("Failed to create dispatch trace log buffer cache\n"); pr_warn("Stolen time statistics will be unreliable\n"); @@ -1159,7 +1159,6 @@ define_machine(pseries) { .machine_check_exception = pSeries_machine_check_exception, .machine_check_log_err = pSeries_machine_check_log_err, #ifdef CONFIG_KEXEC_CORE - .machine_kexec = pseries_machine_kexec, .kexec_cpu_down = pseries_kexec_cpu_down, #endif #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/arch/riscv/boot/dts/canaan/canaan_kd233.dts b/arch/riscv/boot/dts/canaan/canaan_kd233.dts index 8df4cf3656f2..a7d753b6fdfd 100644 --- a/arch/riscv/boot/dts/canaan/canaan_kd233.dts +++ b/arch/riscv/boot/dts/canaan/canaan_kd233.dts @@ -15,6 +15,10 @@ model = "Kendryte KD233"; compatible = "canaan,kendryte-kd233", "canaan,kendryte-k210"; + aliases { + serial0 = &uarths0; + }; + chosen { bootargs = "earlycon console=ttySIF0"; stdout-path = "serial0:115200n8"; @@ -46,7 +50,6 @@ &fpioa { pinctrl-0 = <&jtag_pinctrl>; pinctrl-names = "default"; - status = "okay"; jtag_pinctrl: jtag-pinmux { pinmux = <K210_FPIOA(0, K210_PCF_JTAG_TCLK)>, @@ -118,6 +121,7 @@ #sound-dai-cells = <1>; pinctrl-0 = <&i2s0_pinctrl>; pinctrl-names = "default"; + status = "okay"; }; &spi0 { @@ -125,6 +129,7 @@ pinctrl-names = "default"; num-cs = <1>; cs-gpios = <&gpio0 20 GPIO_ACTIVE_HIGH>; + status = "okay"; panel@0 { compatible = "canaan,kd233-tft", "ilitek,ili9341"; diff --git a/arch/riscv/boot/dts/canaan/k210.dtsi b/arch/riscv/boot/dts/canaan/k210.dtsi index f87c5164d9cf..4f5d40fa1e77 100644 --- a/arch/riscv/boot/dts/canaan/k210.dtsi +++ b/arch/riscv/boot/dts/canaan/k210.dtsi @@ -16,13 +16,6 @@ #size-cells = <1>; compatible = "canaan,kendryte-k210"; - aliases { - serial0 = &uarths0; - serial1 = &uart1; - serial2 = &uart2; - serial3 = &uart3; - }; - /* * The K210 has an sv39 MMU following the privileged specification v1.9. 
* Since this is a non-ratified draft specification, the kernel does not @@ -137,6 +130,7 @@ reg = <0x38000000 0x1000>; interrupts = <33>; clocks = <&sysclk K210_CLK_CPU>; + status = "disabled"; }; gpio0: gpio-controller@38001000 { @@ -152,6 +146,7 @@ <62>, <63>, <64>, <65>; gpio-controller; ngpios = <32>; + status = "disabled"; }; dmac0: dma-controller@50000000 { @@ -187,6 +182,7 @@ <&sysclk K210_CLK_GPIO>; clock-names = "bus", "db"; resets = <&sysrst K210_RST_GPIO>; + status = "disabled"; gpio1_0: gpio-port@0 { #gpio-cells = <2>; @@ -214,6 +210,7 @@ dsr-override; cts-override; ri-override; + status = "disabled"; }; uart2: serial@50220000 { @@ -230,6 +227,7 @@ dsr-override; cts-override; ri-override; + status = "disabled"; }; uart3: serial@50230000 { @@ -246,6 +244,7 @@ dsr-override; cts-override; ri-override; + status = "disabled"; }; spi2: spi@50240000 { @@ -259,6 +258,7 @@ <&sysclk K210_CLK_APB0>; clock-names = "ssi_clk", "pclk"; resets = <&sysrst K210_RST_SPI2>; + status = "disabled"; }; i2s0: i2s@50250000 { @@ -268,6 +268,7 @@ clocks = <&sysclk K210_CLK_I2S0>; clock-names = "i2sclk"; resets = <&sysrst K210_RST_I2S0>; + status = "disabled"; }; i2s1: i2s@50260000 { @@ -277,6 +278,7 @@ clocks = <&sysclk K210_CLK_I2S1>; clock-names = "i2sclk"; resets = <&sysrst K210_RST_I2S1>; + status = "disabled"; }; i2s2: i2s@50270000 { @@ -286,6 +288,7 @@ clocks = <&sysclk K210_CLK_I2S2>; clock-names = "i2sclk"; resets = <&sysrst K210_RST_I2S2>; + status = "disabled"; }; i2c0: i2c@50280000 { @@ -296,6 +299,7 @@ <&sysclk K210_CLK_APB0>; clock-names = "ref", "pclk"; resets = <&sysrst K210_RST_I2C0>; + status = "disabled"; }; i2c1: i2c@50290000 { @@ -306,6 +310,7 @@ <&sysclk K210_CLK_APB0>; clock-names = "ref", "pclk"; resets = <&sysrst K210_RST_I2C1>; + status = "disabled"; }; i2c2: i2c@502a0000 { @@ -316,6 +321,7 @@ <&sysclk K210_CLK_APB0>; clock-names = "ref", "pclk"; resets = <&sysrst K210_RST_I2C2>; + status = "disabled"; }; fpioa: pinmux@502b0000 { @@ -464,6 +470,7 @@ reset-names = "spi"; num-cs = <4>; reg-io-width = <4>; + status = "disabled"; }; spi1: spi@53000000 { @@ -479,6 +486,7 @@ reset-names = "spi"; num-cs = <4>; reg-io-width = <4>; + status = "disabled"; }; spi3: spi@54000000 { @@ -495,6 +503,7 @@ num-cs = <4>; reg-io-width = <4>; + status = "disabled"; }; }; }; diff --git a/arch/riscv/boot/dts/canaan/k210_generic.dts b/arch/riscv/boot/dts/canaan/k210_generic.dts index 396c8ca4d24d..5734cc03753b 100644 --- a/arch/riscv/boot/dts/canaan/k210_generic.dts +++ b/arch/riscv/boot/dts/canaan/k210_generic.dts @@ -15,6 +15,10 @@ model = "Kendryte K210 generic"; compatible = "canaan,kendryte-k210"; + aliases { + serial0 = &uarths0; + }; + chosen { bootargs = "earlycon console=ttySIF0"; stdout-path = "serial0:115200n8"; @@ -24,7 +28,6 @@ &fpioa { pinctrl-0 = <&jtag_pins>; pinctrl-names = "default"; - status = "okay"; jtag_pins: jtag-pinmux { pinmux = <K210_FPIOA(0, K210_PCF_JTAG_TCLK)>, diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts index 6d25bf07481a..2ab376d609d2 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_bit.dts @@ -17,6 +17,10 @@ compatible = "sipeed,maix-bit", "sipeed,maix-bitm", "canaan,kendryte-k210"; + aliases { + serial0 = &uarths0; + }; + chosen { bootargs = "earlycon console=ttySIF0"; stdout-path = "serial0:115200n8"; @@ -58,7 +62,6 @@ &fpioa { pinctrl-names = "default"; pinctrl-0 = <&jtag_pinctrl>; - status = "okay"; jtag_pinctrl: jtag-pinmux { pinmux = 
<K210_FPIOA(0, K210_PCF_JTAG_TCLK)>, @@ -156,6 +159,7 @@ #sound-dai-cells = <1>; pinctrl-0 = <&i2s0_pinctrl>; pinctrl-names = "default"; + status = "okay"; }; &i2c1 { @@ -170,6 +174,7 @@ pinctrl-names = "default"; num-cs = <1>; cs-gpios = <&gpio0 20 GPIO_ACTIVE_HIGH>; + status = "okay"; panel@0 { compatible = "sitronix,st7789v"; @@ -199,6 +204,8 @@ }; &spi3 { + status = "okay"; + flash@0 { compatible = "jedec,spi-nor"; reg = <0>; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts index f4f4d8d5e8b8..d98e20775c07 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_dock.dts @@ -17,6 +17,10 @@ compatible = "sipeed,maix-dock-m1", "sipeed,maix-dock-m1w", "canaan,kendryte-k210"; + aliases { + serial0 = &uarths0; + }; + chosen { bootargs = "earlycon console=ttySIF0"; stdout-path = "serial0:115200n8"; @@ -63,7 +67,6 @@ &fpioa { pinctrl-0 = <&jtag_pinctrl>; pinctrl-names = "default"; - status = "okay"; jtag_pinctrl: jtag-pinmux { pinmux = <K210_FPIOA(0, K210_PCF_JTAG_TCLK)>, @@ -159,6 +162,7 @@ #sound-dai-cells = <1>; pinctrl-0 = <&i2s0_pinctrl>; pinctrl-names = "default"; + status = "okay"; }; &i2c1 { @@ -173,6 +177,7 @@ pinctrl-names = "default"; num-cs = <1>; cs-gpios = <&gpio0 20 GPIO_ACTIVE_HIGH>; + status = "okay"; panel@0 { compatible = "sitronix,st7789v"; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts b/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts index 0d86df47e1ed..79ecd549700a 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maix_go.dts @@ -16,6 +16,10 @@ model = "SiPeed MAIX GO"; compatible = "sipeed,maix-go", "canaan,kendryte-k210"; + aliases { + serial0 = &uarths0; + }; + chosen { bootargs = "earlycon console=ttySIF0"; stdout-path = "serial0:115200n8"; @@ -69,7 +73,6 @@ &fpioa { pinctrl-0 = <&jtag_pinctrl>; pinctrl-names = "default"; - status = "okay"; jtag_pinctrl: jtag-pinmux { pinmux = <K210_FPIOA(0, K210_PCF_JTAG_TCLK)>, @@ -167,6 +170,7 @@ #sound-dai-cells = <1>; pinctrl-0 = <&i2s0_pinctrl>; pinctrl-names = "default"; + status = "okay"; }; &i2c1 { @@ -181,6 +185,7 @@ pinctrl-names = "default"; num-cs = <1>; cs-gpios = <&gpio0 20 GPIO_ACTIVE_HIGH>; + status = "okay"; panel@0 { compatible = "sitronix,st7789v"; @@ -209,6 +214,8 @@ }; &spi3 { + status = "okay"; + flash@0 { compatible = "jedec,spi-nor"; reg = <0>; diff --git a/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts b/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts index 5c05c498e2b8..019c03ae51f6 100644 --- a/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts +++ b/arch/riscv/boot/dts/canaan/sipeed_maixduino.dts @@ -15,6 +15,10 @@ model = "SiPeed MAIXDUINO"; compatible = "sipeed,maixduino", "canaan,kendryte-k210"; + aliases { + serial0 = &uarths0; + }; + chosen { bootargs = "earlycon console=ttySIF0"; stdout-path = "serial0:115200n8"; @@ -39,8 +43,6 @@ }; &fpioa { - status = "okay"; - uarths_pinctrl: uarths-pinmux { pinmux = <K210_FPIOA(4, K210_PCF_UARTHS_RX)>, /* Header "0" */ <K210_FPIOA(5, K210_PCF_UARTHS_TX)>; /* Header "1" */ @@ -132,6 +134,7 @@ #sound-dai-cells = <1>; pinctrl-0 = <&i2s0_pinctrl>; pinctrl-names = "default"; + status = "okay"; }; &i2c1 { @@ -146,6 +149,7 @@ pinctrl-names = "default"; num-cs = <1>; cs-gpios = <&gpio0 20 GPIO_ACTIVE_HIGH>; + status = "okay"; panel@0 { compatible = "sitronix,st7789v"; @@ -174,6 +178,8 @@ }; &spi3 { + status = "okay"; + flash@0 { compatible = "jedec,spi-nor"; reg = <0>; diff --git 
a/arch/riscv/boot/dts/starfive/jh7110-common.dtsi b/arch/riscv/boot/dts/starfive/jh7110-common.dtsi index 8ff6ea64f048..68d16717db8c 100644 --- a/arch/riscv/boot/dts/starfive/jh7110-common.dtsi +++ b/arch/riscv/boot/dts/starfive/jh7110-common.dtsi @@ -244,7 +244,7 @@ regulator-boot-on; regulator-always-on; regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; regulator-name = "emmc_vdd"; }; }; diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c index ed9cad20c039..3c830a6f7ef4 100644 --- a/arch/riscv/kernel/machine_kexec.c +++ b/arch/riscv/kernel/machine_kexec.c @@ -121,20 +121,12 @@ static void machine_kexec_mask_interrupts(void) for_each_irq_desc(i, desc) { struct irq_chip *chip; - int ret; chip = irq_desc_get_chip(desc); if (!chip) continue; - /* - * First try to remove the active state. If this - * fails, try to EOI the interrupt. - */ - ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false); - - if (ret && irqd_irq_inprogress(&desc->irq_data) && - chip->irq_eoi) + if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) chip->irq_eoi(&desc->irq_data); if (chip->irq_mask) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 0d3f00eb0bae..10e311b2759d 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -32,6 +32,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, bool (*fn)(void *, unsigned long), void *arg) { unsigned long fp, sp, pc; + int graph_idx = 0; int level = 0; if (regs) { @@ -68,7 +69,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, pc = regs->ra; } else { fp = frame->fp; - pc = ftrace_graph_ret_addr(current, NULL, frame->ra, + pc = ftrace_graph_ret_addr(current, &graph_idx, frame->ra, &frame->ra); if (pc == (unsigned long)ret_from_exception) { if (unlikely(!__kernel_text_address(pc) || !fn(arg, pc))) diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index 04db1f993c47..bcf41d6e0df0 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -327,7 +327,7 @@ static long kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_att event = perf_event_create_kernel_counter(attr, -1, current, kvm_riscv_pmu_overflow, pmc); if (IS_ERR(event)) { - pr_err("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event)); + pr_debug("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event)); return PTR_ERR(event); } diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 8c4adece8911..f3602414a961 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -601,17 +601,16 @@ CONFIG_WATCHDOG=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m CONFIG_DIAG288_WATCHDOG=m +CONFIG_DRM=m +CONFIG_DRM_VIRTIO_GPU=m CONFIG_FB=y # CONFIG_FB_DEVICE is not set -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y # CONFIG_HID_SUPPORT is not set # CONFIG_USB_SUPPORT is not set CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m CONFIG_MLX4_INFINIBAND=m CONFIG_MLX5_INFINIBAND=m -CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_MLX5_VFIO_PCI=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 6dd11d3b6aaa..d0d8925fdf09 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -592,17 +592,16 @@ CONFIG_WATCHDOG_CORE=y CONFIG_WATCHDOG_NOWAYOUT=y CONFIG_SOFT_WATCHDOG=m 
CONFIG_DIAG288_WATCHDOG=m +CONFIG_DRM=m +CONFIG_DRM_VIRTIO_GPU=m CONFIG_FB=y # CONFIG_FB_DEVICE is not set -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y # CONFIG_HID_SUPPORT is not set # CONFIG_USB_SUPPORT is not set CONFIG_INFINIBAND=m CONFIG_INFINIBAND_USER_ACCESS=m CONFIG_MLX4_INFINIBAND=m CONFIG_MLX5_INFINIBAND=m -CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_MLX5_VFIO_PCI=m diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 95990461888f..9281063636a7 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -427,6 +427,7 @@ struct kvm_vcpu_stat { u64 instruction_io_other; u64 instruction_lpsw; u64 instruction_lpswe; + u64 instruction_lpswey; u64 instruction_pfmf; u64 instruction_ptff; u64 instruction_sck; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 82e9631cd9ef..54b5b2565df8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -132,6 +132,7 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, instruction_io_other), STATS_DESC_COUNTER(VCPU, instruction_lpsw), STATS_DESC_COUNTER(VCPU, instruction_lpswe), + STATS_DESC_COUNTER(VCPU, instruction_lpswey), STATS_DESC_COUNTER(VCPU, instruction_pfmf), STATS_DESC_COUNTER(VCPU, instruction_ptff), STATS_DESC_COUNTER(VCPU, instruction_sck), diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 111eb5c74784..bf8534218af3 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -138,6 +138,21 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar) return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; } +static inline u64 kvm_s390_get_base_disp_siy(struct kvm_vcpu *vcpu, u8 *ar) +{ + u32 base1 = vcpu->arch.sie_block->ipb >> 28; + s64 disp1; + + /* The displacement is a 20bit _SIGNED_ value */ + disp1 = sign_extend64(((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + + ((vcpu->arch.sie_block->ipb & 0xff00) << 4), 19); + + if (ar) + *ar = base1; + + return (base1 ? 
vcpu->run->s.regs.gprs[base1] : 0) + disp1; +} + static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, u64 *address1, u64 *address2, u8 *ar_b1, u8 *ar_b2) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 1be19cc9d73c..1a49b89706f8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -797,6 +797,36 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) return 0; } +static int handle_lpswey(struct kvm_vcpu *vcpu) +{ + psw_t new_psw; + u64 addr; + int rc; + u8 ar; + + vcpu->stat.instruction_lpswey++; + + if (!test_kvm_facility(vcpu->kvm, 193)) + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + addr = kvm_s390_get_base_disp_siy(vcpu, &ar); + if (addr & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + rc = read_guest(vcpu, addr, ar, &new_psw, sizeof(new_psw)); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); + + vcpu->arch.sie_block->gpsw = new_psw; + if (!is_valid_psw(&vcpu->arch.sie_block->gpsw)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + return 0; +} + static int handle_stidp(struct kvm_vcpu *vcpu) { u64 stidp_data = vcpu->kvm->arch.model.cpuid; @@ -1462,6 +1492,8 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) case 0x61: case 0x62: return handle_ri(vcpu); + case 0x71: + return handle_lpswey(vcpu); default: return -EOPNOTSUPP; } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index abb629d7e131..7e3e767ab87d 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -55,6 +55,8 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) void crst_table_free(struct mm_struct *mm, unsigned long *table) { + if (!table) + return; pagetable_free(virt_to_ptdesc(table)); } @@ -262,6 +264,8 @@ static unsigned long *base_crst_alloc(unsigned long val) static void base_crst_free(unsigned long *table) { + if (!table) + return; pagetable_free(virt_to_ptdesc(table)); } diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index ef805eaa9e01..9f1e76ddda5a 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -447,43 +447,31 @@ static int bulk_req_safe_read( return n; } -/* Called without dev->lock held, and only in interrupt context. 
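kvm_s390_get_base_disp_siy() above assembles a 20-bit signed displacement from the DL and DH fields of the instruction parameter block and sign-extends it. A small userspace sketch of that decode, with a local reimplementation of sign_extend64() and a made-up ipb value:

#include <stdint.h>
#include <stdio.h>

static int64_t sign_extend64(uint64_t value, int index)
{
	int shift = 63 - index;

	return (int64_t)(value << shift) >> shift;
}

/* Rebuild the 20-bit SIY displacement: DL in bits 16..27, DH in bits 8..15 */
static int64_t siy_disp(uint32_t ipb)
{
	uint64_t dl = (ipb & 0x0fff0000u) >> 16;	/* low 12 bits  */
	uint64_t dh = (ipb & 0x0000ff00u) << 4;		/* high 8 bits  */

	return sign_extend64(dl + dh, 19);
}

int main(void)
{
	/* hypothetical ipb with DH=0xff, DL=0xfff: displacement is -1 */
	printf("%lld\n", (long long)siy_disp(0x0fff0000u | 0xff00u));
	return 0;
}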
*/ -static void ubd_handler(void) +static void ubd_end_request(struct io_thread_req *io_req) { - int n; - int count; - - while(1){ - n = bulk_req_safe_read( - thread_fd, - irq_req_buffer, - &irq_remainder, - &irq_remainder_size, - UBD_REQ_BUFFER_SIZE - ); - if (n < 0) { - if(n == -EAGAIN) - break; - printk(KERN_ERR "spurious interrupt in ubd_handler, " - "err = %d\n", -n); - return; - } - for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { - struct io_thread_req *io_req = (*irq_req_buffer)[count]; - - if ((io_req->error == BLK_STS_NOTSUPP) && (req_op(io_req->req) == REQ_OP_DISCARD)) { - blk_queue_max_discard_sectors(io_req->req->q, 0); - blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); - } - blk_mq_end_request(io_req->req, io_req->error); - kfree(io_req); - } + if (io_req->error == BLK_STS_NOTSUPP) { + if (req_op(io_req->req) == REQ_OP_DISCARD) + blk_queue_disable_discard(io_req->req->q); + else if (req_op(io_req->req) == REQ_OP_WRITE_ZEROES) + blk_queue_disable_write_zeroes(io_req->req->q); } + blk_mq_end_request(io_req->req, io_req->error); + kfree(io_req); } static irqreturn_t ubd_intr(int irq, void *dev) { - ubd_handler(); + int len, i; + + while ((len = bulk_req_safe_read(thread_fd, irq_req_buffer, + &irq_remainder, &irq_remainder_size, + UBD_REQ_BUFFER_SIZE)) >= 0) { + for (i = 0; i < len / sizeof(struct io_thread_req *); i++) + ubd_end_request((*irq_req_buffer)[i]); + } + + if (len < 0 && len != -EAGAIN) + pr_err("spurious interrupt in %s, err = %d\n", __func__, len); return IRQ_HANDLED; } @@ -847,6 +835,7 @@ static int ubd_add(int n, char **error_out) struct queue_limits lim = { .max_segments = MAX_SG, .seg_boundary_mask = PAGE_SIZE - 1, + .features = BLK_FEAT_WRITE_CACHE, }; struct gendisk *disk; int err = 0; @@ -893,8 +882,6 @@ static int ubd_add(int n, char **error_out) goto out_cleanup_tags; } - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_write_cache(disk->queue, true, false); disk->major = UBD_MAJOR; disk->first_minor = n << UBD_SHIFT; disk->minors = 1 << UBD_SHIFT; diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 11c9b8efdc4c..ed0a5f2dc129 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -89,10 +89,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) cld - IBRS_ENTER - UNTRAIN_RET - CLEAR_BRANCH_HISTORY - /* * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. To save a few cycles, we can check whether @@ -116,6 +112,16 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: + /* + * CPU bugs mitigations mechanisms can call other functions. They + * should be invoked after making sure TF is cleared because + * single-step is ignored only for instructions inside the + * entry_SYSENTER_compat function. 
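The reworked ubd_intr() above drains batches of completed-request pointers until the read would block and warns only on unexpected errors. A hedged userspace approximation of that drain loop, using an O_NONBLOCK pipe as a stand-in for the I/O-thread descriptor:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <unistd.h>

/* Drain completed-request "pointers" until the descriptor would block,
 * then report anything other than EAGAIN.                             */
static void drain_completions(int fd)
{
	void *batch[64];
	ssize_t len;

	while ((len = read(fd, batch, sizeof(batch))) > 0) {
		for (size_t i = 0; i < (size_t)len / sizeof(void *); i++)
			printf("completing request %p\n", batch[i]);
	}

	if (len < 0 && errno != EAGAIN)
		fprintf(stderr, "spurious wakeup, err = %d\n", errno);
}

int main(void)
{
	int pfd[2];
	void *reqs[2] = { (void *)0x1, (void *)0x2 };	/* stand-in pointers */

	if (pipe2(pfd, O_NONBLOCK) != 0)
		return 1;
	if (write(pfd[1], reqs, sizeof(reqs)) < 0)
		return 1;
	drain_completions(pfd[0]);
	return 0;
}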
+ */ + IBRS_ENTER + UNTRAIN_RET + CLEAR_BRANCH_HISTORY + movq %rsp, %rdi call do_SYSENTER_32 jmp sysret32_from_system_call diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h new file mode 100644 index 000000000000..24e3a53ca255 --- /dev/null +++ b/arch/x86/include/asm/runtime-const.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RUNTIME_CONST_H +#define _ASM_RUNTIME_CONST_H + +#define runtime_const_ptr(sym) ({ \ + typeof(sym) __ret; \ + asm_inline("mov %1,%0\n1:\n" \ + ".pushsection runtime_ptr_" #sym ",\"a\"\n\t" \ + ".long 1b - %c2 - .\n\t" \ + ".popsection" \ + :"=r" (__ret) \ + :"i" ((unsigned long)0x0123456789abcdefull), \ + "i" (sizeof(long))); \ + __ret; }) + +// The 'typeof' will create at _least_ a 32-bit type, but +// will happily also take a bigger type and the 'shrl' will +// clear the upper bits +#define runtime_const_shift_right_32(val, sym) ({ \ + typeof(0u+(val)) __ret = (val); \ + asm_inline("shrl $12,%k0\n1:\n" \ + ".pushsection runtime_shift_" #sym ",\"a\"\n\t" \ + ".long 1b - 1 - .\n\t" \ + ".popsection" \ + :"+r" (__ret)); \ + __ret; }) + +#define runtime_const_init(type, sym) do { \ + extern s32 __start_runtime_##type##_##sym[]; \ + extern s32 __stop_runtime_##type##_##sym[]; \ + runtime_const_fixup(__runtime_fixup_##type, \ + (unsigned long)(sym), \ + __start_runtime_##type##_##sym, \ + __stop_runtime_##type##_##sym); \ +} while (0) + +/* + * The text patching is trivial - you can only do this at init time, + * when the text section hasn't been marked RO, and before the text + * has ever been executed. + */ +static inline void __runtime_fixup_ptr(void *where, unsigned long val) +{ + *(unsigned long *)where = val; +} + +static inline void __runtime_fixup_shift(void *where, unsigned long val) +{ + *(unsigned char *)where = val; +} + +static inline void runtime_const_fixup(void (*fn)(void *, unsigned long), + unsigned long val, s32 *start, s32 *end) +{ + while (start < end) { + fn(*start + (void *)start, val); + start++; + } +} + +#endif diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 405efb3e4996..94408a784c8e 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -28,9 +28,6 @@ static inline cycles_t get_cycles(void) } #define get_cycles get_cycles -extern struct system_counterval_t convert_art_to_tsc(u64 art); -extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); - extern void tsc_early_init(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index 0ef36190abe6..b2d2df026f6e 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -328,9 +328,8 @@ static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, * due to unsigned comparison. * * Due to the MSB/Sign-bit being used as invalid marker (see - * arch_vdso_cycles_valid() above), the effective mask is S64_MAX, - * but that case is also unlikely and will also take the unlikely path - * here. + * arch_vdso_cycles_ok() above), the effective mask is S64_MAX, but that + * case is also unlikely and will also take the unlikely path here. 
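The x86 runtime_const_fixup() walk above relies on a table of self-relative s32 entries: each entry stores the distance from itself to the location that needs patching, exactly what the ".long 1b - ." directives record. The userspace sketch below demonstrates only that addressing trick; the slot and table names are invented for illustration.

#include <stdint.h>
#include <stdio.h>

/* Write the "constant" into the patch site, like __runtime_fixup_ptr() */
static void fixup(void *where, unsigned long val)
{
	*(unsigned long *)where = val;
}

int main(void)
{
	unsigned long slots[2];		/* stand-ins for patched immediates */
	int32_t table[2];

	/* Build self-relative entries, as ".long 1b - ." does at assembly time */
	table[0] = (int32_t)((uintptr_t)&slots[0] - (uintptr_t)&table[0]);
	table[1] = (int32_t)((uintptr_t)&slots[1] - (uintptr_t)&table[1]);

	/* Walk the table the way runtime_const_fixup() does */
	for (int32_t *p = table; p < table + 2; p++)
		fixup((char *)p + *p, 0x12345678ul);

	printf("%lx %lx\n", slots[0], slots[1]);
	return 0;
}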
*/ if (unlikely(delta > vd->max_cycles)) { /* diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h index be199a9b2676..93226281b450 100644 --- a/arch/x86/include/asm/vdso/vsyscall.h +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -4,7 +4,6 @@ #ifndef __ASSEMBLY__ -#include <linux/hrtimer.h> #include <linux/timekeeper_internal.h> #include <vdso/datapage.h> #include <asm/vgtod.h> diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 7aa38b2ad8a9..a0ce291abcae 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -14,11 +14,6 @@ #include <uapi/linux/time.h> -#ifdef BUILD_VDSO32_64 -typedef u64 gtod_long_t; -#else -typedef unsigned long gtod_long_t; -#endif #endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h index e8d7d4941c4c..422a47746657 100644 --- a/arch/x86/include/asm/word-at-a-time.h +++ b/arch/x86/include/asm/word-at-a-time.h @@ -5,45 +5,12 @@ #include <linux/bitops.h> #include <linux/wordpart.h> -/* - * This is largely generic for little-endian machines, but the - * optimal byte mask counting is probably going to be something - * that is architecture-specific. If you have a reliably fast - * bit count instruction, that might be better than the multiply - * and shift, for example. - */ struct word_at_a_time { const unsigned long one_bits, high_bits; }; #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } -#ifdef CONFIG_64BIT - -/* - * Jan Achrenius on G+: microoptimized version of - * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" - * that works for the bytemasks without having to - * mask them first. - */ -static inline long count_masked_bytes(unsigned long mask) -{ - return mask*0x0001020304050608ul >> 56; -} - -#else /* 32-bit case */ - -/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ -static inline long count_masked_bytes(long mask) -{ - /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ - long a = (0x0ff0001+mask) >> 23; - /* Fix the 1 for 00 case */ - return a & mask; -} - -#endif - /* Return nonzero if it has a zero */ static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) { @@ -57,6 +24,22 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, return bits; } +#ifdef CONFIG_64BIT + +/* Keep the initial has_zero() value for both bitmask and size calc */ +#define create_zero_mask(bits) (bits) + +static inline unsigned long zero_bytemask(unsigned long bits) +{ + bits = (bits - 1) & ~bits; + return bits >> 7; +} + +#define find_zero(bits) (__ffs(bits) >> 3) + +#else + +/* Create the final mask for both bytemask and size */ static inline unsigned long create_zero_mask(unsigned long bits) { bits = (bits - 1) & ~bits; @@ -66,11 +49,17 @@ static inline unsigned long create_zero_mask(unsigned long bits) /* The mask we created is directly usable as a bytemask */ #define zero_bytemask(mask) (mask) +/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ static inline unsigned long find_zero(unsigned long mask) { - return count_masked_bytes(mask); + /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ + long a = (0x0ff0001+mask) >> 23; + /* Fix the 1 for 00 case */ + return a & mask; } +#endif + /* * Load an unaligned word from kernel space. 
* diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 06b170759e5b..d4462fb26299 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -50,9 +50,9 @@ int tsc_clocksource_reliable; static int __read_mostly tsc_force_recalibrate; -static u32 art_to_tsc_numerator; -static u32 art_to_tsc_denominator; -static u64 art_to_tsc_offset; +static struct clocksource_base art_base_clk = { + .id = CSID_X86_ART, +}; static bool have_art; struct cyc2ns { @@ -1074,7 +1074,7 @@ core_initcall(cpufreq_register_tsc_scaling); */ static void __init detect_art(void) { - unsigned int unused[2]; + unsigned int unused; if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; @@ -1089,13 +1089,14 @@ static void __init detect_art(void) tsc_async_resets) return; - cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, - &art_to_tsc_numerator, unused, unused+1); + cpuid(ART_CPUID_LEAF, &art_base_clk.denominator, + &art_base_clk.numerator, &art_base_clk.freq_khz, &unused); - if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) + art_base_clk.freq_khz /= KHZ; + if (art_base_clk.denominator < ART_MIN_DENOMINATOR) return; - rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); + rdmsrl(MSR_IA32_TSC_ADJUST, art_base_clk.offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); @@ -1296,67 +1297,6 @@ int unsynchronized_tsc(void) return 0; } -/* - * Convert ART to TSC given numerator/denominator found in detect_art() - */ -struct system_counterval_t convert_art_to_tsc(u64 art) -{ - u64 tmp, res, rem; - - rem = do_div(art, art_to_tsc_denominator); - - res = art * art_to_tsc_numerator; - tmp = rem * art_to_tsc_numerator; - - do_div(tmp, art_to_tsc_denominator); - res += tmp + art_to_tsc_offset; - - return (struct system_counterval_t) { - .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC, - .cycles = res, - }; -} -EXPORT_SYMBOL(convert_art_to_tsc); - -/** - * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC. - * @art_ns: ART (Always Running Timer) in unit of nanoseconds - * - * PTM requires all timestamps to be in units of nanoseconds. When user - * software requests a cross-timestamp, this function converts system timestamp - * to TSC. - * - * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set - * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check - * that this flag is set before conversion to TSC is attempted. - * - * Return: - * struct system_counterval_t - system counter value with the ID of the - * corresponding clocksource: - * cycles: System counter value - * cs_id: The clocksource ID for validating comparability - */ - -struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) -{ - u64 tmp, res, rem; - - rem = do_div(art_ns, USEC_PER_SEC); - - res = art_ns * tsc_khz; - tmp = rem * tsc_khz; - - do_div(tmp, USEC_PER_SEC); - res += tmp; - - return (struct system_counterval_t) { - .cs_id = have_art ? 
CSID_X86_TSC : CSID_GENERIC, - .cycles = res, - }; -} -EXPORT_SYMBOL(convert_art_ns_to_tsc); - - static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** @@ -1458,8 +1398,10 @@ out: if (tsc_unstable) goto unreg; - if (boot_cpu_has(X86_FEATURE_ART)) + if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; + clocksource_tsc.base = &art_base_clk; + } clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); @@ -1484,8 +1426,10 @@ static int __init init_tsc_clocksource(void) * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { - if (boot_cpu_has(X86_FEATURE_ART)) + if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; + clocksource_tsc.base = &art_base_clk; + } clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); @@ -1509,10 +1453,12 @@ static bool __init determine_cpu_tsc_frequencies(bool early) if (early) { cpu_khz = x86_platform.calibrate_cpu(); - if (tsc_early_khz) + if (tsc_early_khz) { tsc_khz = tsc_early_khz; - else + } else { tsc_khz = x86_platform.calibrate_tsc(); + clocksource_tsc.freq_khz = tsc_khz; + } } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3509afc6a672..6e73403e874f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -357,6 +357,9 @@ SECTIONS PERCPU_SECTION(INTERNODE_CACHE_BYTES) #endif + RUNTIME_CONST(shift, d_hash_shift) + RUNTIME_CONST(ptr, dentry_hashtable) + . = ALIGN(PAGE_SIZE); /* freed after init ends here */ diff --git a/arch/xtensa/include/asm/current.h b/arch/xtensa/include/asm/current.h index 08010dbf5e09..df275d554788 100644 --- a/arch/xtensa/include/asm/current.h +++ b/arch/xtensa/include/asm/current.h @@ -19,7 +19,7 @@ struct task_struct; -static inline struct task_struct *get_current(void) +static __always_inline struct task_struct *get_current(void) { return current_thread_info()->task; } diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 326db1c1d5d8..e0dffcc43b9e 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -91,7 +91,7 @@ struct thread_info { } /* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) +static __always_inline struct thread_info *current_thread_info(void) { struct thread_info *ti; __asm__("extui %0, a1, 0, "__stringify(CURRENT_SHIFT)"\n\t" diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index defc67909a9c..d6d2b533a574 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -263,6 +263,9 @@ static const struct proc_ops simdisk_proc_ops = { static int __init simdisk_setup(struct simdisk *dev, int which, struct proc_dir_entry *procdir) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; char tmp[2] = { '0' + which, 0 }; int err; @@ -271,7 +274,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which, spin_lock_init(&dev->lock); dev->users = 0; - dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE); + dev->gd = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(dev->gd)) { err = PTR_ERR(dev->gd); goto out; diff --git a/block/Kconfig b/block/Kconfig index 
dc12af58dbae..5b623b876d3b 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -62,6 +62,8 @@ config BLK_DEV_BSGLIB config BLK_DEV_INTEGRITY bool "Block layer data integrity support" + select CRC_T10DIF + select CRC64_ROCKSOFT help Some storage devices allow extra information to be stored/retrieved to help protect the data. The block layer @@ -72,12 +74,6 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. -config BLK_DEV_INTEGRITY_T10 - tristate - depends on BLK_DEV_INTEGRITY - select CRC_T10DIF - select CRC64_ROCKSOFT - config BLK_DEV_WRITE_MOUNTED bool "Allow writing to mounted block devices" default y diff --git a/block/Makefile b/block/Makefile index 168150b9c510..ddfd21c1a9ff 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,8 +26,7 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o -obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o diff --git a/block/bdev.c b/block/bdev.c index 353677ac49b3..c5507b6f63b8 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -385,7 +385,7 @@ static struct file_system_type bd_type = { }; struct super_block *blockdev_superblock __ro_after_init; -struct vfsmount *blockdev_mnt __ro_after_init; +static struct vfsmount *blockdev_mnt __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) @@ -1260,23 +1260,42 @@ void sync_bdevs(bool wait) } /* - * Handle STATX_DIOALIGN for block devices. - * - * Note that the inode passed to this is the inode of a block device node file, - * not the block device's internal inode. Therefore it is *not* valid to use - * I_BDEV() here; the block device has to be looked up by i_rdev instead. + * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices. */ -void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +void bdev_statx(struct path *path, struct kstat *stat, + u32 request_mask) { + struct inode *backing_inode; struct block_device *bdev; - bdev = blkdev_get_no_open(inode->i_rdev); + if (!(request_mask & (STATX_DIOALIGN | STATX_WRITE_ATOMIC))) + return; + + backing_inode = d_backing_inode(path->dentry); + + /* + * Note that backing_inode is the inode of a block device node file, + * not the block device's internal inode. Therefore it is *not* valid + * to use I_BDEV() here; the block device has to be looked up by i_rdev + * instead. 
+ */ + bdev = blkdev_get_no_open(backing_inode->i_rdev); if (!bdev) return; - stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; - stat->dio_offset_align = bdev_logical_block_size(bdev); - stat->result_mask |= STATX_DIOALIGN; + if (request_mask & STATX_DIOALIGN) { + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + stat->result_mask |= STATX_DIOALIGN; + } + + if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) { + struct request_queue *bd_queue = bdev->bd_queue; + + generic_fill_statx_atomic_writes(stat, + queue_atomic_write_unit_min_bytes(bd_queue), + queue_atomic_write_unit_max_bytes(bd_queue)); + } blkdev_put_no_open(bdev); } diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index d442ee358fc2..b758693697c0 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -797,57 +797,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) */ bfq_link_bfqg(bfqd, bfqg); __bfq_bic_change_cgroup(bfqd, bic, bfqg); - /* - * Update blkg_path for bfq_log_* functions. We cache this - * path, and update it here, for the following - * reasons. Operations on blkg objects in blk-cgroup are - * protected with the request_queue lock, and not with the - * lock that protects the instances of this scheduler - * (bfqd->lock). This exposes BFQ to the following sort of - * race. - * - * The blkg_lookup performed in bfq_get_queue, protected - * through rcu, may happen to return the address of a copy of - * the original blkg. If this is the case, then the - * bfqg_and_blkg_get performed in bfq_get_queue, to pin down - * the blkg, is useless: it does not prevent blk-cgroup code - * from destroying both the original blkg and all objects - * directly or indirectly referred by the copy of the - * blkg. - * - * On the bright side, destroy operations on a blkg invoke, as - * a first step, hooks of the scheduler associated with the - * blkg. And these hooks are executed with bfqd->lock held for - * BFQ. As a consequence, for any blkg associated with the - * request queue this instance of the scheduler is attached - * to, we are guaranteed that such a blkg is not destroyed, and - * that all the pointers it contains are consistent, while we - * are holding bfqd->lock. A blkg_lookup performed with - * bfqd->lock held then returns a fully consistent blkg, which - * remains consistent until this lock is held. - * - * Thanks to the last fact, and to the fact that: (1) bfqg has - * been obtained through a blkg_lookup in the above - * assignment, and (2) bfqd->lock is being held, here we can - * safely use the policy data for the involved blkg (i.e., the - * field bfqg->pd) to get to the blkg associated with bfqg, - * and then we can safely use any field of blkg. After we - * release bfqd->lock, even just getting blkg through this - * bfqg may cause dangling references to be traversed, as - * bfqg->pd may not exist any more. - * - * In view of the above facts, here we cache, in the bfqg, any - * blkg data we may need for this bic, and for its associated - * bfq_queue. As of now, we need to cache only the path of the - * blkg, which is used in the bfq_log_* functions. - * - * Finally, note that bfqg itself needs to be protected from - * destruction on the blkg_free of the original blkg (which - * invokes bfq_pd_free). We use an additional private - * refcounter for bfqg, to let it disappear only after no - * bfq_queue refers to it any longer. 
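bdev_statx() above fills the STATX_DIOALIGN (and, when the device supports it, STATX_WRITE_ATOMIC) fields for block device nodes. Assuming a kernel and libc new enough to expose STATX_DIOALIGN, a userspace caller could query the reported alignment roughly as follows:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <block-device>\n", argv[0]);
		return 1;
	}
	if (statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx) != 0) {
		perror("statx");
		return 1;
	}
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("dio mem align %u, dio offset align %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}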
- */ - blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); bic->blkcg_serial_nr = serial_nr; } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4b88a54a9b76..36a4998c4b37 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5463,40 +5463,42 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, } } +static void _bfq_exit_icq(struct bfq_io_cq *bic, unsigned int num_actuators) +{ + struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; + unsigned int act_idx; + + for (act_idx = 0; act_idx < num_actuators; act_idx++) { + if (bfqq_data[act_idx].stable_merge_bfqq) + bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); + + bfq_exit_icq_bfqq(bic, true, act_idx); + bfq_exit_icq_bfqq(bic, false, act_idx); + } +} + static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); struct bfq_data *bfqd = bic_to_bfqd(bic); unsigned long flags; - unsigned int act_idx; + /* * If bfqd and thus bfqd->num_actuators is not available any * longer, then cycle over all possible per-actuator bfqqs in * next loop. We rely on bic being zeroed on creation, and * therefore on its unused per-actuator fields being NULL. - */ - unsigned int num_actuators = BFQ_MAX_ACTUATORS; - struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; - - /* + * * bfqd is NULL if scheduler already exited, and in that case * this is the last time these queues are accessed. */ if (bfqd) { spin_lock_irqsave(&bfqd->lock, flags); - num_actuators = bfqd->num_actuators; - } - - for (act_idx = 0; act_idx < num_actuators; act_idx++) { - if (bfqq_data[act_idx].stable_merge_bfqq) - bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); - - bfq_exit_icq_bfqq(bic, true, act_idx); - bfq_exit_icq_bfqq(bic, false, act_idx); - } - - if (bfqd) + _bfq_exit_icq(bic, bfqd->num_actuators); spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + _bfq_exit_icq(bic, BFQ_MAX_ACTUATORS); + } } /* diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 467e8cfc41a2..08ddf2cfae5b 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1003,9 +1003,6 @@ struct bfq_group { /* must be the first member */ struct blkg_policy_data pd; - /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ - char blkg_path[128]; - /* reference counter (see comments in bfq_bic_update_cgroup) */ refcount_t ref; diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 8b528e12136f..b78c145eb026 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -76,7 +76,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; - } else { + } else if (nr_vecs) { bip->bip_vec = bip->bip_inline_vecs; } @@ -276,6 +276,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER; bip->bip_iter.bi_sector = seed; + bip->bip_vcnt = nr_vecs; return 0; free_bip: bio_integrity_free(bio); @@ -297,6 +298,7 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, bip->bip_flags |= BIP_INTEGRITY_USER; bip->bip_iter.bi_sector = seed; bip->bip_iter.bi_size = len; + bip->bip_vcnt = nr_vecs; return 0; } @@ -334,7 +336,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, u32 seed) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int align = q->dma_pad_mask | queue_dma_alignment(q); + unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page 
*stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; unsigned int direction, nr_bvecs; @@ -397,44 +399,6 @@ free_bvec: EXPORT_SYMBOL_GPL(bio_integrity_map_user); /** - * bio_integrity_process - Process integrity metadata for a bio - * @bio: bio to generate/verify integrity metadata for - * @proc_iter: iterator to process - * @proc_fn: Pointer to the relevant processing function - */ -static blk_status_t bio_integrity_process(struct bio *bio, - struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) -{ - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; - struct bio_integrity_payload *bip = bio_integrity(bio); - blk_status_t ret = BLK_STS_OK; - - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.tuple_size = bi->tuple_size; - iter.seed = proc_iter->bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - iter.pi_offset = bi->pi_offset; - - __bio_for_each_segment(bv, bio, bviter, *proc_iter) { - void *kaddr = bvec_kmap_local(&bv); - - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - ret = proc_fn(&iter); - kunmap_local(kaddr); - - if (ret) - break; - - } - return ret; -} - -/** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare * @@ -450,17 +414,13 @@ bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + unsigned int len; void *buf; - unsigned long start, end; - unsigned int len, nr_pages; - unsigned int bytes, offset, i; + gfp_t gfp = GFP_NOIO; if (!bi) return true; - if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) - return true; - if (!bio_sectors(bio)) return true; @@ -468,32 +428,36 @@ bool bio_integrity_prep(struct bio *bio) if (bio_integrity(bio)) return true; - if (bio_data_dir(bio) == READ) { - if (!bi->profile->verify_fn || - !(bi->flags & BLK_INTEGRITY_VERIFY)) + switch (bio_op(bio)) { + case REQ_OP_READ: + if (bi->flags & BLK_INTEGRITY_NOVERIFY) return true; - } else { - if (!bi->profile->generate_fn || - !(bi->flags & BLK_INTEGRITY_GENERATE)) + break; + case REQ_OP_WRITE: + if (bi->flags & BLK_INTEGRITY_NOGENERATE) return true; + + /* + * Zero the memory allocated to not leak uninitialized kernel + * memory to disk for non-integrity metadata where nothing else + * initializes the memory. 
+ */ + if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) + gfp |= __GFP_ZERO; + break; + default: + return true; } /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, GFP_NOIO); + buf = kmalloc(len, gfp); if (unlikely(buf == NULL)) { - printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; } - end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ((unsigned long) buf) >> PAGE_SHIFT; - nr_pages = end - start; - - /* Allocate bio integrity payload and integrity vectors */ - bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + bip = bio_integrity_alloc(bio, GFP_NOIO, 1); if (IS_ERR(bip)) { - printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); goto err_end_io; } @@ -501,35 +465,20 @@ bool bio_integrity_prep(struct bio *bio) bip->bip_flags |= BIP_BLOCK_INTEGRITY; bip_set_seed(bip, bio->bi_iter.bi_sector); - if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bip->bip_flags |= BIP_IP_CHECKSUM; - /* Map it */ - offset = offset_in_page(buf); - for (i = 0; i < nr_pages && len > 0; i++) { - bytes = PAGE_SIZE - offset; - - if (bytes > len) - bytes = len; - - if (bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset) < bytes) { - printk(KERN_ERR "could not attach integrity payload\n"); - goto err_end_io; - } - - buf += bytes; - len -= bytes; - offset = 0; + if (bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)) < len) { + printk(KERN_ERR "could not attach integrity payload\n"); + goto err_end_io; } /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) { - bio_integrity_process(bio, &bio->bi_iter, - bi->profile->generate_fn); - } else { + if (bio_data_dir(bio) == WRITE) + blk_integrity_generate(bio); + else bip->bio_iter = bio->bi_iter; - } return true; err_end_io: @@ -552,15 +501,8 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - /* - * At the moment verify is called bio's iterator was advanced - * during split and completion, we need to rewind iterator to - * it's original position. 
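bio_integrity_prep() above sizes the protection buffer as one metadata tuple per protection interval covered by the data. The arithmetic sketch below is only illustrative (the helper name and example numbers are made up), but it shows the relationship between sector count, interval size, and tuple size.

#include <stdio.h>

/* One tuple of protection information per interval covered by the data */
static unsigned int pi_bytes(unsigned int sectors_512,
			     unsigned int interval_exp,
			     unsigned int tuple_size)
{
	unsigned int intervals = sectors_512 >> (interval_exp - 9);

	return intervals * tuple_size;
}

int main(void)
{
	/* 256 KiB of data, 4096-byte intervals, 8-byte tuples -> 512 bytes */
	printf("%u\n", pi_bytes(512, 12, 8));
	return 0;
}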
- */ - bio->bi_status = bio_integrity_process(bio, &bip->bio_iter, - bi->profile->verify_fn); + blk_integrity_verify(bio); bio_integrity_free(bio); bio_endio(bio); } @@ -582,7 +524,7 @@ bool __bio_integrity_endio(struct bio *bio) struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && - (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) { + (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->csum_type) { INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bip->bip_work); return false; @@ -642,14 +584,11 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); + bip = bio_integrity_alloc(bio, gfp_mask, 0); if (IS_ERR(bip)) return PTR_ERR(bip); - memcpy(bip->bip_vec, bip_src->bip_vec, - bip_src->bip_vcnt * sizeof(struct bio_vec)); - - bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; diff --git a/block/bio.c b/block/bio.c index e9e809a63c59..a3b1b2266c50 100644 --- a/block/bio.c +++ b/block/bio.c @@ -953,7 +953,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, bool *same_page) { unsigned long mask = queue_segment_boundary(q); - phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; + phys_addr_t addr1 = bvec_phys(bv); phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 90b3959d88cf..bd472a30bc61 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -301,19 +301,6 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) } /** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); -} - -/** * blkg_get - get a blkg reference * @blkg: blkg to get * diff --git a/block/blk-core.c b/block/blk-core.c index 82c3ae22d76d..02bceeb36f2c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -94,20 +94,6 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) } EXPORT_SYMBOL(blk_queue_flag_clear); -/** - * blk_queue_flag_test_and_set - atomically test and set a queue flag - * @flag: flag to be set - * @q: request queue - * - * Returns the previous value of @flag - 0 if the flag was not set and 1 if - * the flag was already set. 
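The bvec_phys() change above keeps the existing segment-boundary test in bvec_try_merge_hw_page(): two physical addresses fall inside the same boundary window exactly when OR-ing each with the boundary mask (a power of two minus one) yields the same value. A standalone illustration of that mask trick, names illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool same_boundary_window(uint64_t addr1, uint64_t addr2, uint64_t mask)
{
        /* the bits above the mask must agree for the two addresses */
        return (addr1 | mask) == (addr2 | mask);
}

int main(void)
{
        uint64_t mask = 0xffff; /* 64 KiB segment boundary */

        printf("%d\n", same_boundary_window(0x10000, 0x1ffff, mask)); /* 1 */
        printf("%d\n", same_boundary_window(0x1ffff, 0x20000, mask)); /* 0 */
        return 0;
}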
- */ -bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) -{ - return test_and_set_bit(flag, &q->queue_flags); -} -EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); - #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), @@ -174,6 +160,8 @@ static const struct { /* Command duration limit device-side timeout */ [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, + [BLK_STS_INVAL] = { -EINVAL, "invalid" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -739,6 +727,18 @@ void submit_bio_noacct_nocheck(struct bio *bio) __submit_bio_noacct(bio); } +static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, + struct bio *bio) +{ + if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) + return BLK_STS_INVAL; + + if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) + return BLK_STS_INVAL; + + return BLK_STS_OK; +} + /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. @@ -782,7 +782,7 @@ void submit_bio_noacct(struct bio *bio) if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && bio_op(bio) != REQ_OP_ZONE_APPEND)) goto end_io; - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + if (!bdev_write_cache(bdev)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!bio_sectors(bio)) { status = BLK_STS_OK; @@ -791,12 +791,17 @@ void submit_bio_noacct(struct bio *bio) } } - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!(q->limits.features & BLK_FEAT_POLL)) bio_clear_polled(bio); switch (bio_op(bio)) { case REQ_OP_READ: case REQ_OP_WRITE: + if (bio->bi_opf & REQ_ATOMIC) { + status = blk_validate_atomic_write_op_size(q, bio); + if (status != BLK_STS_OK) + goto end_io; + } break; case REQ_OP_FLUSH: /* @@ -825,11 +830,8 @@ void submit_bio_noacct(struct bio *bio) case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: - if (!bdev_is_zoned(bio->bi_bdev)) - goto not_supported; - break; case REQ_OP_ZONE_RESET_ALL: - if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q)) + if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; case REQ_OP_DRV_IN: @@ -915,8 +917,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) return 0; q = bdev_get_queue(bdev); - if (cookie == BLK_QC_T_NONE || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL)) return 0; blk_flush_plug(current->plug, false); diff --git a/block/blk-flush.c b/block/blk-flush.c index cca4f9131f79..a72e2a83d075 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -100,23 +100,6 @@ blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; } -static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) -{ - unsigned int policy = 0; - - if (blk_rq_sectors(rq)) - policy |= REQ_FSEQ_DATA; - - if (fflags & (1UL << QUEUE_FLAG_WC)) { - if (rq->cmd_flags & REQ_PREFLUSH) - policy |= REQ_FSEQ_PREFLUSH; - if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && - (rq->cmd_flags & REQ_FUA)) - policy |= REQ_FSEQ_POSTFLUSH; - } - return policy; -} - static unsigned int blk_flush_cur_seq(struct request *rq) { return 1 << ffz(rq->flush.seq); @@ -399,19 +382,32 @@ static void blk_rq_init_flush(struct request *rq) bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; - unsigned long fflags = 
q->queue_flags; /* may change, cache */ - unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + bool supports_fua = q->limits.features & BLK_FEAT_FUA; + unsigned int policy = 0; /* FLUSH/FUA request must never be merged */ WARN_ON_ONCE(rq->bio != rq->biotail); + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; + + /* + * Check which flushes we need to sequence for this operation. + */ + if (blk_queue_write_cache(q)) { + if (rq->cmd_flags & REQ_PREFLUSH) + policy |= REQ_FSEQ_PREFLUSH; + if ((rq->cmd_flags & REQ_FUA) && !supports_fua) + policy |= REQ_FSEQ_POSTFLUSH; + } + /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. */ rq->cmd_flags &= ~REQ_PREFLUSH; - if (!(fflags & (1UL << QUEUE_FLAG_FUA))) + if (!supports_fua) rq->cmd_flags &= ~REQ_FUA; /* diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ccbeb6dfa87a..010decc892ea 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -107,60 +107,6 @@ new_segment: } EXPORT_SYMBOL(blk_rq_map_integrity_sg); -/** - * blk_integrity_compare - Compare integrity profile of two disks - * @gd1: Disk to compare - * @gd2: Disk to compare - * - * Description: Meta-devices like DM and MD need to verify that all - * sub-devices use the same integrity format before advertising to - * upper layers that they can send/receive integrity metadata. This - * function can be used to check whether two gendisk devices have - * compatible integrity formats. - */ -int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) -{ - struct blk_integrity *b1 = &gd1->queue->integrity; - struct blk_integrity *b2 = &gd2->queue->integrity; - - if (!b1->profile && !b2->profile) - return 0; - - if (!b1->profile || !b2->profile) - return -1; - - if (b1->interval_exp != b2->interval_exp) { - pr_err("%s: %s/%s protection interval %u != %u\n", - __func__, gd1->disk_name, gd2->disk_name, - 1 << b1->interval_exp, 1 << b2->interval_exp); - return -1; - } - - if (b1->tuple_size != b2->tuple_size) { - pr_err("%s: %s/%s tuple sz %u != %u\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->tuple_size, b2->tuple_size); - return -1; - } - - if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { - pr_err("%s: %s/%s tag sz %u != %u\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->tag_size, b2->tag_size); - return -1; - } - - if (b1->profile != b2->profile) { - pr_err("%s: %s/%s type %s != %s\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->profile->name, b2->profile->name); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(blk_integrity_compare); - bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, struct request *next) { @@ -214,7 +160,64 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, static inline struct blk_integrity *dev_to_bi(struct device *dev) { - return &dev_to_disk(dev)->queue->integrity; + return &dev_to_disk(dev)->queue->limits.integrity; +} + +const char *blk_integrity_profile_name(struct blk_integrity *bi) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "T10-DIF-TYPE1-IP"; + return "T10-DIF-TYPE3-IP"; + case BLK_INTEGRITY_CSUM_CRC: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "T10-DIF-TYPE1-CRC"; + return "T10-DIF-TYPE3-CRC"; + case BLK_INTEGRITY_CSUM_CRC64: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "EXT-DIF-TYPE1-CRC64"; + return "EXT-DIF-TYPE3-CRC64"; + case BLK_INTEGRITY_CSUM_NONE: 
+ break; + } + + return "nop"; +} +EXPORT_SYMBOL_GPL(blk_integrity_profile_name); + +static ssize_t flag_store(struct device *dev, const char *page, size_t count, + unsigned char flag) +{ + struct request_queue *q = dev_to_disk(dev)->queue; + struct queue_limits lim; + unsigned long val; + int err; + + err = kstrtoul(page, 10, &val); + if (err) + return err; + + /* note that the flags are inverted vs the values in the sysfs files */ + lim = queue_limits_start_update(q); + if (val) + lim.integrity.flags &= ~flag; + else + lim.integrity.flags |= flag; + + blk_mq_freeze_queue(q); + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + if (err) + return err; + return count; +} + +static ssize_t flag_show(struct device *dev, char *page, unsigned char flag) +{ + struct blk_integrity *bi = dev_to_bi(dev); + + return sysfs_emit(page, "%d\n", !(bi->flags & flag)); } static ssize_t format_show(struct device *dev, struct device_attribute *attr, @@ -222,9 +225,9 @@ static ssize_t format_show(struct device *dev, struct device_attribute *attr, { struct blk_integrity *bi = dev_to_bi(dev); - if (bi->profile && bi->profile->name) - return sysfs_emit(page, "%s\n", bi->profile->name); - return sysfs_emit(page, "none\n"); + if (!bi->tuple_size) + return sysfs_emit(page, "none\n"); + return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi)); } static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr, @@ -249,49 +252,26 @@ static ssize_t read_verify_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = dev_to_bi(dev); - char *p = (char *) page; - unsigned long val = simple_strtoul(p, &p, 10); - - if (val) - bi->flags |= BLK_INTEGRITY_VERIFY; - else - bi->flags &= ~BLK_INTEGRITY_VERIFY; - - return count; + return flag_store(dev, page, count, BLK_INTEGRITY_NOVERIFY); } static ssize_t read_verify_show(struct device *dev, struct device_attribute *attr, char *page) { - struct blk_integrity *bi = dev_to_bi(dev); - - return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_VERIFY)); + return flag_show(dev, page, BLK_INTEGRITY_NOVERIFY); } static ssize_t write_generate_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = dev_to_bi(dev); - - char *p = (char *) page; - unsigned long val = simple_strtoul(p, &p, 10); - - if (val) - bi->flags |= BLK_INTEGRITY_GENERATE; - else - bi->flags &= ~BLK_INTEGRITY_GENERATE; - - return count; + return flag_store(dev, page, count, BLK_INTEGRITY_NOGENERATE); } static ssize_t write_generate_show(struct device *dev, struct device_attribute *attr, char *page) { - struct blk_integrity *bi = dev_to_bi(dev); - - return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_GENERATE)); + return flag_show(dev, page, BLK_INTEGRITY_NOGENERATE); } static ssize_t device_is_integrity_capable_show(struct device *dev, @@ -325,81 +305,3 @@ const struct attribute_group blk_integrity_attr_group = { .name = "integrity", .attrs = integrity_attrs, }; - -static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) -{ - return BLK_STS_OK; -} - -static void blk_integrity_nop_prepare(struct request *rq) -{ -} - -static void blk_integrity_nop_complete(struct request *rq, - unsigned int nr_bytes) -{ -} - -static const struct blk_integrity_profile nop_profile = { - .name = "nop", - .generate_fn = blk_integrity_nop_fn, - .verify_fn = blk_integrity_nop_fn, - .prepare_fn = blk_integrity_nop_prepare, - .complete_fn = 
blk_integrity_nop_complete, -}; - -/** - * blk_integrity_register - Register a gendisk as being integrity-capable - * @disk: struct gendisk pointer to make integrity-aware - * @template: block integrity profile to register - * - * Description: When a device needs to advertise itself as being able to - * send/receive integrity metadata it must use this function to register - * the capability with the block layer. The template is a blk_integrity - * struct with values appropriate for the underlying hardware. See - * Documentation/block/data-integrity.rst. - */ -void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | - template->flags; - bi->interval_exp = template->interval_exp ? : - ilog2(queue_logical_block_size(disk->queue)); - bi->profile = template->profile ? template->profile : &nop_profile; - bi->tuple_size = template->tuple_size; - bi->tag_size = template->tag_size; - bi->pi_offset = template->pi_offset; - - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); - -#ifdef CONFIG_BLK_INLINE_ENCRYPTION - if (disk->queue->crypto_profile) { - pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - disk->queue->crypto_profile = NULL; - } -#endif -} -EXPORT_SYMBOL(blk_integrity_register); - -/** - * blk_integrity_unregister - Unregister block integrity profile - * @disk: disk whose integrity profile to unregister - * - * Description: This function unregisters the integrity capability from - * a block device. - */ -void blk_integrity_unregister(struct gendisk *disk) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - if (!bi->profile) - return; - - /* ensure all bios are off the integrity workqueue */ - blk_flush_integrity(); - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); - memset(bi, 0, sizeof(*bi)); -} -EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-lib.c b/block/blk-lib.c index 442da9dad042..9f735efa6c94 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -103,38 +103,71 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_discard); -static int __blkdev_issue_write_zeroes(struct block_device *bdev, - sector_t sector, sector_t nr_sects, gfp_t gfp_mask, - struct bio **biop, unsigned flags) +static sector_t bio_write_zeroes_limit(struct block_device *bdev) { - struct bio *bio = *biop; - unsigned int max_sectors; - - if (bdev_read_only(bdev)) - return -EPERM; - - /* Ensure that max_sectors doesn't overflow bi_size */ - max_sectors = bdev_write_zeroes_sectors(bdev); + sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if (max_sectors == 0) - return -EOPNOTSUPP; + return min(bdev_write_zeroes_sectors(bdev), + (UINT_MAX >> SECTOR_SHIFT) & ~bs_mask); +} +static void __blkdev_issue_write_zeroes(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop, unsigned flags) +{ while (nr_sects) { - unsigned int len = min_t(sector_t, nr_sects, max_sectors); + unsigned int len = min_t(sector_t, nr_sects, + bio_write_zeroes_limit(bdev)); + struct bio *bio; + + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) + break; - bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); + bio = bio_alloc(bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; if (flags & BLKDEV_ZERO_NOUNMAP) 
bio->bi_opf |= REQ_NOUNMAP; bio->bi_iter.bi_size = len << SECTOR_SHIFT; + *biop = bio_chain_and_submit(*biop, bio); + nr_sects -= len; sector += len; cond_resched(); } +} - *biop = bio; - return 0; +static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp, unsigned flags) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret = 0; + + blk_start_plug(&plug); + __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, flags); + if (bio) { + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) { + bio_await_chain(bio); + blk_finish_plug(&plug); + return -EINTR; + } + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + + /* + * For some devices there is no non-destructive way to verify whether + * WRITE ZEROES is actually supported. These will clear the capability + * on an I/O error, in which case we'll turn any error into + * "not supported" here. + */ + if (ret && !bdev_write_zeroes_sectors(bdev)) + return -EOPNOTSUPP; + return ret; } /* @@ -150,35 +183,63 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) return min(pages, (sector_t)BIO_MAX_VECS); } -static int __blkdev_issue_zero_pages(struct block_device *bdev, +static void __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, - struct bio **biop) + struct bio **biop, unsigned int flags) { - struct bio *bio = *biop; - int bi_size = 0; - unsigned int sz; - - if (bdev_read_only(bdev)) - return -EPERM; + while (nr_sects) { + unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects); + struct bio *bio; - while (nr_sects != 0) { - bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects), - REQ_OP_WRITE, gfp_mask); + bio = bio_alloc(bdev, nr_vecs, REQ_OP_WRITE, gfp_mask); bio->bi_iter.bi_sector = sector; - while (nr_sects != 0) { - sz = min((sector_t) PAGE_SIZE, nr_sects << 9); - bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); - nr_sects -= bi_size >> 9; - sector += bi_size >> 9; - if (bi_size < sz) + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) + break; + + do { + unsigned int len, added; + + len = min_t(sector_t, + PAGE_SIZE, nr_sects << SECTOR_SHIFT); + added = bio_add_page(bio, ZERO_PAGE(0), len, 0); + if (added < len) break; - } + nr_sects -= added >> SECTOR_SHIFT; + sector += added >> SECTOR_SHIFT; + } while (nr_sects); + + *biop = bio_chain_and_submit(*biop, bio); cond_resched(); } +} - *biop = bio; - return 0; +static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp, unsigned flags) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret = 0; + + if (flags & BLKDEV_ZERO_NOFALLBACK) + return -EOPNOTSUPP; + + blk_start_plug(&plug); + __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio, flags); + if (bio) { + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) { + bio_await_chain(bio); + blk_finish_plug(&plug); + return -EINTR; + } + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + + return ret; } /** @@ -204,20 +265,19 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { - int ret; - sector_t bs_mask; - - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; - - ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, - biop, flags); - if (ret != 
-EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) - return ret; + if (bdev_read_only(bdev)) + return -EPERM; - return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, - biop); + if (bdev_write_zeroes_sectors(bdev)) { + __blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, biop, flags); + } else { + if (flags & BLKDEV_ZERO_NOFALLBACK) + return -EOPNOTSUPP; + __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, + biop, flags); + } + return 0; } EXPORT_SYMBOL(__blkdev_issue_zeroout); @@ -237,51 +297,21 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { - int ret = 0; - sector_t bs_mask; - struct bio *bio; - struct blk_plug plug; - bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); + int ret; - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) + if ((sector | nr_sects) & ((bdev_logical_block_size(bdev) >> 9) - 1)) return -EINVAL; + if (bdev_read_only(bdev)) + return -EPERM; -retry: - bio = NULL; - blk_start_plug(&plug); - if (try_write_zeroes) { - ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, - gfp_mask, &bio, flags); - } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { - ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, - gfp_mask, &bio); - } else { - /* No zeroing offload support */ - ret = -EOPNOTSUPP; - } - if (ret == 0 && bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - } - blk_finish_plug(&plug); - if (ret && try_write_zeroes) { - if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { - try_write_zeroes = false; - goto retry; - } - if (!bdev_write_zeroes_sectors(bdev)) { - /* - * Zeroing offload support was indicated, but the - * device reported ILLEGAL REQUEST (for some devices - * there is no non-destructive way to verify whether - * WRITE ZEROES is actually supported). - */ - ret = -EOPNOTSUPP; - } + if (bdev_write_zeroes_sectors(bdev)) { + ret = blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, flags); + if (ret != -EOPNOTSUPP) + return ret; } - return ret; + return blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, flags); } EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/block/blk-map.c b/block/blk-map.c index 71210cdb3442..bce144091128 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -634,7 +634,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, const struct iov_iter *iter, gfp_t gfp_mask) { bool copy = false, map_bvec = false; - unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); + unsigned long align = blk_lim_dma_alignment_and_pad(&q->limits); struct bio *bio = NULL; struct iov_iter i; int ret = -EINVAL; diff --git a/block/blk-merge.c b/block/blk-merge.c index 8534c35e0497..de5281bcadc5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -154,6 +154,19 @@ static struct bio *bio_split_write_zeroes(struct bio *bio, return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs); } +static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, + bool is_atomic) +{ + /* + * chunk_sectors must be a multiple of atomic_write_boundary_sectors if + * both non-zero. + */ + if (is_atomic && lim->atomic_write_boundary_sectors) + return lim->atomic_write_boundary_sectors; + + return lim->chunk_sectors; +} + /* * Return the maximum number of sectors from the start of a bio that may be * submitted as a single request to a block device. 
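blkdev_issue_zeroout() above first tries the WRITE ZEROES offload and only falls back to writing the zero page when the device turns out not to support it (-EOPNOTSUPP). A sketch of that control flow with stubbed device calls; the stub functions and their behaviour are assumptions for illustration only:

#include <errno.h>
#include <stdio.h>

static int offload_supported = 1;

static int issue_write_zeroes(void)
{
        if (!offload_supported)
                return -EOPNOTSUPP;
        /* pretend the device rejected the command and withdrew support */
        offload_supported = 0;
        return -EOPNOTSUPP;
}

static int issue_zero_pages(void)
{
        return 0;       /* plain writes of the zero page always work */
}

static int zeroout(void)
{
        if (offload_supported) {
                int ret = issue_write_zeroes();

                if (ret != -EOPNOTSUPP)
                        return ret;
        }
        return issue_zero_pages();
}

int main(void)
{
        printf("zeroout() = %d\n", zeroout());
        return 0;
}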
If enough sectors remain, @@ -167,12 +180,23 @@ static inline unsigned get_max_io_size(struct bio *bio, { unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT; unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT; - unsigned max_sectors = lim->max_sectors, start, end; + bool is_atomic = bio->bi_opf & REQ_ATOMIC; + unsigned boundary_sectors = blk_boundary_sectors(lim, is_atomic); + unsigned max_sectors, start, end; - if (lim->chunk_sectors) { + /* + * We ignore lim->max_sectors for atomic writes because it may less + * than the actual bio size, which we cannot tolerate. + */ + if (is_atomic) + max_sectors = lim->atomic_write_max_sectors; + else + max_sectors = lim->max_sectors; + + if (boundary_sectors) { max_sectors = min(max_sectors, - blk_chunk_sectors_left(bio->bi_iter.bi_sector, - lim->chunk_sectors)); + blk_boundary_sectors_left(bio->bi_iter.bi_sector, + boundary_sectors)); } start = bio->bi_iter.bi_sector & (pbs - 1); @@ -185,23 +209,22 @@ static inline unsigned get_max_io_size(struct bio *bio, /** * get_max_segment_size() - maximum number of bytes to add as a single segment * @lim: Request queue limits. - * @start_page: See below. - * @offset: Offset from @start_page where to add a segment. + * @paddr: address of the range to add + * @len: maximum length available to add at @paddr * - * Returns the maximum number of bytes that can be added as a single segment. + * Returns the maximum number of bytes of the range starting at @paddr that can + * be added to a single segment. */ static inline unsigned get_max_segment_size(const struct queue_limits *lim, - struct page *start_page, unsigned long offset) + phys_addr_t paddr, unsigned int len) { - unsigned long mask = lim->seg_boundary_mask; - - offset = mask & (page_to_phys(start_page) + offset); - /* * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 * after having calculated the minimum. */ - return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1; + return min_t(unsigned long, len, + min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr), + (unsigned long)lim->max_segment_size - 1) + 1); } /** @@ -234,9 +257,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, unsigned seg_size = 0; while (len && *nsegs < max_segs) { - seg_size = get_max_segment_size(lim, bv->bv_page, - bv->bv_offset + total_len); - seg_size = min(seg_size, len); + seg_size = get_max_segment_size(lim, bvec_phys(bv) + total_len, len); (*nsegs)++; total_len += seg_size; @@ -305,6 +326,11 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, *segs = nsegs; return NULL; split: + if (bio->bi_opf & REQ_ATOMIC) { + bio->bi_status = BLK_STS_INVAL; + bio_endio(bio); + return ERR_PTR(-EINVAL); + } /* * We can't sanely support splitting for a REQ_NOWAIT bio. End it * with EAGAIN if splitting is required and return an error pointer. 
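get_max_io_size() above caps an I/O at the next chunk or atomic-write boundary via blk_boundary_sectors_left(). The arithmetic reduces to "boundary size minus the offset within the current window"; a standalone sketch with illustrative names:

#include <stdint.h>
#include <stdio.h>

static uint64_t boundary_sectors_left(uint64_t sector, uint64_t boundary)
{
        return boundary - (sector % boundary);
}

int main(void)
{
        /* 16 sectors into a 128-sector window: 112 sectors may still fit */
        printf("%llu\n",
               (unsigned long long)boundary_sectors_left(16, 128));
        return 0;
}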
@@ -465,8 +491,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q, while (nbytes > 0) { unsigned offset = bvec->bv_offset + total; - unsigned len = min(get_max_segment_size(&q->limits, - bvec->bv_page, offset), nbytes); + unsigned len = get_max_segment_size(&q->limits, + bvec_phys(bvec) + total, nbytes); struct page *page = bvec->bv_page; /* @@ -588,18 +614,22 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { struct request_queue *q = rq->q; - unsigned int max_sectors; + struct queue_limits *lim = &q->limits; + unsigned int max_sectors, boundary_sectors; + bool is_atomic = rq->cmd_flags & REQ_ATOMIC; if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; - max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); - if (!q->limits.chunk_sectors || + boundary_sectors = blk_boundary_sectors(lim, is_atomic); + max_sectors = blk_queue_get_max_sectors(rq); + + if (!boundary_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) return max_sectors; return min(max_sectors, - blk_chunk_sectors_left(offset, q->limits.chunk_sectors)); + blk_boundary_sectors_left(offset, boundary_sectors)); } static inline int ll_new_hw_segment(struct request *req, struct bio *bio, @@ -797,6 +827,18 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } +static bool blk_atomic_write_mergeable_rq_bio(struct request *rq, + struct bio *bio) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC); +} + +static bool blk_atomic_write_mergeable_rqs(struct request *rq, + struct request *next) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. @@ -820,6 +862,9 @@ static struct request *attempt_merge(struct request_queue *q, if (req->ioprio != next->ioprio) return NULL; + if (!blk_atomic_write_mergeable_rqs(req, next)) + return NULL; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. 
merge_requests_fn @@ -951,6 +996,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->ioprio != bio_prio(bio)) return false; + if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) + return false; + return true; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 770c0c2b72fa..344f9e503bdb 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -84,28 +84,15 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), - QUEUE_FLAG_NAME(NONROT), - QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(NOXMERGES), - QUEUE_FLAG_NAME(ADD_RANDOM), - QUEUE_FLAG_NAME(SYNCHRONOUS), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), - QUEUE_FLAG_NAME(STABLE_WRITES), - QUEUE_FLAG_NAME(POLL), - QUEUE_FLAG_NAME(WC), - QUEUE_FLAG_NAME(FUA), - QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), - QUEUE_FLAG_NAME(PCI_P2PDMA), - QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), - QUEUE_FLAG_NAME(NOWAIT), QUEUE_FLAG_NAME(SQ_SCHED), - QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 3b4df8e5ac9e..e3c3c0c21b55 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -448,6 +448,10 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; +retry: + data->ctx = blk_mq_get_ctx(q); + data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); + if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is @@ -469,13 +473,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (ops->limit_depth) ops->limit_depth(data->cmd_flags, data); } - } - -retry: - data->ctx = blk_mq_get_ctx(q); - data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!(data->rq_flags & RQF_SCHED_TAGS)) + } else { blk_mq_tag_busy(data->hctx); + } if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; @@ -804,10 +804,8 @@ static void blk_complete_request(struct request *req) if (!bio) return; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) - req->q->integrity.profile->complete_fn(req, total_bytes); -#endif + blk_integrity_complete(req, total_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last @@ -875,11 +873,9 @@ bool blk_update_request(struct request *req, blk_status_t error, if (!req->bio) return false; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) - req->q->integrity.profile->complete_fn(req, nr_bytes); -#endif + blk_integrity_complete(req, nr_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last @@ -1264,10 +1260,9 @@ void blk_mq_start_request(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) - q->integrity.profile->prepare_fn(rq); -#endif + blk_integrity_prepare(rq); + if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } @@ -2914,6 +2909,17 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, INIT_LIST_HEAD(&rq->queuelist); } +static bool bio_unaligned(const struct bio *bio, struct request_queue *q) +{ + unsigned int bs_mask = 
queue_logical_block_size(q) - 1; + + /* .bi_sector of any zero sized bio need to be initialized */ + if ((bio->bi_iter.bi_size & bs_mask) || + ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) + return true; + return false; +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2966,6 +2972,15 @@ void blk_mq_submit_bio(struct bio *bio) return; } + /* + * Device reconfiguration may change logical block size, so alignment + * check has to be done with queue usage counter held + */ + if (unlikely(bio_unaligned(bio, q))) { + bio_io_error(bio); + goto queue_exit; + } + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) @@ -3041,7 +3056,7 @@ queue_exit: blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_sectors = blk_queue_get_max_sectors(rq); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; @@ -4114,6 +4129,12 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } +static bool blk_mq_can_poll(struct blk_mq_tag_set *set) +{ + return set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues; +} + struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata) { @@ -4121,7 +4142,13 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node); + if (!lim) + lim = &default_lim; + lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; + if (blk_mq_can_poll(set)) + lim->features |= BLK_FEAT_POLL; + + q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; @@ -4274,17 +4301,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } -static void blk_mq_update_poll_flag(struct request_queue *q) -{ - struct blk_mq_tag_set *set = q->tag_set; - - if (set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - else - blk_queue_flag_clear(QUEUE_FLAG_POLL, q); -} - int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -4312,7 +4328,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->flush_list); @@ -4636,13 +4651,15 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) int ret; unsigned long i; + if (WARN_ON_ONCE(!q->mq_freeze_depth)) + return -EINVAL; + if (!set) return -EINVAL; if (q->nr_requests == nr) return 0; - blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); ret = 0; @@ -4676,7 +4693,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); return ret; } @@ -4798,8 +4814,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { + struct queue_limits lim; + blk_mq_realloc_hw_ctxs(set, q); - blk_mq_update_poll_flag(q); + if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; @@ -4811,6 +4829,13 @@ fallback: set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } + lim = 
queue_limits_start_update(q); + if (blk_mq_can_poll(set)) + lim.features |= BLK_FEAT_POLL; + else + lim.features &= ~BLK_FEAT_POLL; + if (queue_limits_commit_update(q, &lim) < 0) + pr_warn("updating the poll flag failed\n"); blk_mq_map_swqueue(q); } diff --git a/block/blk-settings.c b/block/blk-settings.c index effeb9a639bb..cd8a8eabc9a5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -6,7 +6,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> -#include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/pagemap.h> #include <linux/backing-dev-defs.h> #include <linux/gcd.h> @@ -55,7 +55,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) } EXPORT_SYMBOL(blk_set_stacking_limits); -static void blk_apply_bdi_limits(struct backing_dev_info *bdi, +void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim) { /* @@ -68,7 +68,7 @@ static void blk_apply_bdi_limits(struct backing_dev_info *bdi, static int blk_validate_zoned_limits(struct queue_limits *lim) { - if (!lim->zoned) { + if (!(lim->features & BLK_FEAT_ZONED)) { if (WARN_ON_ONCE(lim->max_open_zones) || WARN_ON_ONCE(lim->max_active_zones) || WARN_ON_ONCE(lim->zone_write_granularity) || @@ -80,6 +80,14 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED))) return -EINVAL; + /* + * Given that active zones include open zones, the maximum number of + * open zones cannot be larger than the maximum number of active zones. + */ + if (lim->max_active_zones && + lim->max_open_zones > lim->max_active_zones) + return -EINVAL; + if (lim->zone_write_granularity < lim->logical_block_size) lim->zone_write_granularity = lim->logical_block_size; @@ -97,6 +105,120 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) return 0; } +static int blk_validate_integrity_limits(struct queue_limits *lim) +{ + struct blk_integrity *bi = &lim->integrity; + + if (!bi->tuple_size) { + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE || + bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) { + pr_warn("invalid PI settings.\n"); + return -EINVAL; + } + return 0; + } + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { + pr_warn("integrity support disabled.\n"); + return -EINVAL; + } + + if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE && + (bi->flags & BLK_INTEGRITY_REF_TAG)) { + pr_warn("ref tag not support without checksum.\n"); + return -EINVAL; + } + + if (!bi->interval_exp) + bi->interval_exp = ilog2(lim->logical_block_size); + + return 0; +} + +/* + * Returns max guaranteed bytes which we can fit in a bio. + * + * We request that an atomic_write is ITER_UBUF iov_iter (so a single vector), + * so we assume that we can fit in at least PAGE_SIZE in a segment, apart from + * the first and last segments. 
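blk_validate_integrity_limits() above rejects inconsistent protection settings: an empty profile may carry no checksum, tag or reference tag, and a reference tag requires a checksum. A cut-down userspace mirror of those rules; the structure below only models the fields involved and is not the kernel type:

#include <stdbool.h>
#include <stdio.h>

enum csum { CSUM_NONE, CSUM_IP, CSUM_CRC, CSUM_CRC64 };

struct integrity {
        enum csum csum_type;
        unsigned char tuple_size;
        unsigned char tag_size;
        bool ref_tag;
};

static bool integrity_valid(const struct integrity *bi)
{
        if (!bi->tuple_size)    /* no PI at all: nothing else may be set */
                return bi->csum_type == CSUM_NONE && !bi->tag_size &&
                       !bi->ref_tag;
        if (bi->csum_type == CSUM_NONE && bi->ref_tag)
                return false;   /* a reference tag needs a checksum */
        return true;
}

int main(void)
{
        struct integrity ok = { CSUM_CRC, 8, 2, true };
        struct integrity bad = { CSUM_NONE, 8, 0, true };

        printf("%d %d\n", integrity_valid(&ok), integrity_valid(&bad));
        return 0;
}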
+ */ +static unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *lim) +{ + unsigned int max_segments = min(BIO_MAX_VECS, lim->max_segments); + unsigned int length; + + length = min(max_segments, 2) * lim->logical_block_size; + if (max_segments > 2) + length += (max_segments - 2) * PAGE_SIZE; + + return length; +} + +static void blk_atomic_writes_update_limits(struct queue_limits *lim) +{ + unsigned int unit_limit = min(lim->max_hw_sectors << SECTOR_SHIFT, + blk_queue_max_guaranteed_bio(lim)); + + unit_limit = rounddown_pow_of_two(unit_limit); + + lim->atomic_write_max_sectors = + min(lim->atomic_write_hw_max >> SECTOR_SHIFT, + lim->max_hw_sectors); + lim->atomic_write_unit_min = + min(lim->atomic_write_hw_unit_min, unit_limit); + lim->atomic_write_unit_max = + min(lim->atomic_write_hw_unit_max, unit_limit); + lim->atomic_write_boundary_sectors = + lim->atomic_write_hw_boundary >> SECTOR_SHIFT; +} + +static void blk_validate_atomic_write_limits(struct queue_limits *lim) +{ + unsigned int boundary_sectors; + + if (!lim->atomic_write_hw_max) + goto unsupported; + + boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; + + if (boundary_sectors) { + /* + * A feature of boundary support is that it disallows bios to + * be merged which would result in a merged request which + * crosses either a chunk sector or atomic write HW boundary, + * even though chunk sectors may be just set for performance. + * For simplicity, disallow atomic writes for a chunk sector + * which is non-zero and smaller than atomic write HW boundary. + * Furthermore, chunk sectors must be a multiple of atomic + * write HW boundary. Otherwise boundary support becomes + * complicated. + * Devices which do not conform to these rules can be dealt + * with if and when they show up. + */ + if (WARN_ON_ONCE(lim->chunk_sectors % boundary_sectors)) + goto unsupported; + + /* + * The boundary size just needs to be a multiple of unit_max + * (and not necessarily a power-of-2), so this following check + * could be relaxed in future. + * Furthermore, if needed, unit_max could even be reduced so + * that it is compliant with a !power-of-2 boundary. + */ + if (!is_power_of_2(boundary_sectors)) + goto unsupported; + } + + blk_atomic_writes_update_limits(lim); + return; + +unsupported: + lim->atomic_write_max_sectors = 0; + lim->atomic_write_boundary_sectors = 0; + lim->atomic_write_unit_min = 0; + lim->atomic_write_unit_max = 0; +} + /* * Check that the limits in lim are valid, initialize defaults for unset * values, and cap values based on others where needed. 
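blk_queue_max_guaranteed_bio() above assumes only the first and last segment of an atomic write bio may be partial pages, and the result is rounded down to a power of two before clamping the unit limits. A userspace rendering of that sizing, with illustrative constants (the real code additionally clamps against max_hw_sectors):

#include <stdio.h>

#define PAGE_SIZE 4096u

static unsigned int rounddown_pow_of_two(unsigned int x)
{
        unsigned int p = 1;

        while (p * 2 <= x)
                p *= 2;
        return p;
}

static unsigned int guaranteed_bio_bytes(unsigned int max_segments,
                                         unsigned int lbs)
{
        /* two logical blocks plus a full page per remaining segment */
        unsigned int len = (max_segments < 2 ? max_segments : 2) * lbs;

        if (max_segments > 2)
                len += (max_segments - 2) * PAGE_SIZE;
        return len;
}

int main(void)
{
        unsigned int guaranteed = guaranteed_bio_bytes(128, 512);
        unsigned int unit_limit = rounddown_pow_of_two(guaranteed);

        printf("guaranteed %u bytes, unit limit %u bytes\n",
               guaranteed, unit_limit);
        return 0;
}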
@@ -105,6 +227,7 @@ static int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; unsigned int logical_block_sectors; + int err; /* * Unless otherwise specified, default to 512 byte logical blocks and a @@ -112,6 +235,10 @@ static int blk_validate_limits(struct queue_limits *lim) */ if (!lim->logical_block_size) lim->logical_block_size = SECTOR_SIZE; + else if (blk_validate_block_size(lim->logical_block_size)) { + pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size); + return -EINVAL; + } if (lim->physical_block_size < lim->logical_block_size) lim->physical_block_size = lim->logical_block_size; @@ -153,6 +280,12 @@ static int blk_validate_limits(struct queue_limits *lim) if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); + } else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { + lim->max_sectors = + min(max_hw_sectors, lim->io_opt >> SECTOR_SHIFT); + } else if (lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { + lim->max_sectors = + min(max_hw_sectors, lim->io_min >> SECTOR_SHIFT); } else { lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); } @@ -220,9 +353,17 @@ static int blk_validate_limits(struct queue_limits *lim) if (lim->alignment_offset) { lim->alignment_offset &= (lim->physical_block_size - 1); - lim->misaligned = 0; + lim->flags &= ~BLK_FLAG_MISALIGNED; } + if (!(lim->features & BLK_FEAT_WRITE_CACHE)) + lim->features &= ~BLK_FEAT_FUA; + + blk_validate_atomic_write_limits(lim); + + err = blk_validate_integrity_limits(lim); + if (err) + return err; return blk_validate_zoned_limits(lim); } @@ -254,15 +395,25 @@ int blk_set_default_limits(struct queue_limits *lim) */ int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim) - __releases(q->limits_lock) { - int error = blk_validate_limits(lim); + int error; - if (!error) { - q->limits = *lim; - if (q->disk) - blk_apply_bdi_limits(q->disk->bdi, lim); + error = blk_validate_limits(lim); + if (error) + goto out_unlock; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + if (q->crypto_profile && lim->integrity.tag_size) { + pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together.\n"); + error = -EINVAL; + goto out_unlock; } +#endif + + q->limits = *lim; + if (q->disk) + blk_apply_bdi_limits(q->disk->bdi, lim); +out_unlock: mutex_unlock(&q->limits_lock); return error; } @@ -287,204 +438,6 @@ int queue_limits_set(struct request_queue *q, struct queue_limits *lim) EXPORT_SYMBOL_GPL(queue_limits_set); /** - * blk_queue_chunk_sectors - set size of the chunk for this queue - * @q: the request queue for the device - * @chunk_sectors: chunk sectors in the usual 512b unit - * - * Description: - * If a driver doesn't want IOs to cross a given chunk size, it can set - * this limit and prevent merging across chunks. Note that the block layer - * must accept a page worth of data at any offset. So if the crossing of - * chunks is a hard limitation in the driver, it must still be prepared - * to split single page bios. 
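blk_validate_limits() above lets a large io_opt (or io_min) raise the default max_sectors cap when no user limit is set. A small sketch of that selection; BLK_DEF_MAX_SECTORS_CAP is assumed to be 2560 sectors here purely for the demonstration:

#include <stdio.h>

#define SECTOR_SHIFT 9
#define DEF_CAP 2560u

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

static unsigned int pick_max_sectors(unsigned int max_hw_sectors,
                                     unsigned int max_user_sectors,
                                     unsigned int io_opt, unsigned int io_min)
{
        if (max_user_sectors)
                return min_u(max_hw_sectors, max_user_sectors);
        if (io_opt > (DEF_CAP << SECTOR_SHIFT))
                return min_u(max_hw_sectors, io_opt >> SECTOR_SHIFT);
        if (io_min > (DEF_CAP << SECTOR_SHIFT))
                return min_u(max_hw_sectors, io_min >> SECTOR_SHIFT);
        return min_u(max_hw_sectors, DEF_CAP);
}

int main(void)
{
        /* an 8 MiB io_opt raises the cap to 16384 sectors */
        printf("%u\n", pick_max_sectors(65535, 0, 8u << 20, 0));
        return 0;
}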
- **/ -void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) -{ - q->limits.chunk_sectors = chunk_sectors; -} -EXPORT_SYMBOL(blk_queue_chunk_sectors); - -/** - * blk_queue_max_discard_sectors - set max sectors for a single discard - * @q: the request queue for the device - * @max_discard_sectors: maximum number of sectors to discard - **/ -void blk_queue_max_discard_sectors(struct request_queue *q, - unsigned int max_discard_sectors) -{ - struct queue_limits *lim = &q->limits; - - lim->max_hw_discard_sectors = max_discard_sectors; - lim->max_discard_sectors = - min(max_discard_sectors, lim->max_user_discard_sectors); -} -EXPORT_SYMBOL(blk_queue_max_discard_sectors); - -/** - * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase - * @q: the request queue for the device - * @max_sectors: maximum number of sectors to secure_erase - **/ -void blk_queue_max_secure_erase_sectors(struct request_queue *q, - unsigned int max_sectors) -{ - q->limits.max_secure_erase_sectors = max_sectors; -} -EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors); - -/** - * blk_queue_max_write_zeroes_sectors - set max sectors for a single - * write zeroes - * @q: the request queue for the device - * @max_write_zeroes_sectors: maximum number of sectors to write per command - **/ -void blk_queue_max_write_zeroes_sectors(struct request_queue *q, - unsigned int max_write_zeroes_sectors) -{ - q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; -} -EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); - -/** - * blk_queue_max_zone_append_sectors - set max sectors for a single zone append - * @q: the request queue for the device - * @max_zone_append_sectors: maximum number of sectors to write per command - * - * Sets the maximum number of sectors allowed for zone append commands. If - * Specifying 0 for @max_zone_append_sectors indicates that the queue does - * not natively support zone append operations and that the block layer must - * emulate these operations using regular writes. - **/ -void blk_queue_max_zone_append_sectors(struct request_queue *q, - unsigned int max_zone_append_sectors) -{ - unsigned int max_sectors = 0; - - if (WARN_ON(!blk_queue_is_zoned(q))) - return; - - if (max_zone_append_sectors) { - max_sectors = min(q->limits.max_hw_sectors, - max_zone_append_sectors); - max_sectors = min(q->limits.chunk_sectors, max_sectors); - - /* - * Signal eventual driver bugs resulting in the max_zone_append - * sectors limit being 0 due to the chunk_sectors limit (zone - * size) not set or the max_hw_sectors limit not set. - */ - WARN_ON_ONCE(!max_sectors); - } - - q->limits.max_zone_append_sectors = max_sectors; -} -EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors); - -/** - * blk_queue_logical_block_size - set logical block size for the queue - * @q: the request queue for the device - * @size: the logical block size, in bytes - * - * Description: - * This should be set to the lowest possible block size that the - * storage device can address. The default of 512 covers most - * hardware. 
- **/ -void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) -{ - struct queue_limits *limits = &q->limits; - - limits->logical_block_size = size; - - if (limits->discard_granularity < limits->logical_block_size) - limits->discard_granularity = limits->logical_block_size; - - if (limits->physical_block_size < size) - limits->physical_block_size = size; - - if (limits->io_min < limits->physical_block_size) - limits->io_min = limits->physical_block_size; - - limits->max_hw_sectors = - round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT); - limits->max_sectors = - round_down(limits->max_sectors, size >> SECTOR_SHIFT); -} -EXPORT_SYMBOL(blk_queue_logical_block_size); - -/** - * blk_queue_physical_block_size - set physical block size for the queue - * @q: the request queue for the device - * @size: the physical block size, in bytes - * - * Description: - * This should be set to the lowest possible sector size that the - * hardware can operate on without reverting to read-modify-write - * operations. - */ -void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) -{ - q->limits.physical_block_size = size; - - if (q->limits.physical_block_size < q->limits.logical_block_size) - q->limits.physical_block_size = q->limits.logical_block_size; - - if (q->limits.discard_granularity < q->limits.physical_block_size) - q->limits.discard_granularity = q->limits.physical_block_size; - - if (q->limits.io_min < q->limits.physical_block_size) - q->limits.io_min = q->limits.physical_block_size; -} -EXPORT_SYMBOL(blk_queue_physical_block_size); - -/** - * blk_queue_zone_write_granularity - set zone write granularity for the queue - * @q: the request queue for the zoned device - * @size: the zone write granularity size, in bytes - * - * Description: - * This should be set to the lowest possible size allowing to write in - * sequential zones of a zoned block device. - */ -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size) -{ - if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) - return; - - q->limits.zone_write_granularity = size; - - if (q->limits.zone_write_granularity < q->limits.logical_block_size) - q->limits.zone_write_granularity = q->limits.logical_block_size; -} -EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity); - -/** - * blk_queue_alignment_offset - set physical block alignment offset - * @q: the request queue for the device - * @offset: alignment offset in bytes - * - * Description: - * Some devices are naturally misaligned to compensate for things like - * the legacy DOS partition table 63-sector offset. Low-level drivers - * should call this function for devices whose first sector is not - * naturally aligned. 
- */ -void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) -{ - q->limits.alignment_offset = - offset & (q->limits.physical_block_size - 1); - q->limits.misaligned = 0; -} -EXPORT_SYMBOL(blk_queue_alignment_offset); - -void disk_update_readahead(struct gendisk *disk) -{ - blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); -} -EXPORT_SYMBOL_GPL(disk_update_readahead); - -/** * blk_limits_io_min - set minimum request size for a device * @limits: the queue limits * @min: smallest I/O size in bytes @@ -508,26 +461,6 @@ void blk_limits_io_min(struct queue_limits *limits, unsigned int min) EXPORT_SYMBOL(blk_limits_io_min); /** - * blk_queue_io_min - set minimum request size for the queue - * @q: the request queue for the device - * @min: smallest I/O size in bytes - * - * Description: - * Storage devices may report a granularity or preferred minimum I/O - * size which is the smallest request the device can perform without - * incurring a performance penalty. For disk drives this is often the - * physical block size. For RAID arrays it is often the stripe chunk - * size. A properly aligned multiple of minimum_io_size is the - * preferred request size for workloads where a high number of I/O - * operations is desired. - */ -void blk_queue_io_min(struct request_queue *q, unsigned int min) -{ - blk_limits_io_min(&q->limits, min); -} -EXPORT_SYMBOL(blk_queue_io_min); - -/** * blk_limits_io_opt - set optimal request size for a device * @limits: the queue limits * @opt: smallest I/O size in bytes @@ -614,6 +547,21 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, { unsigned int top, bottom, alignment, ret = 0; + t->features |= (b->features & BLK_FEAT_INHERIT_MASK); + + /* + * BLK_FEAT_NOWAIT and BLK_FEAT_POLL need to be supported both by the + * stacking driver and all underlying devices. The stacking driver sets + * the flags before stacking the limits, and this will clear the flags + * if any of the underlying devices does not support it. + */ + if (!(b->features & BLK_FEAT_NOWAIT)) + t->features &= ~BLK_FEAT_NOWAIT; + if (!(b->features & BLK_FEAT_POLL)) + t->features &= ~BLK_FEAT_POLL; + + t->flags |= (b->flags & BLK_FLAG_MISALIGNED); + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_user_sectors = min_not_zero(t->max_user_sectors, b->max_user_sectors); @@ -623,7 +571,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_write_zeroes_sectors); t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t), queue_limits_max_zone_append_sectors(b)); - t->bounce = max(t->bounce, b->bounce); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); @@ -639,8 +586,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); - t->misaligned |= b->misaligned; - alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is @@ -654,7 +599,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Verify that top and bottom intervals line up */ if (max(top, bottom) % min(top, bottom)) { - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } } @@ -676,42 +621,38 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Physical block size a multiple of the logical block size? 
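blk_stack_limits() above now keeps BLK_FEAT_NOWAIT and BLK_FEAT_POLL only if every underlying device advertises them. The same rule folded over several bottom devices, with made-up bit values for the demo:

#include <stdint.h>
#include <stdio.h>

#define FEAT_NOWAIT (1u << 0)
#define FEAT_POLL   (1u << 1)

static uint32_t stack_features(uint32_t top, const uint32_t *bottom, int n)
{
        for (int i = 0; i < n; i++) {
                /* one device without the feature clears it for the stack */
                if (!(bottom[i] & FEAT_NOWAIT))
                        top &= ~FEAT_NOWAIT;
                if (!(bottom[i] & FEAT_POLL))
                        top &= ~FEAT_POLL;
        }
        return top;
}

int main(void)
{
        uint32_t bottom[] = { FEAT_NOWAIT | FEAT_POLL, FEAT_NOWAIT };
        uint32_t top = stack_features(FEAT_NOWAIT | FEAT_POLL, bottom, 2);

        printf("nowait=%d poll=%d\n",
               !!(top & FEAT_NOWAIT), !!(top & FEAT_POLL));
        return 0;
}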
*/ if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* chunk_sectors a multiple of the physical block size? */ if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { t->chunk_sectors = 0; - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } - t->raid_partial_stripes_expensive = - max(t->raid_partial_stripes_expensive, - b->raid_partial_stripes_expensive); - /* Find lowest common alignment_offset */ t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment) % max(t->physical_block_size, t->io_min); /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } @@ -723,16 +664,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, if (b->discard_granularity) { alignment = queue_limit_discard_alignment(b, start); - if (t->discard_granularity != 0 && - t->discard_alignment != alignment) { - top = t->discard_granularity + t->discard_alignment; - bottom = b->discard_granularity + alignment; - - /* Verify that top and bottom intervals line up */ - if ((max(top, bottom) % min(top, bottom)) != 0) - t->discard_misaligned = 1; - } - t->max_discard_sectors = min_not_zero(t->max_discard_sectors, b->max_discard_sectors); t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, @@ -746,8 +677,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_secure_erase_sectors); t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); - t->zoned = max(t->zoned, b->zoned); - if (!t->zoned) { + if (!(t->features & BLK_FEAT_ZONED)) { t->zone_write_granularity = 0; t->max_zone_append_sectors = 0; } @@ -781,21 +711,65 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); /** - * blk_queue_update_dma_pad - update pad mask - * @q: the request queue for the device - * @mask: pad mask + * queue_limits_stack_integrity - stack integrity profile + * @t: target queue limits + * @b: base queue limits * - * Update dma pad mask. + * Check if the integrity profile in the @b can be stacked into the + * target @t. Stacking is possible if either: * - * Appending pad buffer to a request modifies the last entry of a - * scatter list such that it includes the pad buffer. - **/ -void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) -{ - if (mask > q->dma_pad_mask) - q->dma_pad_mask = mask; + * a) does not have any integrity information stacked into it yet + * b) the integrity profile in @b is identical to the one in @t + * + * If @b can be stacked into @t, return %true. Else return %false and clear the + * integrity information in @t. 
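The alignment_offset stacking above combines the top and bottom offsets via their least common multiple, reduced modulo the larger of physical_block_size and io_min. A plain C re-statement of that arithmetic for illustration; lcm_not_zero() treats a zero operand as "no constraint":

#include <stdio.h>

static unsigned int gcd(unsigned int a, unsigned int b)
{
        while (b) {
                unsigned int t = a % b;

                a = b;
                b = t;
        }
        return a;
}

static unsigned int lcm_not_zero(unsigned int a, unsigned int b)
{
        if (!a)
                return b;
        if (!b)
                return a;
        return a / gcd(a, b) * b;
}

int main(void)
{
        unsigned int top_off = 512, bottom_off = 3584;
        unsigned int pbs = 4096, io_min = 4096;
        unsigned int off = lcm_not_zero(top_off, bottom_off) %
                           (pbs > io_min ? pbs : io_min);

        printf("stacked alignment_offset = %u\n", off);
        return 0;
}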
+ */ +bool queue_limits_stack_integrity(struct queue_limits *t, + struct queue_limits *b) +{ + struct blk_integrity *ti = &t->integrity; + struct blk_integrity *bi = &b->integrity; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + return true; + + if (!ti->tuple_size) { + /* inherit the settings from the first underlying device */ + if (!(ti->flags & BLK_INTEGRITY_STACKED)) { + ti->flags = BLK_INTEGRITY_DEVICE_CAPABLE | + (bi->flags & BLK_INTEGRITY_REF_TAG); + ti->csum_type = bi->csum_type; + ti->tuple_size = bi->tuple_size; + ti->pi_offset = bi->pi_offset; + ti->interval_exp = bi->interval_exp; + ti->tag_size = bi->tag_size; + goto done; + } + if (!bi->tuple_size) + goto done; + } + + if (ti->tuple_size != bi->tuple_size) + goto incompatible; + if (ti->interval_exp != bi->interval_exp) + goto incompatible; + if (ti->tag_size != bi->tag_size) + goto incompatible; + if (ti->csum_type != bi->csum_type) + goto incompatible; + if ((ti->flags & BLK_INTEGRITY_REF_TAG) != + (bi->flags & BLK_INTEGRITY_REF_TAG)) + goto incompatible; + +done: + ti->flags |= BLK_INTEGRITY_STACKED; + return true; + +incompatible: + memset(ti, 0, sizeof(*ti)); + return false; } -EXPORT_SYMBOL(blk_queue_update_dma_pad); +EXPORT_SYMBOL_GPL(queue_limits_stack_integrity); /** * blk_set_queue_depth - tell the block layer about the device queue depth @@ -810,54 +784,11 @@ void blk_set_queue_depth(struct request_queue *q, unsigned int depth) } EXPORT_SYMBOL(blk_set_queue_depth); -/** - * blk_queue_write_cache - configure queue's write cache - * @q: the request queue for the device - * @wc: write back cache on or off - * @fua: device supports FUA writes, if true - * - * Tell the block layer about the write cache of @q. - */ -void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) -{ - if (wc) { - blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); - blk_queue_flag_set(QUEUE_FLAG_WC, q); - } else { - blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); - blk_queue_flag_clear(QUEUE_FLAG_WC, q); - } - if (fua) - blk_queue_flag_set(QUEUE_FLAG_FUA, q); - else - blk_queue_flag_clear(QUEUE_FLAG_FUA, q); -} -EXPORT_SYMBOL_GPL(blk_queue_write_cache); - -/** - * disk_set_zoned - inidicate a zoned device - * @disk: gendisk to configure - */ -void disk_set_zoned(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - - WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); - - /* - * Set the zone write granularity to the device logical block - * size by default. The driver can change this value if needed. 
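To make the new queue_limits_stack_integrity() above concrete, here is a hedged sketch, not part of this patch, of how a stacking driver could fold one member device's protection information profile into the limits it is building; example_stack_one_integrity() is an invented name.

#include <linux/blkdev.h>

/*
 * Illustration only: inherit the first member's integrity profile, and fall
 * back to no PI as soon as a member with an incompatible profile shows up
 * (queue_limits_stack_integrity() clears t->integrity and returns false in
 * that case).
 */
static void example_stack_one_integrity(struct queue_limits *t,
                                        struct block_device *bdev)
{
        if (!queue_limits_stack_integrity(t, &bdev_get_queue(bdev)->limits))
                pr_warn("%pg: incompatible integrity profile, stacking without PI\n",
                        bdev);
}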
- */ - q->limits.zoned = true; - blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); -} -EXPORT_SYMBOL_GPL(disk_set_zoned); - int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - if (q->limits.misaligned) + if (q->limits.flags & BLK_FLAG_MISALIGNED) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f0f9314ab65c..60116d13cb80 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -22,8 +22,8 @@ struct queue_sysfs_entry { struct attribute attr; - ssize_t (*show)(struct request_queue *, char *); - ssize_t (*store)(struct request_queue *, const char *, size_t); + ssize_t (*show)(struct gendisk *disk, char *page); + ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); }; static ssize_t @@ -47,18 +47,18 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } -static ssize_t queue_requests_show(struct request_queue *q, char *page) +static ssize_t queue_requests_show(struct gendisk *disk, char *page) { - return queue_var_show(q->nr_requests, page); + return queue_var_show(disk->queue->nr_requests, page); } static ssize_t -queue_requests_store(struct request_queue *q, const char *page, size_t count) +queue_requests_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nr; int ret, err; - if (!queue_is_mq(q)) + if (!queue_is_mq(disk->queue)) return -EINVAL; ret = queue_var_store(&nr, page, count); @@ -68,111 +68,91 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - err = blk_mq_update_nr_requests(q, nr); + err = blk_mq_update_nr_requests(disk->queue, nr); if (err) return err; return ret; } -static ssize_t queue_ra_show(struct request_queue *q, char *page) +static ssize_t queue_ra_show(struct gendisk *disk, char *page) { - unsigned long ra_kb; - - if (!q->disk) - return -EINVAL; - ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10); - return queue_var_show(ra_kb, page); + return queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); } static ssize_t -queue_ra_store(struct request_queue *q, const char *page, size_t count) +queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; - if (!q->disk) - return -EINVAL; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } -static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) -{ - int max_sectors_kb = queue_max_sectors(q) >> 1; - - return queue_var_show(max_sectors_kb, page); -} - -static ssize_t queue_max_segments_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_max_segments(q), page); -} - -static ssize_t queue_max_discard_segments_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_max_discard_segments(q), page); -} - -static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.max_integrity_segments, page); -} - -static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_max_segment_size(q), page); -} - -static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_logical_block_size(q), page); -} - -static ssize_t 
queue_physical_block_size_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_physical_block_size(q), page); -} - -static ssize_t queue_chunk_sectors_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.chunk_sectors, page); -} - -static ssize_t queue_io_min_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_io_min(q), page); +#define QUEUE_SYSFS_LIMIT_SHOW(_field) \ +static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ +{ \ + return queue_var_show(disk->queue->limits._field, page); \ +} + +QUEUE_SYSFS_LIMIT_SHOW(max_segments) +QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) +QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) +QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) +QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) +QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) +QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) +QUEUE_SYSFS_LIMIT_SHOW(io_min) +QUEUE_SYSFS_LIMIT_SHOW(io_opt) +QUEUE_SYSFS_LIMIT_SHOW(discard_granularity) +QUEUE_SYSFS_LIMIT_SHOW(zone_write_granularity) +QUEUE_SYSFS_LIMIT_SHOW(virt_boundary_mask) +QUEUE_SYSFS_LIMIT_SHOW(dma_alignment) +QUEUE_SYSFS_LIMIT_SHOW(max_open_zones) +QUEUE_SYSFS_LIMIT_SHOW(max_active_zones) +QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_min) +QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max) + +#define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \ +static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ +{ \ + return sprintf(page, "%llu\n", \ + (unsigned long long)disk->queue->limits._field << \ + SECTOR_SHIFT); \ } -static ssize_t queue_io_opt_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_io_opt(q), page); -} +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) -static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.discard_granularity, page); +#define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ +static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ +{ \ + return queue_var_show(disk->queue->limits._field >> 1, page); \ } -static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) -{ +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_hw_discard_sectors << 9); +#define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ +static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ +{ \ + return sprintf(page, "%d\n", _val); \ } -static ssize_t queue_discard_max_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_discard_sectors << 9); -} +/* deprecated fields */ +QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0) +QUEUE_SYSFS_SHOW_CONST(write_same_max, 0) +QUEUE_SYSFS_SHOW_CONST(poll_delay, -1) -static ssize_t queue_discard_max_store(struct request_queue *q, - const char *page, size_t count) +static ssize_t queue_max_discard_sectors_store(struct gendisk *disk, + const char *page, size_t count) { unsigned long max_discard_bytes; struct queue_limits lim; @@ -183,54 +163,34 @@ static ssize_t queue_discard_max_store(struct request_queue *q, if (ret < 0) return ret; - if (max_discard_bytes & 
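For readers following the macro conversion above: QUEUE_SYSFS_LIMIT_SHOW(io_min) and QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors) expand to roughly the functions below (token pasting written out by hand, for reference only), which is all the boilerplate the old open-coded show helpers contained.

/* Hand-expanded QUEUE_SYSFS_LIMIT_SHOW(io_min): read the limit directly. */
static ssize_t queue_io_min_show(struct gendisk *disk, char *page)
{
        return queue_var_show(disk->queue->limits.io_min, page);
}

/*
 * Hand-expanded QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors): the limit
 * is kept in 512-byte sectors, so shifting right by one yields kilobytes.
 */
static ssize_t queue_max_sectors_show(struct gendisk *disk, char *page)
{
        return queue_var_show(disk->queue->limits.max_sectors >> 1, page);
}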
(q->limits.discard_granularity - 1)) + if (max_discard_bytes & (disk->queue->limits.discard_granularity - 1)) return -EINVAL; if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - blk_mq_freeze_queue(q); - lim = queue_limits_start_update(q); + lim = queue_limits_start_update(disk->queue); lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; - err = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); - + err = queue_limits_commit_update(disk->queue, &lim); if (err) return err; return ret; } -static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) -{ - return queue_var_show(0, page); -} - -static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) -{ - return queue_var_show(0, page); -} - -static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) +/* + * For zone append queue_max_zone_append_sectors does not just return the + * underlying queue limits, but actually contains a calculation. Because of + * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here. + */ +static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page) { return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_write_zeroes_sectors << 9); -} - -static ssize_t queue_zone_write_granularity_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_zone_write_granularity(q), page); -} - -static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) -{ - unsigned long long max_sectors = queue_max_zone_append_sectors(q); - - return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); + (u64)queue_max_zone_append_sectors(disk->queue) << + SECTOR_SHIFT); } static ssize_t -queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) +queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count) { unsigned long max_sectors_kb; struct queue_limits lim; @@ -241,94 +201,83 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; - blk_mq_freeze_queue(q); - lim = queue_limits_start_update(q); + lim = queue_limits_start_update(disk->queue); lim.max_user_sectors = max_sectors_kb << 1; - err = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); + err = queue_limits_commit_update(disk->queue, &lim); if (err) return err; return ret; } -static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) +static ssize_t queue_feature_store(struct gendisk *disk, const char *page, + size_t count, blk_features_t feature) { - int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1; - - return queue_var_show(max_hw_sectors_kb, page); -} + struct queue_limits lim; + unsigned long val; + ssize_t ret; -static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.virt_boundary_mask, page); -} + ret = queue_var_store(&val, page, count); + if (ret < 0) + return ret; -static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_dma_alignment(q), page); + lim = queue_limits_start_update(disk->queue); + if (val) + lim.features |= feature; + else + lim.features &= ~feature; + ret = queue_limits_commit_update(disk->queue, &lim); + if (ret) + return ret; + return count; } -#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ -static ssize_t \ -queue_##name##_show(struct request_queue *q, char *page) \ +#define QUEUE_SYSFS_FEATURE(_name, _feature) \ +static ssize_t 
queue_##_name##_show(struct gendisk *disk, char *page) \ { \ - int bit; \ - bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \ - return queue_var_show(neg ? !bit : bit, page); \ + return sprintf(page, "%u\n", \ + !!(disk->queue->limits.features & _feature)); \ } \ -static ssize_t \ -queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ +static ssize_t queue_##_name##_store(struct gendisk *disk, \ + const char *page, size_t count) \ { \ - unsigned long val; \ - ssize_t ret; \ - ret = queue_var_store(&val, page, count); \ - if (ret < 0) \ - return ret; \ - if (neg) \ - val = !val; \ - \ - if (val) \ - blk_queue_flag_set(QUEUE_FLAG_##flag, q); \ - else \ - blk_queue_flag_clear(QUEUE_FLAG_##flag, q); \ - return ret; \ -} - -QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); -QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); -QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); -QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); -#undef QUEUE_SYSFS_BIT_FNS - -static ssize_t queue_zoned_show(struct request_queue *q, char *page) -{ - if (blk_queue_is_zoned(q)) - return sprintf(page, "host-managed\n"); - return sprintf(page, "none\n"); + return queue_feature_store(disk, page, count, _feature); \ } -static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) -{ - return queue_var_show(disk_nr_zones(q->disk), page); +QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) +QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) +QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT) +QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES); + +#define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ +static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ +{ \ + return sprintf(page, "%u\n", \ + !!(disk->queue->limits.features & _feature)); \ } -static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) +QUEUE_SYSFS_FEATURE_SHOW(poll, BLK_FEAT_POLL); +QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); +QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); + +static ssize_t queue_zoned_show(struct gendisk *disk, char *page) { - return queue_var_show(bdev_max_open_zones(q->disk->part0), page); + if (blk_queue_is_zoned(disk->queue)) + return sprintf(page, "host-managed\n"); + return sprintf(page, "none\n"); } -static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page) +static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) { - return queue_var_show(bdev_max_active_zones(q->disk->part0), page); + return queue_var_show(disk_nr_zones(disk), page); } -static ssize_t queue_nomerges_show(struct request_queue *q, char *page) +static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) { - return queue_var_show((blk_queue_nomerges(q) << 1) | - blk_queue_noxmerges(q), page); + return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | + blk_queue_noxmerges(disk->queue), page); } -static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, +static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; @@ -337,29 +286,30 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, if (ret < 0) return ret; - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); - blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); + blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, disk->queue); + blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, disk->queue); if (nm == 2) - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, disk->queue); else if (nm) - 
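Similarly, QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) above generates the pair shown below (hand expansion, for reference only). The store side goes through queue_feature_store() and therefore updates the atomically committed queue_limits rather than flipping queue flags directly.

/* Hand-expanded QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL). */
static ssize_t queue_rotational_show(struct gendisk *disk, char *page)
{
        return sprintf(page, "%u\n",
                       !!(disk->queue->limits.features & BLK_FEAT_ROTATIONAL));
}

static ssize_t queue_rotational_store(struct gendisk *disk,
                                      const char *page, size_t count)
{
        return queue_feature_store(disk, page, count, BLK_FEAT_ROTATIONAL);
}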
blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, disk->queue); return ret; } -static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) +static ssize_t queue_rq_affinity_show(struct gendisk *disk, char *page) { - bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); - bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); + bool set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags); + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags); return queue_var_show(set << force, page); } static ssize_t -queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) +queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) { ssize_t ret = -EINVAL; #ifdef CONFIG_SMP + struct request_queue *q = disk->queue; unsigned long val; ret = queue_var_store(&val, page, count); @@ -380,38 +330,28 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } -static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%d\n", -1); -} - -static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, +static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, size_t count) { return count; } -static ssize_t queue_poll_show(struct request_queue *q, char *page) -{ - return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); -} - -static ssize_t queue_poll_store(struct request_queue *q, const char *page, +static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!(disk->queue->limits.features & BLK_FEAT_POLL)) return -EINVAL; pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); return count; } -static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) +static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { - return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout)); + return sprintf(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout)); } -static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, +static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { unsigned int val; @@ -421,46 +361,45 @@ static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, if (err || val == 0) return -EINVAL; - blk_queue_rq_timeout(q, msecs_to_jiffies(val)); + blk_queue_rq_timeout(disk->queue, msecs_to_jiffies(val)); return count; } -static ssize_t queue_wc_show(struct request_queue *q, char *page) +static ssize_t queue_wc_show(struct gendisk *disk, char *page) { - if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (blk_queue_write_cache(disk->queue)) return sprintf(page, "write back\n"); - return sprintf(page, "write through\n"); } -static ssize_t queue_wc_store(struct request_queue *q, const char *page, +static ssize_t queue_wc_store(struct gendisk *disk, const char *page, size_t count) { + struct queue_limits lim; + bool disable; + int err; + if (!strncmp(page, "write back", 10)) { - if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) - return -EINVAL; - blk_queue_flag_set(QUEUE_FLAG_WC, q); + disable = false; } else if (!strncmp(page, "write through", 13) || - !strncmp(page, "none", 4)) { - blk_queue_flag_clear(QUEUE_FLAG_WC, q); + !strncmp(page, "none", 4)) { + disable = true; } else 
{ return -EINVAL; } + lim = queue_limits_start_update(disk->queue); + if (disable) + lim.flags |= BLK_FLAG_WRITE_CACHE_DISABLED; + else + lim.flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; + err = queue_limits_commit_update(disk->queue, &lim); + if (err) + return err; return count; } -static ssize_t queue_fua_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags)); -} - -static ssize_t queue_dax_show(struct request_queue *q, char *page) -{ - return queue_var_show(blk_queue_dax(q), page); -} - #define QUEUE_RO_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ @@ -491,12 +430,18 @@ QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); -QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); -QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); +QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); +QUEUE_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); +QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_boundary_sectors, + "atomic_write_boundary_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); + QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); -QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); +QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); @@ -522,9 +467,9 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .show = queue_logical_block_size_show, }; -QUEUE_RW_ENTRY(queue_nonrot, "rotational"); +QUEUE_RW_ENTRY(queue_rotational, "rotational"); QUEUE_RW_ENTRY(queue_iostats, "iostats"); -QUEUE_RW_ENTRY(queue_random, "add_random"); +QUEUE_RW_ENTRY(queue_add_random, "add_random"); QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); #ifdef CONFIG_BLK_WBT @@ -541,20 +486,22 @@ static ssize_t queue_var_store64(s64 *var, const char *page) return 0; } -static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) { - if (!wbt_rq_qos(q)) + if (!wbt_rq_qos(disk->queue)) return -EINVAL; - if (wbt_disabled(q)) + if (wbt_disabled(disk->queue)) return sprintf(page, "0\n"); - return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); + return sprintf(page, "%llu\n", + div_u64(wbt_get_min_lat(disk->queue), 1000)); } -static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, +static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, size_t count) { + struct request_queue *q = disk->queue; struct rq_qos *rqos; ssize_t ret; s64 val; @@ -567,7 +514,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, rqos = wbt_rq_qos(q); if (!rqos) { - ret = wbt_init(q->disk); + ret = wbt_init(disk); if (ret) return ret; } @@ -585,13 +532,11 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, * ends up either enabling or disabling wbt completely. We can't * have IO inflight if that happens. 
*/ - blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); wbt_set_min_lat(q, val); blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); return count; } @@ -615,14 +560,18 @@ static struct attribute *queue_attrs[] = { &queue_io_min_entry.attr, &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, - &queue_discard_max_entry.attr, - &queue_discard_max_hw_entry.attr, + &queue_max_discard_sectors_entry.attr, + &queue_max_hw_discard_sectors_entry.attr, &queue_discard_zeroes_data_entry.attr, + &queue_atomic_write_max_sectors_entry.attr, + &queue_atomic_write_boundary_sectors_entry.attr, + &queue_atomic_write_unit_min_entry.attr, + &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, - &queue_write_zeroes_max_entry.attr, + &queue_max_write_zeroes_sectors_entry.attr, &queue_zone_append_max_entry.attr, &queue_zone_write_granularity_entry.attr, - &queue_nonrot_entry.attr, + &queue_rotational_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, &queue_max_open_zones_entry.attr, @@ -630,7 +579,7 @@ static struct attribute *queue_attrs[] = { &queue_nomerges_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, - &queue_random_entry.attr, + &queue_add_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_fua_entry.attr, @@ -699,14 +648,13 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); - struct request_queue *q = disk->queue; ssize_t res; if (!entry->show) return -EIO; - mutex_lock(&q->sysfs_lock); - res = entry->show(q, page); - mutex_unlock(&q->sysfs_lock); + mutex_lock(&disk->queue->sysfs_lock); + res = entry->show(disk, page); + mutex_unlock(&disk->queue->sysfs_lock); return res; } @@ -722,9 +670,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, if (!entry->store) return -EIO; + blk_mq_freeze_queue(q); mutex_lock(&q->sysfs_lock); - res = entry->store(q, page, length); + res = entry->store(disk, page, length); mutex_unlock(&q->sysfs_lock); + blk_mq_unfreeze_queue(q); return res; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c1bf73f8c75d..dc6140fa3de0 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -704,6 +704,9 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; + + /* make sure at least one io can be dispatched after waiting */ + jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1); return jiffy_wait; } diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 64472134dd26..6dfc659d22e2 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -37,7 +37,7 @@ enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ - WBT_KSWAPD = 4, /* write, from kswapd */ + WBT_SWAP = 4, /* write, from swap_writepage() */ WBT_DISCARD = 8, /* discard */ WBT_NR_BITS = 4, /* number of bits */ @@ -45,7 +45,7 @@ enum wbt_flags { enum { WBT_RWQ_BG = 0, - WBT_RWQ_KSWAPD, + WBT_RWQ_SWAP, WBT_RWQ_DISCARD, WBT_NUM_RWQ, }; @@ -172,8 +172,8 @@ static bool wb_recent_wait(struct rq_wb *rwb) static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, enum wbt_flags wb_acct) { - if (wb_acct & WBT_KSWAPD) - return &rwb->rq_wait[WBT_RWQ_KSWAPD]; + if (wb_acct & WBT_SWAP) + return &rwb->rq_wait[WBT_RWQ_SWAP]; else if (wb_acct & WBT_DISCARD) return &rwb->rq_wait[WBT_RWQ_DISCARD]; @@ -206,8 +206,8 @@ static void 
wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; - else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) && - !wb_recent_wait(rwb)) + else if (blk_queue_write_cache(rwb->rqos.disk->queue) && + !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; @@ -528,7 +528,7 @@ static bool close_io(struct rq_wb *rwb) time_before(now, rwb->last_comp + HZ / 10); } -#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP) static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { @@ -539,13 +539,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) /* * At this point we know it's a buffered write. If this is - * kswapd trying to free memory, or REQ_SYNC is set, then + * swap trying to free memory, or REQ_SYNC is set, then * it's WB_SYNC_ALL writeback, and we'll use the max limit for * that. If the write is marked as a background write, then use * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ - if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb)) limit = rwb->rq_depth.max_depth; else if ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* @@ -622,8 +622,8 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; } else if (wbt_should_throttle(bio)) { - if (current_is_kswapd()) - flags |= WBT_KSWAPD; + if (bio->bi_opf & REQ_SWAP) + flags |= WBT_SWAP; if (bio_op(bio) == REQ_OP_DISCARD) flags |= WBT_DISCARD; flags |= WBT_TRACKED; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 08d7dfe8bd93..af19296fa50d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -116,24 +116,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) EXPORT_SYMBOL_GPL(blk_zone_cond_str); /** - * bdev_nr_zones - Get number of zones - * @bdev: Target device - * - * Return the total number of zones of a zoned block device. For a block - * device without zone capabilities, the number of zones is always 0. - */ -unsigned int bdev_nr_zones(struct block_device *bdev) -{ - sector_t zone_sectors = bdev_zone_sectors(bdev); - - if (!bdev_is_zoned(bdev)) - return 0; - return (bdev_nr_sectors(bdev) + zone_sectors - 1) >> - ilog2(zone_sectors); -} -EXPORT_SYMBOL_GPL(bdev_nr_zones); - -/** * blkdev_report_zones - Get zones information * @bdev: Target block device * @sector: Sector from which to report zones @@ -168,77 +150,6 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(blkdev_report_zones); -static inline unsigned long *blk_alloc_zone_bitmap(int node, - unsigned int nr_zones) -{ - return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), - GFP_NOIO, node); -} - -static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - /* - * For an all-zones reset, ignore conventional, empty, read-only - * and offline zones. 
- */ - switch (zone->cond) { - case BLK_ZONE_COND_NOT_WP: - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_READONLY: - case BLK_ZONE_COND_OFFLINE: - return 0; - default: - set_bit(idx, (unsigned long *)data); - return 0; - } -} - -static int blkdev_zone_reset_all_emulated(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - sector_t capacity = bdev_nr_sectors(bdev); - sector_t zone_sectors = bdev_zone_sectors(bdev); - unsigned long *need_reset; - struct bio *bio = NULL; - sector_t sector = 0; - int ret; - - need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones); - if (!need_reset) - return -ENOMEM; - - ret = disk->fops->report_zones(disk, 0, disk->nr_zones, - blk_zone_need_reset_cb, need_reset); - if (ret < 0) - goto out_free_need_reset; - - ret = 0; - while (sector < capacity) { - if (!test_bit(disk_zone_no(disk, sector), need_reset)) { - sector += zone_sectors; - continue; - } - - bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, - GFP_KERNEL); - bio->bi_iter.bi_sector = sector; - sector += zone_sectors; - - /* This may take a while, so be nice to others */ - cond_resched(); - } - - if (bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - } - -out_free_need_reset: - kfree(need_reset); - return ret; -} - static int blkdev_zone_reset_all(struct block_device *bdev) { struct bio bio; @@ -265,7 +176,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev) int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sector, sector_t nr_sectors) { - struct request_queue *q = bdev_get_queue(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); sector_t capacity = bdev_nr_sectors(bdev); sector_t end_sector = sector + nr_sectors; @@ -293,16 +203,11 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, return -EINVAL; /* - * In the case of a zone reset operation over all zones, - * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this - * command. For other devices, we emulate this command behavior by - * identifying the zones needing a reset. + * In the case of a zone reset operation over all zones, use + * REQ_OP_ZONE_RESET_ALL. */ - if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { - if (!blk_queue_zone_resetall(q)) - return blkdev_zone_reset_all_emulated(bdev); + if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) return blkdev_zone_reset_all(bdev); - } while (sector < end_sector) { bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); @@ -1573,7 +1478,7 @@ void disk_free_zone_resources(struct gendisk *disk) mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; - kfree(disk->conv_zones_bitmap); + bitmap_free(disk->conv_zones_bitmap); disk->conv_zones_bitmap = NULL; disk->zone_capacity = 0; disk->last_zone_capacity = 0; @@ -1650,8 +1555,22 @@ static int disk_update_zone_resources(struct gendisk *disk, return -ENODEV; } + lim = queue_limits_start_update(q); + + /* + * Some devices can advertize zone resource limits that are larger than + * the number of sequential zones of the zoned block device, e.g. a + * small ZNS namespace. For such case, assume that the zoned device has + * no zone resource limits. 
+ */ + nr_seq_zones = disk->nr_zones - nr_conv_zones; + if (lim.max_open_zones >= nr_seq_zones) + lim.max_open_zones = 0; + if (lim.max_active_zones >= nr_seq_zones) + lim.max_active_zones = 0; + if (!disk->zone_wplugs_pool) - return 0; + goto commit; /* * If the device has no limit on the maximum number of open and active @@ -1660,9 +1579,6 @@ static int disk_update_zone_resources(struct gendisk *disk, * dynamic zone write plug allocation when simultaneously writing to * more zones than the size of the mempool. */ - lim = queue_limits_start_update(q); - - nr_seq_zones = disk->nr_zones - nr_conv_zones; pool_size = max(lim.max_open_zones, lim.max_active_zones); if (!pool_size) pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); @@ -1676,6 +1592,7 @@ static int disk_update_zone_resources(struct gendisk *disk, lim.max_open_zones = 0; } +commit: return queue_limits_commit_update(q, &lim); } @@ -1683,7 +1600,6 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, struct blk_revalidate_zone_args *args) { struct gendisk *disk = args->disk; - struct request_queue *q = disk->queue; if (zone->capacity != zone->len) { pr_warn("%s: Invalid conventional zone capacity\n", @@ -1699,7 +1615,7 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, if (!args->conv_zones_bitmap) { args->conv_zones_bitmap = - blk_alloc_zone_bitmap(q->node, args->nr_zones); + bitmap_zalloc(args->nr_zones, GFP_NOIO); if (!args->conv_zones_bitmap) return -ENOMEM; } diff --git a/block/blk.h b/block/blk.h index 189bc25beb50..8e8936e97307 100644 --- a/block/blk.h +++ b/block/blk.h @@ -98,8 +98,8 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { unsigned long mask = queue_segment_boundary(q); - phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; - phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + phys_addr_t addr1 = bvec_phys(vec1); + phys_addr_t addr2 = bvec_phys(vec2); /* * Merging adjacent physical pages may not work correctly under KMSAN @@ -181,9 +181,11 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq) return queue_max_segments(rq->q); } -static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, - enum req_op op) +static inline unsigned int blk_queue_get_max_sectors(struct request *rq) { + struct request_queue *q = rq->q; + enum req_op op = req_op(rq); + if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); @@ -191,6 +193,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; + if (rq->cmd_flags & REQ_ATOMIC) + return q->limits.atomic_write_max_sectors; + return q->limits.max_sectors; } @@ -352,6 +357,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); int blk_set_default_limits(struct queue_limits *lim); +void blk_apply_bdi_limits(struct backing_dev_info *bdi, + struct queue_limits *lim); int blk_dev_init(void); /* @@ -393,7 +400,7 @@ struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); static inline bool blk_queue_may_bounce(struct request_queue *q) { return IS_ENABLED(CONFIG_BOUNCE) && - q->limits.bounce == BLK_BOUNCE_HIGH && + (q->limits.features & BLK_FEAT_BOUNCE_HIGH) && max_low_pfn >= max_pfn; } @@ -673,4 +680,9 @@ int 
bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); +void blk_integrity_generate(struct bio *bio); +void blk_integrity_verify(struct bio *bio); +void blk_integrity_prepare(struct request *rq); +void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); + #endif /* BLK_INTERNAL_H */ diff --git a/block/elevator.c b/block/elevator.c index f64ebd726e58..f13d552a32c8 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -709,24 +709,25 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) return ret; } -ssize_t elv_iosched_store(struct request_queue *q, const char *buf, +ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; int ret; - if (!elv_support_iosched(q)) + if (!elv_support_iosched(disk->queue)) return count; strscpy(elevator_name, buf, sizeof(elevator_name)); - ret = elevator_change(q, strstrip(elevator_name)); + ret = elevator_change(disk->queue, strstrip(elevator_name)); if (!ret) return count; return ret; } -ssize_t elv_iosched_show(struct request_queue *q, char *name) +ssize_t elv_iosched_show(struct gendisk *disk, char *name) { + struct request_queue *q = disk->queue; struct elevator_queue *eq = q->elevator; struct elevator_type *cur = NULL, *e; int len = 0; diff --git a/block/elevator.h b/block/elevator.h index e9a050a96e53..3fe18e1a8692 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -147,8 +147,8 @@ extern void elv_unregister(struct elevator_type *); /* * io scheduler sysfs switching */ -extern ssize_t elv_iosched_show(struct request_queue *, char *); -extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); +ssize_t elv_iosched_show(struct gendisk *disk, char *page); +ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); extern struct elevator_queue *elevator_alloc(struct request_queue *, diff --git a/block/fops.c b/block/fops.c index 376265935714..9825c1713a49 100644 --- a/block/fops.c +++ b/block/fops.c @@ -34,9 +34,12 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) return opf; } -static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, - struct iov_iter *iter) +static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos, + struct iov_iter *iter, bool is_atomic) { + if (is_atomic && !generic_atomic_write_valid(iter, pos)) + return true; + return pos & (bdev_logical_block_size(bdev) - 1) || !bdev_iter_is_aligned(bdev, iter); } @@ -72,6 +75,8 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio.bi_ioprio = iocb->ki_ioprio; + if (iocb->ki_flags & IOCB_ATOMIC) + bio.bi_opf |= REQ_ATOMIC; ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) @@ -343,6 +348,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_ATOMIC) + bio->bi_opf |= REQ_ATOMIC; + if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; @@ -359,12 +367,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + bool is_atomic = iocb->ki_flags & IOCB_ATOMIC; 
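Tying the block/fops.c atomic write wiring here back to userspace: an application opts in by opening the block device with O_DIRECT and passing RWF_ATOMIC to pwritev2(), assuming the uapi half of the atomic write series. The sketch below is not part of this patch; the device path, the 4 KiB size and the RWF_ATOMIC fallback value are illustrative and should be checked against the system's linux/fs.h, and the write size and offset must respect the atomic_write_unit_*_bytes limits exported in sysfs.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC 0x00000040   /* assumed value; verify against <linux/fs.h> */
#endif

int main(void)
{
        struct iovec iov;
        void *buf;
        ssize_t ret;
        /* O_DIRECT is required for blkdev_open() to set FMODE_CAN_ATOMIC_WRITE. */
        int fd = open("/dev/nvme0n1", O_WRONLY | O_DIRECT);

        if (fd < 0)
                return 1;

        /* 4 KiB stands in for a power-of-two size within the device's
         * atomic_write_unit_min/max window; the file offset must be
         * naturally aligned to the write size. */
        if (posix_memalign(&buf, 4096, 4096))
                return 1;
        memset(buf, 0xab, 4096);
        iov.iov_base = buf;
        iov.iov_len = 4096;

        ret = pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);
        if (ret < 0)
                perror("pwritev2(RWF_ATOMIC)");

        free(buf);
        close(fd);
        return ret == 4096 ? 0 : 1;
}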
unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; - if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter)) + if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic)) return -EINVAL; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); @@ -373,6 +382,8 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); + } else if (is_atomic) { + return -EINVAL; } return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } @@ -383,10 +394,11 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct block_device *bdev = I_BDEV(inode); loff_t isize = i_size_read(inode); - iomap->bdev = bdev; - iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); if (offset >= isize) return -EIO; + + iomap->bdev = bdev; + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; @@ -612,6 +624,9 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (!bdev) return -ENXIO; + if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) blkdev_put_no_open(bdev); diff --git a/block/genhd.c b/block/genhd.c index 8f1f3c6b4d67..4dc95a463505 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -524,7 +524,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } - disk_update_readahead(disk); + blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); disk_add_events(disk); set_bit(GD_ADDED, &disk->state); return 0; diff --git a/block/ioctl.c b/block/ioctl.c index d570e1695896..e8e4a4190f18 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -224,7 +224,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, goto fail; err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, - BLKDEV_ZERO_NOUNMAP); + BLKDEV_ZERO_NOUNMAP | BLKDEV_ZERO_KILLABLE); fail: filemap_invalidate_unlock(bdev->bd_mapping); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 94eede4fb9eb..acdc28756d9d 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -488,6 +488,20 @@ unlock: } /* + * 'depth' is a number in the range 1..INT_MAX representing a number of + * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since + * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow(). + * Values larger than q->nr_requests have the same effect as q->nr_requests. + */ +static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) +{ + struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; + const unsigned int nrr = hctx->queue->nr_requests; + + return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; +} + +/* * Called by __blk_mq_alloc_request(). The shallow_depth value set by this * function is used by __blk_mq_get_tag(). */ @@ -503,7 +517,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * Throttle asynchronous requests and writes such that these requests * do not block the allocation of synchronous requests. */ - data->shallow_depth = dd->async_depth; + data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); } /* Called by blk_mq_update_nr_requests(). 
*/ @@ -513,9 +527,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + dd->async_depth = q->nr_requests; - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ diff --git a/block/t10-pi.c b/block/t10-pi.c index f4cc91156da1..425e2836f3e1 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -11,17 +11,23 @@ #include <linux/module.h> #include <net/checksum.h> #include <asm/unaligned.h> +#include "blk.h" + +struct blk_integrity_iter { + void *prot_buf; + void *data_buf; + sector_t seed; + unsigned int data_size; + unsigned short interval; + const char *disk_name; +}; -typedef __be16 (csum_fn) (__be16, void *, unsigned int); - -static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len) -{ - return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len)); -} - -static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) +static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len, + unsigned char csum_type) { - return (__force __be16)ip_compute_csum(data, len); + if (csum_type == BLK_INTEGRITY_CSUM_IP) + return (__force __be16)ip_compute_csum(data, len); + return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len)); } /* @@ -29,48 +35,44 @@ static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref * tag. */ -static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, - csum_fn *fn, enum t10_dif_type type) +static void t10_pi_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf + offset; - pi->guard_tag = fn(0, iter->data_buf, iter->interval); + pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval, + bi->csum_type); if (offset) - pi->guard_tag = fn(pi->guard_tag, iter->prot_buf, - offset); + pi->guard_tag = t10_pi_csum(pi->guard_tag, + iter->prot_buf, offset, bi->csum_type); pi->app_tag = 0; - if (type == T10_PI_TYPE1_PROTECTION) + if (bi->flags & BLK_INTEGRITY_REF_TAG) pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); else pi->ref_tag = 0; iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } - - return BLK_STS_OK; } static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - csum_fn *fn, enum t10_dif_type type) + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; - BUG_ON(type == T10_PI_TYPE0_PROTECTION); - for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf + offset; __be16 csum; - if (type == T10_PI_TYPE1_PROTECTION || - type == T10_PI_TYPE2_PROTECTION) { + if (bi->flags & BLK_INTEGRITY_REF_TAG) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -82,15 +84,17 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, iter->seed, be32_to_cpu(pi->ref_tag)); return BLK_STS_PROTECTION; } - } else if (type == T10_PI_TYPE3_PROTECTION) { + } else { if (pi->app_tag == T10_PI_APP_ESCAPE && pi->ref_tag == T10_PI_REF_ESCAPE) goto next; } - csum = fn(0, iter->data_buf, iter->interval); 
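Stepping back briefly to the mq-deadline hunk a little further up: dd_to_word_depth() converts async_depth from "number of requests" into the per-word shallow depth that sbitmap_get_shallow() works with. A worked example under assumed numbers:

/*
 * Assume nr_requests = 256 and sched_tags words of 1 << sb.shift = 64 bits.
 * Capping async requests at 192 of the 256 tags then maps to
 *
 *      dd_to_word_depth(hctx, 192) = ((192 << 6) + 255) / 256
 *                                  = 12543 / 256 = 48
 *
 * i.e. at most 48 of the 64 bits per sbitmap word may be used.  With the
 * new default of async_depth == nr_requests the result is the full word
 * depth of 64, so asynchronous requests are effectively not throttled by
 * default.
 */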
+ csum = t10_pi_csum(0, iter->data_buf, iter->interval, + bi->csum_type); if (offset) - csum = fn(csum, iter->prot_buf, offset); + csum = t10_pi_csum(csum, iter->prot_buf, offset, + bi->csum_type); if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ @@ -102,33 +106,13 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } return BLK_STS_OK; } -static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); -} - /** * t10_pi_type1_prepare - prepare PI prior submitting request to device * @rq: request with PI that should be prepared @@ -141,7 +125,7 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) */ static void t10_pi_type1_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); u8 offset = bi->pi_offset; @@ -192,7 +176,7 @@ static void t10_pi_type1_prepare(struct request *rq) */ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); @@ -225,81 +209,15 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) } } -static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -/* Type 3 does not have a reference tag so no remapping is required. */ -static void t10_pi_type3_prepare(struct request *rq) -{ -} - -/* Type 3 does not have a reference tag so no remapping is required. 
*/ -static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) -{ -} - -const struct blk_integrity_profile t10_pi_type1_crc = { - .name = "T10-DIF-TYPE1-CRC", - .generate_fn = t10_pi_type1_generate_crc, - .verify_fn = t10_pi_type1_verify_crc, - .prepare_fn = t10_pi_type1_prepare, - .complete_fn = t10_pi_type1_complete, -}; -EXPORT_SYMBOL(t10_pi_type1_crc); - -const struct blk_integrity_profile t10_pi_type1_ip = { - .name = "T10-DIF-TYPE1-IP", - .generate_fn = t10_pi_type1_generate_ip, - .verify_fn = t10_pi_type1_verify_ip, - .prepare_fn = t10_pi_type1_prepare, - .complete_fn = t10_pi_type1_complete, -}; -EXPORT_SYMBOL(t10_pi_type1_ip); - -const struct blk_integrity_profile t10_pi_type3_crc = { - .name = "T10-DIF-TYPE3-CRC", - .generate_fn = t10_pi_type3_generate_crc, - .verify_fn = t10_pi_type3_verify_crc, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL(t10_pi_type3_crc); - -const struct blk_integrity_profile t10_pi_type3_ip = { - .name = "T10-DIF-TYPE3-IP", - .generate_fn = t10_pi_type3_generate_ip, - .verify_fn = t10_pi_type3_verify_ip, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL(t10_pi_type3_ip); - static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) { return cpu_to_be64(crc64_rocksoft_update(crc, data, len)); } -static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, - enum t10_dif_type type) +static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { @@ -311,17 +229,15 @@ static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, iter->prot_buf, offset); pi->app_tag = 0; - if (type == T10_PI_TYPE1_PROTECTION) + if (bi->flags & BLK_INTEGRITY_REF_TAG) put_unaligned_be48(iter->seed, pi->ref_tag); else put_unaligned_be48(0ULL, pi->ref_tag); iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } - - return BLK_STS_OK; } static bool ext_pi_ref_escape(u8 *ref_tag) @@ -332,9 +248,9 @@ static bool ext_pi_ref_escape(u8 *ref_tag) } static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, - enum t10_dif_type type) + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0; i < iter->data_size; i += iter->interval) { @@ -342,7 +258,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, u64 ref, seed; __be64 csum; - if (type == T10_PI_TYPE1_PROTECTION) { + if (bi->flags & BLK_INTEGRITY_REF_TAG) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -353,7 +269,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, iter->disk_name, seed, ref); return BLK_STS_PROTECTION; } - } else if (type == T10_PI_TYPE3_PROTECTION) { + } else { if (pi->app_tag == T10_PI_APP_ESCAPE && ext_pi_ref_escape(pi->ref_tag)) goto next; @@ -374,26 +290,16 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } return BLK_STS_OK; } -static blk_status_t ext_pi_type1_verify_crc64(struct blk_integrity_iter *iter) -{ - return ext_pi_crc64_verify(iter, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter) -{ 
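With the exported t10-pi profiles folded into the core as above, a driver no longer registers a blk_integrity_profile; it describes its protection information directly in queue_limits. Below is a hedged sketch, not part of this patch, with the example_ name invented and the field values chosen for a 512+8 Type 1 format.

#include <linux/blkdev.h>
#include <linux/t10-pi.h>

/*
 * Illustration only: describe 512-byte protection intervals carrying an
 * 8-byte Type 1 tuple (16-bit CRC guard plus reference tag) in the limits
 * used to allocate the disk.  blk_integrity_generate()/blk_integrity_verify()
 * then pick the checksum implementation from csum_type at I/O time.
 */
static void example_describe_pi(struct queue_limits *lim)
{
        struct blk_integrity *bi = &lim->integrity;

        bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
        bi->flags = BLK_INTEGRITY_DEVICE_CAPABLE | BLK_INTEGRITY_REF_TAG;
        bi->tuple_size = sizeof(struct t10_pi_tuple);
        bi->interval_exp = 9;           /* 512-byte protection interval */
        bi->tag_size = 0;
        bi->pi_offset = 0;
}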
- return ext_pi_crc64_generate(iter, T10_PI_TYPE1_PROTECTION); -} - static void ext_pi_type1_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); u8 offset = bi->pi_offset; @@ -433,7 +339,7 @@ static void ext_pi_type1_prepare(struct request *rq) static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); @@ -467,33 +373,105 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) } } -static blk_status_t ext_pi_type3_verify_crc64(struct blk_integrity_iter *iter) +void blk_integrity_generate(struct bio *bio) { - return ext_pi_crc64_verify(iter, T10_PI_TYPE3_PROTECTION); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity_iter iter; + struct bvec_iter bviter; + struct bio_vec bv; + + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.interval = 1 << bi->interval_exp; + iter.seed = bio->bi_iter.bi_sector; + iter.prot_buf = bvec_virt(bip->bip_vec); + bio_for_each_segment(bv, bio, bviter) { + void *kaddr = bvec_kmap_local(&bv); + + iter.data_buf = kaddr; + iter.data_size = bv.bv_len; + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + ext_pi_crc64_generate(&iter, bi); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + t10_pi_generate(&iter, bi); + break; + default: + break; + } + kunmap_local(kaddr); + } } -static blk_status_t ext_pi_type3_generate_crc64(struct blk_integrity_iter *iter) +void blk_integrity_verify(struct bio *bio) { - return ext_pi_crc64_generate(iter, T10_PI_TYPE3_PROTECTION); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity_iter iter; + struct bvec_iter bviter; + struct bio_vec bv; + + /* + * At the moment verify is called bi_iter has been advanced during split + * and completion, so use the copy created during submission here. 
+ */ + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.interval = 1 << bi->interval_exp; + iter.seed = bip->bio_iter.bi_sector; + iter.prot_buf = bvec_virt(bip->bip_vec); + __bio_for_each_segment(bv, bio, bviter, bip->bio_iter) { + void *kaddr = bvec_kmap_local(&bv); + blk_status_t ret = BLK_STS_OK; + + iter.data_buf = kaddr; + iter.data_size = bv.bv_len; + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + ret = ext_pi_crc64_verify(&iter, bi); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + ret = t10_pi_verify(&iter, bi); + break; + default: + break; + } + kunmap_local(kaddr); + + if (ret) { + bio->bi_status = ret; + return; + } + } } -const struct blk_integrity_profile ext_pi_type1_crc64 = { - .name = "EXT-DIF-TYPE1-CRC64", - .generate_fn = ext_pi_type1_generate_crc64, - .verify_fn = ext_pi_type1_verify_crc64, - .prepare_fn = ext_pi_type1_prepare, - .complete_fn = ext_pi_type1_complete, -}; -EXPORT_SYMBOL_GPL(ext_pi_type1_crc64); - -const struct blk_integrity_profile ext_pi_type3_crc64 = { - .name = "EXT-DIF-TYPE3-CRC64", - .generate_fn = ext_pi_type3_generate_crc64, - .verify_fn = ext_pi_type3_verify_crc64, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL_GPL(ext_pi_type3_crc64); +void blk_integrity_prepare(struct request *rq) +{ + struct blk_integrity *bi = &rq->q->limits.integrity; + + if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) + ext_pi_type1_prepare(rq); + else + t10_pi_type1_prepare(rq); +} + +void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ + struct blk_integrity *bi = &rq->q->limits.integrity; + + if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) + ext_pi_type1_complete(rq, nr_bytes); + else + t10_pi_type1_complete(rq, nr_bytes); +} MODULE_DESCRIPTION("T10 Protection Information module"); MODULE_LICENSE("GPL"); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index bd6a7857ce05..831fa4a12159 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -16,7 +16,6 @@ #include <linux/acpi.h> #include <linux/dmi.h> #include <linux/sched.h> /* need_resched() */ -#include <linux/sort.h> #include <linux/tick.h> #include <linux/cpuidle.h> #include <linux/cpu.h> @@ -386,25 +385,24 @@ static void acpi_processor_power_verify_c3(struct acpi_processor *pr, acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 1); } -static int acpi_cst_latency_cmp(const void *a, const void *b) +static void acpi_cst_latency_sort(struct acpi_processor_cx *states, size_t length) { - const struct acpi_processor_cx *x = a, *y = b; + int i, j, k; - if (!(x->valid && y->valid)) - return 0; - if (x->latency > y->latency) - return 1; - if (x->latency < y->latency) - return -1; - return 0; -} -static void acpi_cst_latency_swap(void *a, void *b, int n) -{ - struct acpi_processor_cx *x = a, *y = b; + for (i = 1; i < length; i++) { + if (!states[i].valid) + continue; - if (!(x->valid && y->valid)) - return; - swap(x->latency, y->latency); + for (j = i - 1, k = i; j >= 0; j--) { + if (!states[j].valid) + continue; + + if (states[j].latency > states[k].latency) + swap(states[j].latency, states[k].latency); + + k = j; + } + } } static int acpi_processor_power_verify(struct acpi_processor *pr) @@ -449,10 +447,7 @@ static int acpi_processor_power_verify(struct acpi_processor *pr) if (buggy_latency) { pr_notice("FW issue: working around C-state latencies out 
of order\n"); - sort(&pr->power.states[1], max_cstate, - sizeof(struct acpi_processor_cx), - acpi_cst_latency_cmp, - acpi_cst_latency_swap); + acpi_cst_latency_sort(&pr->power.states[1], max_cstate); } lapic_timer_propagate_broadcast(pr); diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index bb4d30d377ae..67ce756f8dfc 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1024,7 +1024,6 @@ EXPORT_SYMBOL_GPL(ata_scsi_dma_need_drain); int ata_scsi_dev_config(struct scsi_device *sdev, struct queue_limits *lim, struct ata_device *dev) { - struct request_queue *q = sdev->request_queue; int depth = 1; if (!ata_id_has_unload(dev->id)) @@ -1038,7 +1037,7 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct queue_limits *lim, sdev->sector_size = ATA_SECT_SIZE; /* set DMA padding */ - blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1); + lim->dma_pad_mask = ATA_DMA_PAD_SZ - 1; /* make room for appending the drain */ lim->max_segments--; diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c index 3cb455a32d92..1b85e8bf4ef9 100644 --- a/drivers/ata/pata_macio.c +++ b/drivers/ata/pata_macio.c @@ -816,7 +816,7 @@ static int pata_macio_device_configure(struct scsi_device *sdev, /* OHare has issues with non cache aligned DMA on some chipsets */ if (priv->kind == controller_ohare) { lim->dma_alignment = 31; - blk_queue_update_dma_pad(sdev->request_queue, 31); + lim->dma_pad_mask = 31; /* Tell the world about it */ ata_dev_info(dev, "OHare alignment limits applied\n"); @@ -831,7 +831,7 @@ static int pata_macio_device_configure(struct scsi_device *sdev, if (priv->kind == controller_sh_ata6 || priv->kind == controller_k2_ata6) { /* Allright these are bad, apply restrictions */ lim->dma_alignment = 15; - blk_queue_update_dma_pad(sdev->request_queue, 15); + lim->dma_pad_mask = 15; /* We enable MWI and hack cache line size directly here, this * is specific to this chipset and not normal values, we happen diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 5b9d4aaebb81..ed209f4f2798 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -354,6 +354,15 @@ config VIRTIO_BLK This is the virtual block driver for virtio. It can be used with QEMU based VMMs (like KVM or Xen). Say Y or M. +config BLK_DEV_RUST_NULL + tristate "Rust null block driver (Experimental)" + depends on RUST + help + This is the Rust implementation of the null block driver. For now it + is only a minimal stub. + + If unsure, say N. 
+ config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 101612cba303..1105a2d4fdcb 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -9,6 +9,9 @@ # needed for trace events ccflags-y += -I$(src) +obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o +rnull_mod-y := rnull.o + obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index a25414228e47..49ced65bef4c 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -232,6 +232,7 @@ static DEFINE_MUTEX(amiflop_mutex); static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */ module_param(fd_def_df0, ulong, 0); +MODULE_DESCRIPTION("Amiga floppy driver"); MODULE_LICENSE("GPL"); /* @@ -1776,10 +1777,13 @@ static const struct blk_mq_ops amiflop_mq_ops = { static int fd_alloc_disk(int drive, int system) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; struct gendisk *disk; int err; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, &lim, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index b6dac8cee70f..2028795ec61c 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -337,6 +337,7 @@ aoeblk_gdalloc(void *vp) struct queue_limits lim = { .max_hw_sectors = aoe_maxsectors, .io_opt = SZ_2M, + .features = BLK_FEAT_ROTATIONAL, }; ulong flags; int late = 0; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index cacc4ba942a8..4ba98c6654be 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1992,9 +1992,12 @@ static const struct blk_mq_ops ataflop_mq_ops = { static int ataflop_alloc_disk(unsigned int drive, unsigned int type) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; struct gendisk *disk; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, &lim, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); @@ -2197,4 +2200,5 @@ static void __exit atari_floppy_exit(void) module_init(atari_floppy_init) module_exit(atari_floppy_exit) +MODULE_DESCRIPTION("Atari floppy driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 558d8e670566..2fd1ed101748 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -296,6 +296,7 @@ static int max_part = 1; module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); +MODULE_DESCRIPTION("Ram backed block device driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); @@ -335,6 +336,8 @@ static int brd_alloc(int i) .max_hw_discard_sectors = UINT_MAX, .max_discard_segments = 1, .discard_granularity = PAGE_SIZE, + .features = BLK_FEAT_SYNCHRONOUS | + BLK_FEAT_NOWAIT, }; list_for_each_entry(brd, &brd_devices, brd_list) @@ -366,10 +369,6 @@ static int brd_alloc(int i) strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); - /* Tell the block layer that this is not a rotational device */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue); - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue); err = add_disk(disk); if (err) goto out_cleanup_disk; diff --git 
a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 113b441d4d36..f92673f05c7a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2697,6 +2697,9 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig * connect. */ .max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8, + .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | + BLK_FEAT_ROTATIONAL | + BLK_FEAT_STABLE_WRITES, }; device = minor_to_device(minor); @@ -2735,9 +2738,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig sprintf(disk->disk_name, "drbd%d", minor); disk->private_data = device; - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); - blk_queue_write_cache(disk->queue, true, true); - device->md_io.page = alloc_page(GFP_KERNEL); if (!device->md_io.page) goto out_no_io_page; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 25c9d85667f1..3affb538b989 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4516,7 +4516,8 @@ static bool floppy_available(int drive) static int floppy_alloc_disk(unsigned int drive, unsigned int type) { struct queue_limits lim = { - .max_hw_sectors = 64, + .max_hw_sectors = 64, + .features = BLK_FEAT_ROTATIONAL, }; struct gendisk *disk; @@ -5016,6 +5017,7 @@ module_param(floppy, charp, 0); module_param(FLOPPY_IRQ, int, 0); module_param(FLOPPY_DMA, int, 0); MODULE_AUTHOR("Alain L. Knaff"); +MODULE_DESCRIPTION("Normal floppy disk support"); MODULE_LICENSE("GPL"); /* This doesn't actually get used other than for module information */ diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 1153721bc7c2..78a7bb28defe 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -211,13 +211,10 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) if (lo->lo_state == Lo_bound) blk_mq_freeze_queue(lo->lo_queue); lo->use_dio = use_dio; - if (use_dio) { - blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue); + if (use_dio) lo->lo_flags |= LO_FLAGS_DIRECT_IO; - } else { - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); + else lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; - } if (lo->lo_state == Lo_bound) blk_mq_unfreeze_queue(lo->lo_queue); } @@ -939,24 +936,6 @@ static void loop_free_idle_workers_timer(struct timer_list *timer) return loop_free_idle_workers(lo, false); } -static void loop_update_rotational(struct loop_device *lo) -{ - struct file *file = lo->lo_backing_file; - struct inode *file_inode = file->f_mapping->host; - struct block_device *file_bdev = file_inode->i_sb->s_bdev; - struct request_queue *q = lo->lo_queue; - bool nonrot = true; - - /* not all filesystems (e.g. 
tmpfs) have a sb->s_bdev */ - if (file_bdev) - nonrot = bdev_nonrot(file_bdev); - - if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); -} - /** * loop_set_status_from_info - configure device from loop_info * @lo: struct loop_device to configure @@ -998,17 +977,40 @@ loop_set_status_from_info(struct loop_device *lo, return 0; } -static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize, - bool update_discard_settings) +static unsigned short loop_default_blocksize(struct loop_device *lo, + struct block_device *backing_bdev) { + /* In case of direct I/O, match underlying block size */ + if ((lo->lo_backing_file->f_flags & O_DIRECT) && backing_bdev) + return bdev_logical_block_size(backing_bdev); + return SECTOR_SIZE; +} + +static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize) +{ + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; + struct block_device *backing_bdev = NULL; struct queue_limits lim; + if (S_ISBLK(inode->i_mode)) + backing_bdev = I_BDEV(inode); + else if (inode->i_sb->s_bdev) + backing_bdev = inode->i_sb->s_bdev; + + if (!bsize) + bsize = loop_default_blocksize(lo, backing_bdev); + lim = queue_limits_start_update(lo->lo_queue); lim.logical_block_size = bsize; lim.physical_block_size = bsize; lim.io_min = bsize; - if (update_discard_settings) - loop_config_discard(lo, &lim); + lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); + if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) + lim.features |= BLK_FEAT_WRITE_CACHE; + if (backing_bdev && !bdev_nonrot(backing_bdev)) + lim.features |= BLK_FEAT_ROTATIONAL; + loop_config_discard(lo, &lim); return queue_limits_commit_update(lo->lo_queue, &lim); } @@ -1017,12 +1019,10 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, const struct loop_config *config) { struct file *file = fget(config->fd); - struct inode *inode; struct address_space *mapping; int error; loff_t size; bool partscan; - unsigned short bsize; bool is_loop; if (!file) @@ -1055,19 +1055,12 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, goto out_unlock; mapping = file->f_mapping; - inode = mapping->host; if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { error = -EINVAL; goto out_unlock; } - if (config->block_size) { - error = blk_validate_block_size(config->block_size); - if (error) - goto out_unlock; - } - error = loop_set_status_from_info(lo, &config->info); if (error) goto out_unlock; @@ -1098,22 +1091,10 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) - blk_queue_write_cache(lo->lo_queue, true, false); - - if (config->block_size) - bsize = config->block_size; - else if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev) - /* In case of direct I/O, match underlying block size */ - bsize = bdev_logical_block_size(inode->i_sb->s_bdev); - else - bsize = 512; - - error = loop_reconfigure_limits(lo, bsize, true); - if (WARN_ON_ONCE(error)) + error = loop_reconfigure_limits(lo, config->block_size); + if (error) goto out_unlock; - loop_update_rotational(lo); loop_update_dio(lo); loop_sysfs_init(lo); @@ -1154,22 +1135,12 @@ out_putf: return error; } -static void __loop_clr_fd(struct loop_device *lo, bool release) +static void 
__loop_clr_fd(struct loop_device *lo) { + struct queue_limits lim; struct file *filp; gfp_t gfp = lo->old_gfp_mask; - if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) - blk_queue_write_cache(lo->lo_queue, false, false); - - /* - * Freeze the request queue when unbinding on a live file descriptor and - * thus an open device. When called from ->release we are guaranteed - * that there is no I/O in progress already. - */ - if (!release) - blk_mq_freeze_queue(lo->lo_queue); - spin_lock_irq(&lo->lo_lock); filp = lo->lo_backing_file; lo->lo_backing_file = NULL; @@ -1179,7 +1150,14 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) lo->lo_offset = 0; lo->lo_sizelimit = 0; memset(lo->lo_file_name, 0, LO_NAME_SIZE); - loop_reconfigure_limits(lo, 512, false); + + /* reset the block size to the default */ + lim = queue_limits_start_update(lo->lo_queue); + lim.logical_block_size = SECTOR_SIZE; + lim.physical_block_size = SECTOR_SIZE; + lim.io_min = SECTOR_SIZE; + queue_limits_commit_update(lo->lo_queue, &lim); + invalidate_disk(lo->lo_disk); loop_sysfs_exit(lo); /* let user-space know about this change */ @@ -1187,8 +1165,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) mapping_set_gfp_mask(filp->f_mapping, gfp); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - if (!release) - blk_mq_unfreeze_queue(lo->lo_queue); disk_force_media_change(lo->lo_disk); @@ -1203,11 +1179,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) * must be at least one and it can only become zero when the * current holder is released. */ - if (!release) - mutex_lock(&lo->lo_disk->open_mutex); err = bdev_disk_changed(lo->lo_disk, false); - if (!release) - mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo->lo_number, err); @@ -1256,24 +1228,16 @@ static int loop_clr_fd(struct loop_device *lo) return -ENXIO; } /* - * If we've explicitly asked to tear down the loop device, - * and it has an elevated reference count, set it for auto-teardown when - * the last reference goes away. This stops $!~#$@ udev from - * preventing teardown because it decided that it needs to run blkid on - * the loopback device whenever they appear. xfstests is notorious for - * failing tests because blkid via udev races with a losetup - * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d - * command to fail with EBUSY. + * Mark the device for removing the backing device on last close. + * If we are the only opener, also switch the state to rundown here to + * prevent new openers from coming in. 
*/ - if (disk_openers(lo->lo_disk) > 1) { - lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - loop_global_unlock(lo, true); - return 0; - } - lo->lo_state = Lo_rundown; + + lo->lo_flags |= LO_FLAGS_AUTOCLEAR; + if (disk_openers(lo->lo_disk) == 1) + lo->lo_state = Lo_rundown; loop_global_unlock(lo, true); - __loop_clr_fd(lo, false); return 0; } @@ -1500,10 +1464,6 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) if (lo->lo_state != Lo_bound) return -ENXIO; - err = blk_validate_block_size(arg); - if (err) - return err; - if (lo->lo_queue->limits.logical_block_size == arg) return 0; @@ -1511,7 +1471,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) invalidate_bdev(lo->lo_device); blk_mq_freeze_queue(lo->lo_queue); - err = loop_reconfigure_limits(lo, arg, false); + err = loop_reconfigure_limits(lo, arg); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); @@ -1740,25 +1700,43 @@ static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, } #endif +static int lo_open(struct gendisk *disk, blk_mode_t mode) +{ + struct loop_device *lo = disk->private_data; + int err; + + err = mutex_lock_killable(&lo->lo_mutex); + if (err) + return err; + + if (lo->lo_state == Lo_deleting || lo->lo_state == Lo_rundown) + err = -ENXIO; + mutex_unlock(&lo->lo_mutex); + return err; +} + static void lo_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; + bool need_clear = false; if (disk_openers(disk) > 0) return; + /* + * Clear the backing device information if this is the last close of + * a device that's been marked for auto clear, or on which LOOP_CLR_FD + * has been called. + */ mutex_lock(&lo->lo_mutex); - if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) { + if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_state = Lo_rundown; - mutex_unlock(&lo->lo_mutex); - /* - * In autoclear mode, stop the loop thread - * and remove configuration after last close. - */ - __loop_clr_fd(lo, true); - return; - } + + need_clear = (lo->lo_state == Lo_rundown); mutex_unlock(&lo->lo_mutex); + + if (need_clear) + __loop_clr_fd(lo); } static void lo_free_disk(struct gendisk *disk) @@ -1775,6 +1753,7 @@ static void lo_free_disk(struct gendisk *disk) static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, + .open = lo_open, .release = lo_release, .ioctl = lo_ioctl, #ifdef CONFIG_COMPAT @@ -1853,6 +1832,7 @@ static const struct kernel_param_ops loop_hw_qdepth_param_ops = { device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH)); +MODULE_DESCRIPTION("Loopback device support"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -2060,14 +2040,6 @@ static int loop_add(int i) lo->lo_queue = lo->lo_disk->queue; /* - * By default, we do buffer IO, so it doesn't make sense to enable - * merge because the I/O submitted to backing file is handled page by - * page. For directio mode, merge does help to dispatch bigger request - * to underlayer disk. We will enable merge once directio is enabled. - */ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); - - /* * Disable partition scanning by default. The in-kernel partition * scanning can be requested individually per-device during its * setup. 
Userspace can always add and remove partitions from all diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 43a187609ef7..c6ef0546ffc9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3485,8 +3485,6 @@ skip_create_disk: goto start_service_thread; /* Set device limits. */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue); dma_set_max_seg_size(&dd->pdev->dev, 0x400000); /* Set the capacity of the device in 512 byte sectors. */ diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 27b2187e7a6d..b9fdeff31caf 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -150,8 +150,6 @@ static int __init n64cart_probe(struct platform_device *pdev) set_capacity(disk, size >> SECTOR_SHIFT); set_disk_ro(disk, 1); - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - err = add_disk(disk); if (err) goto out_cleanup_disk; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b87aa80a46dd..41a90150b501 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -342,6 +342,14 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, lim.max_hw_discard_sectors = UINT_MAX; else lim.max_hw_discard_sectors = 0; + if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) { + lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); + } else if (nbd->config->flags & NBD_FLAG_SEND_FUA) { + lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; + } else { + lim.features |= BLK_FEAT_WRITE_CACHE; + lim.features &= ~BLK_FEAT_FUA; + } lim.logical_block_size = blksize; lim.physical_block_size = blksize; error = queue_limits_commit_update(nbd->disk->queue, &lim); @@ -1279,19 +1287,10 @@ static void nbd_bdev_reset(struct nbd_device *nbd) static void nbd_parse_flags(struct nbd_device *nbd) { - struct nbd_config *config = nbd->config; - if (config->flags & NBD_FLAG_READ_ONLY) + if (nbd->config->flags & NBD_FLAG_READ_ONLY) set_disk_ro(nbd->disk, true); else set_disk_ro(nbd->disk, false); - if (config->flags & NBD_FLAG_SEND_FLUSH) { - if (config->flags & NBD_FLAG_SEND_FUA) - blk_queue_write_cache(nbd->disk->queue, true, true); - else - blk_queue_write_cache(nbd->disk->queue, true, false); - } - else - blk_queue_write_cache(nbd->disk->queue, false, false); } static void send_disconnects(struct nbd_device *nbd) @@ -1801,7 +1800,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) { struct queue_limits lim = { .max_hw_sectors = 65536, - .max_user_sectors = 256, + .io_opt = 256 << SECTOR_SHIFT, .max_segments = USHRT_MAX, .max_segment_size = UINT_MAX, }; @@ -1861,11 +1860,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) goto out_err_disk; } - /* - * Tell the block layer that we are not a rotational device - */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - mutex_init(&nbd->config_lock); refcount_set(&nbd->config_refs, 0); /* diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 75f189e42f88..2f0431e42c49 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -77,7 +77,7 @@ enum { NULL_IRQ_TIMER = 2, }; -static bool g_virt_boundary = false; +static bool g_virt_boundary; module_param_named(virt_boundary, g_virt_boundary, bool, 0444); MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False"); @@ -227,7 +227,7 @@ MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. 
Default: 0 ( static bool g_fua = true; module_param_named(fua, g_fua, bool, 0444); -MODULE_PARM_DESC(zoned, "Enable/disable FUA support when cache_size is used. Default: true"); +MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true"); static unsigned int g_mbps; module_param_named(mbps, g_mbps, uint, 0444); @@ -262,6 +262,10 @@ module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444 MODULE_PARM_DESC(zone_append_max_sectors, "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation"); +static bool g_zone_full; +module_param_named(zone_full, g_zone_full, bool, S_IRUGO); +MODULE_PARM_DESC(zone_full, "Initialize the sequential write required zones of a zoned device to be full. Default: false"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -458,6 +462,7 @@ NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL); +NULLB_DEVICE_ATTR(zone_full, bool, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL); NULLB_DEVICE_ATTR(shared_tags, bool, NULL); @@ -610,6 +615,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_readonly, &nullb_device_attr_zone_offline, + &nullb_device_attr_zone_full, &nullb_device_attr_virt_boundary, &nullb_device_attr_no_sched, &nullb_device_attr_shared_tags, @@ -700,7 +706,7 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page) "shared_tags,size,submit_queues,use_per_node_hctx," "virt_boundary,zoned,zone_capacity,zone_max_active," "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," - "zone_size,zone_append_max_sectors\n"); + "zone_size,zone_append_max_sectors,zone_full\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -781,6 +787,7 @@ static struct nullb_device *null_alloc_dev(void) dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; dev->zone_append_max_sectors = g_zone_append_max_sectors; + dev->zone_full = g_zone_full; dev->virt_boundary = g_virt_boundary; dev->no_sched = g_no_sched; dev->shared_tags = g_shared_tags; @@ -1824,9 +1831,6 @@ static int null_validate_conf(struct nullb_device *dev) dev->queue_mode = NULL_Q_MQ; } - if (blk_validate_block_size(dev->blocksize)) - return -EINVAL; - if (dev->use_per_node_hctx) { if (dev->submit_queues != nr_online_nodes) dev->submit_queues = nr_online_nodes; @@ -1928,6 +1932,13 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_tags; } + if (dev->cache_size > 0) { + set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); + lim.features |= BLK_FEAT_WRITE_CACHE; + if (dev->fua) + lim.features |= BLK_FEAT_FUA; + } + nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb); if (IS_ERR(nullb->disk)) { rv = PTR_ERR(nullb->disk); @@ -1940,13 +1951,7 @@ static int null_add_dev(struct nullb_device *dev) nullb_setup_bwtimer(nullb); } - if (dev->cache_size > 0) { - set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); - blk_queue_write_cache(nullb->q, true, dev->fua); - } - nullb->q->queuedata = nullb; - blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); rv = ida_alloc(&nullb_indexes, GFP_KERNEL); if (rv < 0) diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 
3234e6c85eed..a7bb32f73ec3 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -101,6 +101,7 @@ struct nullb_device { bool memory_backed; /* if data is stored in memory */ bool discard; /* if support discard */ bool zoned; /* if device is zoned */ + bool zone_full; /* Initialize zones to be full */ bool virt_boundary; /* virtual boundary on/off for the device */ bool no_sched; /* no IO scheduler for the device */ bool shared_tags; /* share tag set between devices for blk-mq */ diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index f118d304f310..9bc768b2ca56 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -145,7 +145,7 @@ int null_init_zoned_dev(struct nullb_device *dev, zone = &dev->zones[i]; null_init_zone_lock(dev, zone); - zone->start = zone->wp = sector; + zone->start = sector; if (zone->start + dev->zone_size_sects > dev_capacity_sects) zone->len = dev_capacity_sects - zone->start; else @@ -153,12 +153,18 @@ int null_init_zoned_dev(struct nullb_device *dev, zone->capacity = min_t(sector_t, zone->len, zone_capacity_sects); zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; - zone->cond = BLK_ZONE_COND_EMPTY; + if (dev->zone_full) { + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = zone->start + zone->capacity; + } else{ + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + } sector += dev->zone_size_sects; } - lim->zoned = true; + lim->features |= BLK_FEAT_ZONED; lim->chunk_sectors = dev->zone_size_sects; lim->max_zone_append_sectors = dev->zone_append_max_sectors; lim->max_open_zones = dev->zone_max_open; @@ -171,9 +177,6 @@ int null_register_zoned_dev(struct nullb *nullb) struct request_queue *q = nullb->q; struct gendisk *disk = nullb->disk; - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - disk->nr_zones = bdev_nr_zones(disk->part0); - pr_info("%s: using %s zone append\n", disk->disk_name, queue_emulates_zone_append(q) ? 
"emulated" : "native"); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 8a2ce8070010..7cece5884b9c 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2622,6 +2622,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) struct queue_limits lim = { .max_hw_sectors = PACKET_MAX_SECTORS, .logical_block_size = CD_FRAMESIZE, + .features = BLK_FEAT_ROTATIONAL, }; int idx; int ret = -ENOMEM; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index b810ac0a5c4b..ff45ed766469 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -388,9 +388,9 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) .max_segments = -1, .max_segment_size = dev->bounce_size, .dma_alignment = dev->blk_size - 1, + .features = BLK_FEAT_WRITE_CACHE | + BLK_FEAT_ROTATIONAL, }; - - struct request_queue *queue; struct gendisk *gendisk; if (dev->blk_size < 512) { @@ -447,10 +447,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) goto fail_free_tag_set; } - queue = gendisk->queue; - - blk_queue_write_cache(queue, true, false); - priv->gendisk = gendisk; gendisk->major = ps3disk_major; gendisk->first_minor = devidx * PS3DISK_MINORS; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 26ff5cd2bf0a..008e850555f4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4949,14 +4949,12 @@ static const struct blk_mq_ops rbd_mq_ops = { static int rbd_init_disk(struct rbd_device *rbd_dev) { struct gendisk *disk; - struct request_queue *q; unsigned int objset_bytes = rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; struct queue_limits lim = { .max_hw_sectors = objset_bytes >> SECTOR_SHIFT, - .max_user_sectors = objset_bytes >> SECTOR_SHIFT, + .io_opt = objset_bytes, .io_min = rbd_dev->opts->alloc_size, - .io_opt = rbd_dev->opts->alloc_size, .max_segments = USHRT_MAX, .max_segment_size = UINT_MAX, }; @@ -4980,12 +4978,14 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) lim.max_write_zeroes_sectors = objset_bytes >> SECTOR_SHIFT; } + if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) + lim.features |= BLK_FEAT_STABLE_WRITES; + disk = blk_mq_alloc_disk(&rbd_dev->tag_set, &lim, rbd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_tag_set; } - q = disk->queue; snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", rbd_dev->dev_id); @@ -4997,13 +4997,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->minors = RBD_MINORS_PER_MAJOR; disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; - - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ - - if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); - rbd_dev->disk = disk; return 0; diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 39887556cf95..6ea7c12e3a87 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -475,7 +475,7 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) } } -static struct kobj_type rnbd_dev_ktype = { +static const struct kobj_type rnbd_dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = rnbd_dev_groups, }; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index b7ffe03c6160..c34695d2eea7 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1352,10 +1352,6 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, if (dev->access_mode == 
RNBD_ACCESS_RO) set_disk_ro(dev->gd, true); - /* - * Network device does not need rotational - */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); err = add_disk(dev->gd); if (err) put_disk(dev->gd); @@ -1389,18 +1385,18 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, le32_to_cpu(rsp->max_discard_sectors); } + if (rsp->cache_policy & RNBD_WRITEBACK) { + lim.features |= BLK_FEAT_WRITE_CACHE; + if (rsp->cache_policy & RNBD_FUA) + lim.features |= BLK_FEAT_FUA; + } + dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, &lim, dev); if (IS_ERR(dev->gd)) return PTR_ERR(dev->gd); dev->queue = dev->gd->queue; rnbd_init_mq_hw_queues(dev); - blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); - blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); - blk_queue_write_cache(dev->queue, - !!(rsp->cache_policy & RNBD_WRITEBACK), - !!(rsp->cache_policy & RNBD_FUA)); - return rnbd_clt_setup_gen_disk(dev, rsp, idx); } diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index cba6ba43c2c2..64780094442c 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -33,7 +33,7 @@ static void rnbd_srv_dev_release(struct kobject *kobj) kfree(dev); } -static struct kobj_type dev_ktype = { +static const struct kobj_type dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = rnbd_srv_dev_release }; @@ -184,7 +184,7 @@ static void rnbd_srv_sess_dev_release(struct kobject *kobj) rnbd_destroy_sess_dev(sess_dev, sess_dev->keep_id); } -static struct kobj_type rnbd_srv_sess_dev_ktype = { +static const struct kobj_type rnbd_srv_sess_dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = rnbd_srv_sess_dev_release, }; diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs new file mode 100644 index 000000000000..b0227cf9ddd3 --- /dev/null +++ b/drivers/block/rnull.rs @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! This is a Rust implementation of the C null block driver. +//! +//! Supported features: +//! +//! - blk-mq interface +//! - direct completion +//! - block size 4k +//! +//! The driver is not configurable. + +use kernel::{ + alloc::flags, + block::mq::{ + self, + gen_disk::{self, GenDisk}, + Operations, TagSet, + }, + error::Result, + new_mutex, pr_info, + prelude::*, + sync::{Arc, Mutex}, + types::ARef, +}; + +module! { + type: NullBlkModule, + name: "rnull_mod", + author: "Andreas Hindborg", + license: "GPL v2", +} + +struct NullBlkModule { + _disk: Pin<Box<Mutex<GenDisk<NullBlkDevice>>>>, +} + +impl kernel::Module for NullBlkModule { + fn init(_module: &'static ThisModule) -> Result<Self> { + pr_info!("Rust null_blk loaded\n"); + let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; + + let disk = gen_disk::GenDiskBuilder::new() + .capacity_sectors(4096 << 11) + .logical_block_size(4096)? + .physical_block_size(4096)? + .rotational(false) + .build(format_args!("rnullb{}", 0), tagset)?; + + let disk = Box::pin_init(new_mutex!(disk, "nullb:disk"), flags::GFP_KERNEL)?; + + Ok(Self { _disk: disk }) + } +} + +struct NullBlkDevice; + +#[vtable] +impl Operations for NullBlkDevice { + #[inline(always)] + fn queue_rq(rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result { + mq::Request::end_ok(rq) + .map_err(|_e| kernel::error::code::EIO) + // We take no refcounts on the request, so we expect to be able to + // end the request. The request reference must be unique at this + // point, and so `end_ok` cannot fail. 
+ .expect("Fatal error - expected to be able to end request"); + + Ok(()) + } + + fn commit_rqs() {} +} diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 5286cb8e0824..2d38331ee667 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -791,6 +791,7 @@ static int probe_disk(struct vdc_port *port) .seg_boundary_mask = PAGE_SIZE - 1, .max_segment_size = PAGE_SIZE, .max_segments = port->ring_cookies, + .features = BLK_FEAT_ROTATIONAL, }; struct request_queue *q; struct gendisk *g; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 6731678f3a41..126f151c4f2c 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -787,6 +787,9 @@ static void swim_cleanup_floppy_disk(struct floppy_state *fs) static int swim_floppy_init(struct swim_priv *swd) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; int err; int drive; struct swim __iomem *base = swd->base; @@ -820,7 +823,7 @@ static int swim_floppy_init(struct swim_priv *swd) goto exit_put_disks; swd->unit[drive].disk = - blk_mq_alloc_disk(&swd->unit[drive].tag_set, NULL, + blk_mq_alloc_disk(&swd->unit[drive].tag_set, &lim, &swd->unit[drive]); if (IS_ERR(swd->unit[drive].disk)) { blk_mq_free_tag_set(&swd->unit[drive].tag_set); diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index a04756ac778e..90be1017f7bf 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1189,6 +1189,9 @@ static int swim3_add_device(struct macio_dev *mdev, int index) static int swim3_attach(struct macio_dev *mdev, const struct of_device_id *match) { + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + }; struct floppy_state *fs; struct gendisk *disk; int rc; @@ -1210,7 +1213,7 @@ static int swim3_attach(struct macio_dev *mdev, if (rc) goto out_unregister; - disk = blk_mq_alloc_disk(&fs->tag_set, NULL, fs); + disk = blk_mq_alloc_disk(&fs->tag_set, &lim, fs); if (IS_ERR(disk)) { rc = PTR_ERR(disk); goto out_free_tag_set; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4e159948c912..6fb799ec0d88 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -248,8 +248,6 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) static void ublk_dev_param_zoned_apply(struct ublk_device *ub) { - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); - ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); } @@ -484,16 +482,8 @@ static inline unsigned ublk_pos_to_tag(loff_t pos) static void ublk_dev_param_basic_apply(struct ublk_device *ub) { - struct request_queue *q = ub->ub_disk->queue; const struct ublk_param_basic *p = &ub->params.basic; - blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE, - p->attrs & UBLK_ATTR_FUA); - if (p->attrs & UBLK_ATTR_ROTATIONAL) - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - if (p->attrs & UBLK_ATTR_READ_ONLY) set_disk_ro(ub->ub_disk, true); @@ -2204,12 +2194,21 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) return -EOPNOTSUPP; - lim.zoned = true; + lim.features |= BLK_FEAT_ZONED; lim.max_active_zones = p->max_active_zones; lim.max_open_zones = p->max_open_zones; lim.max_zone_append_sectors = p->max_zone_append_sectors; } + if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { + lim.features |= BLK_FEAT_WRITE_CACHE; + if (ub->params.basic.attrs & UBLK_ATTR_FUA) + lim.features |= BLK_FEAT_FUA; + } + + if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) + 
lim.features |= BLK_FEAT_ROTATIONAL; + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; @@ -3017,4 +3016,5 @@ module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644); MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)"); MODULE_AUTHOR("Ming Lei <[email protected]>"); +MODULE_DESCRIPTION("Userspace block device"); MODULE_LICENSE("GPL"); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2351f411fa46..e3147a611151 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -728,7 +728,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk, dev_dbg(&vdev->dev, "probing host-managed zoned device\n"); - lim->zoned = true; + lim->features |= BLK_FEAT_ZONED; virtio_cread(vdev, struct virtio_blk_config, zoned.max_open_zones, &v); @@ -1089,14 +1089,6 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev) return writeback; } -static void virtblk_update_cache_mode(struct virtio_device *vdev) -{ - u8 writeback = virtblk_get_cache_mode(vdev); - struct virtio_blk *vblk = vdev->priv; - - blk_queue_write_cache(vblk->disk->queue, writeback, false); -} - static const char *const virtblk_cache_types[] = { "write through", "write back" }; @@ -1108,6 +1100,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr, struct gendisk *disk = dev_to_disk(dev); struct virtio_blk *vblk = disk->private_data; struct virtio_device *vdev = vblk->vdev; + struct queue_limits lim; int i; BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE)); @@ -1116,7 +1109,17 @@ cache_type_store(struct device *dev, struct device_attribute *attr, return i; virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i); - virtblk_update_cache_mode(vdev); + + lim = queue_limits_start_update(disk->queue); + if (virtblk_get_cache_mode(vdev)) + lim.features |= BLK_FEAT_WRITE_CACHE; + else + lim.features &= ~BLK_FEAT_WRITE_CACHE; + blk_mq_freeze_queue(disk->queue); + i = queue_limits_commit_update(disk->queue, &lim); + blk_mq_unfreeze_queue(disk->queue); + if (i) + return i; return count; } @@ -1247,7 +1250,7 @@ static int virtblk_read_limits(struct virtio_blk *vblk, struct queue_limits *lim) { struct virtio_device *vdev = vblk->vdev; - u32 v, blk_size, max_size, sg_elems, opt_io_size; + u32 v, max_size, sg_elems, opt_io_size; u32 max_discard_segs = 0; u32 discard_granularity = 0; u16 min_io_size; @@ -1286,46 +1289,36 @@ static int virtblk_read_limits(struct virtio_blk *vblk, lim->max_segment_size = max_size; /* Host can optionally specify the block size of the device */ - err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, + virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, struct virtio_blk_config, blk_size, - &blk_size); - if (!err) { - err = blk_validate_block_size(blk_size); - if (err) { - dev_err(&vdev->dev, - "virtio_blk: invalid block size: 0x%x\n", - blk_size); - return err; - } - - lim->logical_block_size = blk_size; - } else - blk_size = lim->logical_block_size; + &lim->logical_block_size); /* Use topology information if available */ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, physical_block_exp, &physical_block_exp); if (!err && physical_block_exp) - lim->physical_block_size = blk_size * (1 << physical_block_exp); + lim->physical_block_size = + lim->logical_block_size * (1 << physical_block_exp); err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, alignment_offset, &alignment_offset); if (!err && 
alignment_offset) - lim->alignment_offset = blk_size * alignment_offset; + lim->alignment_offset = + lim->logical_block_size * alignment_offset; err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, min_io_size, &min_io_size); if (!err && min_io_size) - lim->io_min = blk_size * min_io_size; + lim->io_min = lim->logical_block_size * min_io_size; err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, opt_io_size, &opt_io_size); if (!err && opt_io_size) - lim->io_opt = blk_size * opt_io_size; + lim->io_opt = lim->logical_block_size * opt_io_size; if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { virtio_cread(vdev, struct virtio_blk_config, @@ -1419,7 +1412,7 @@ static int virtblk_read_limits(struct virtio_blk *vblk, lim->discard_granularity = discard_granularity << SECTOR_SHIFT; else - lim->discard_granularity = blk_size; + lim->discard_granularity = lim->logical_block_size; } if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) { @@ -1448,7 +1441,10 @@ static int virtblk_read_limits(struct virtio_blk *vblk, static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; - struct queue_limits lim = { }; + struct queue_limits lim = { + .features = BLK_FEAT_ROTATIONAL, + .logical_block_size = SECTOR_SIZE, + }; int err, index; unsigned int queue_depth; @@ -1512,6 +1508,9 @@ static int virtblk_probe(struct virtio_device *vdev) if (err) goto out_free_tags; + if (virtblk_get_cache_mode(vdev)) + lim.features |= BLK_FEAT_WRITE_CACHE; + vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, &lim, vblk); if (IS_ERR(vblk->disk)) { err = PTR_ERR(vblk->disk); @@ -1527,9 +1526,6 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->disk->fops = &virtblk_fops; vblk->index = index; - /* configure queue flush support */ - virtblk_update_cache_mode(vdev); - /* If disk is read-only in the host, the guest should obey */ if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) set_disk_ro(vblk->disk, 1); @@ -1541,8 +1537,8 @@ static int virtblk_probe(struct virtio_device *vdev) * All steps that follow use the VQs therefore they need to be * placed after the virtio_device_ready() call above. */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) { - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue); + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (lim.features & BLK_FEAT_ZONED)) { err = blk_revalidate_disk_zones(vblk->disk); if (err) goto out_cleanup_disk; diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 944576d582fb..838064593f62 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1563,5 +1563,6 @@ static void __exit xen_blkif_fini(void) module_exit(xen_blkif_fini); +MODULE_DESCRIPTION("Virtual block device back-end driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("xen-backend:vbd"); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index fd7c0ff2139c..59ce113b882a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -788,6 +788,11 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri * A barrier request a superset of FUA, so we can * implement it the same way. (It's also a FLUSH+FUA, * since it is guaranteed ordered WRT previous writes.) + * + * Note that we can end up here with a FUA write and the + * flags cleared. This happens when the flag was + * run-time disabled after a failing I/O, and we'll + * simply submit it as a normal write. 
*/ if (info->feature_flush && info->feature_fua) ring_req->operation = @@ -795,8 +800,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri else if (info->feature_flush) ring_req->operation = BLKIF_OP_FLUSH_DISKCACHE; - else - ring_req->operation = 0; } ring_req->u.rw.nr_segments = num_grant; if (unlikely(require_extra_req)) { @@ -887,16 +890,6 @@ static inline void flush_requests(struct blkfront_ring_info *rinfo) notify_remote_via_irq(rinfo->irq); } -static inline bool blkif_request_flush_invalid(struct request *req, - struct blkfront_info *info) -{ - return (blk_rq_is_passthrough(req) || - ((req_op(req) == REQ_OP_FLUSH) && - !info->feature_flush) || - ((req->cmd_flags & REQ_FUA) && - !info->feature_fua)); -} - static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { @@ -908,12 +901,22 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, rinfo = get_rinfo(info, qid); blk_mq_start_request(qd->rq); spin_lock_irqsave(&rinfo->ring_lock, flags); - if (RING_FULL(&rinfo->ring)) - goto out_busy; - if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info)) - goto out_err; + /* + * Check if the backend actually supports flushes. + * + * While the block layer won't send us flushes if we don't claim to + * support them, the Xen protocol allows the backend to revoke support + * at any time. That is of course a really bad idea and dangerous, but + * has been allowed for 10+ years. In that case we simply clear the + * flags, and directly return here for an empty flush and ignore the + * FUA flag later on. + */ + if (unlikely(req_op(qd->rq) == REQ_OP_FLUSH && !info->feature_flush)) + goto complete; + if (RING_FULL(&rinfo->ring)) + goto out_busy; if (blkif_queue_request(qd->rq, rinfo)) goto out_busy; @@ -921,14 +924,14 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_STS_OK; -out_err: - spin_unlock_irqrestore(&rinfo->ring_lock, flags); - return BLK_STS_IOERR; - out_busy: blk_mq_stop_hw_queue(hctx); spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_STS_DEV_RESOURCE; +complete: + spin_unlock_irqrestore(&rinfo->ring_lock, flags); + blk_mq_end_request(qd->rq, BLK_STS_OK); + return BLK_STS_OK; } static void blkif_complete_rq(struct request *rq) @@ -956,6 +959,12 @@ static void blkif_set_queue_limits(const struct blkfront_info *info, lim->max_secure_erase_sectors = UINT_MAX; } + if (info->feature_flush) { + lim->features |= BLK_FEAT_WRITE_CACHE; + if (info->feature_fua) + lim->features |= BLK_FEAT_FUA; + } + /* Hard sector size and max sectors impersonate the equiv. hardware. */ lim->logical_block_size = info->sector_size; lim->physical_block_size = info->physical_sector_size; @@ -984,8 +993,6 @@ static const char *flush_info(struct blkfront_info *info) static void xlvbd_flush(struct blkfront_info *info) { - blk_queue_write_cache(info->rq, info->feature_flush ? true : false, - info->feature_fua ? true : false); pr_info("blkfront: %s: %s %s %s %s %s %s %s\n", info->gd->disk_name, flush_info(info), "persistent grants:", info->feature_persistent ? 
@@ -1063,8 +1070,7 @@ static char *encode_disk_name(char *ptr, unsigned int n) } static int xlvbd_alloc_gendisk(blkif_sector_t capacity, - struct blkfront_info *info, u16 sector_size, - unsigned int physical_sector_size) + struct blkfront_info *info) { struct queue_limits lim = {}; struct gendisk *gd; @@ -1139,7 +1145,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, err = PTR_ERR(gd); goto out_free_tag_set; } - blk_queue_flag_set(QUEUE_FLAG_VIRT, gd->queue); strcpy(gd->disk_name, DEV_NAME); ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); @@ -1159,8 +1164,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, info->rq = gd->queue; info->gd = gd; - info->sector_size = sector_size; - info->physical_sector_size = physical_sector_size; xlvbd_flush(info); @@ -1605,8 +1608,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; - blk_queue_max_discard_sectors(rq, 0); - blk_queue_max_secure_erase_sectors(rq, 0); + blk_queue_disable_discard(rq); + blk_queue_disable_secure_erase(rq); } break; case BLKIF_OP_FLUSH_DISKCACHE: @@ -1627,7 +1630,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) blkif_req(req)->error = BLK_STS_OK; info->feature_fua = 0; info->feature_flush = 0; - xlvbd_flush(info); } fallthrough; case BLKIF_OP_READ: @@ -2315,8 +2317,6 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) static void blkfront_connect(struct blkfront_info *info) { unsigned long long sectors; - unsigned long sector_size; - unsigned int physical_sector_size; int err, i; struct blkfront_ring_info *rinfo; @@ -2355,7 +2355,7 @@ static void blkfront_connect(struct blkfront_info *info) err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "sectors", "%llu", §ors, "info", "%u", &info->vdisk_info, - "sector-size", "%lu", §or_size, + "sector-size", "%lu", &info->sector_size, NULL); if (err) { xenbus_dev_fatal(info->xbdev, err, @@ -2369,9 +2369,9 @@ static void blkfront_connect(struct blkfront_info *info) * provide this. Assume physical sector size to be the same as * sector_size in that case. 
*/ - physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend, + info->physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend, "physical-sector-size", - sector_size); + info->sector_size); blkfront_gather_backend_features(info); for_each_rinfo(info, rinfo, i) { err = blkfront_setup_indirect(rinfo); @@ -2383,8 +2383,7 @@ static void blkfront_connect(struct blkfront_info *info) } } - err = xlvbd_alloc_gendisk(sectors, info, sector_size, - physical_sector_size); + err = xlvbd_alloc_gendisk(sectors, info); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", info->xbdev->otherend); diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 7c5f4e4d9b50..4b7219be1bb8 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -409,4 +409,5 @@ static void __exit z2_exit(void) module_init(z2_init); module_exit(z2_exit); +MODULE_DESCRIPTION("Amiga Zorro II ramdisk driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3acd7006ad2c..efcb8d9d274c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2208,6 +2208,8 @@ static int zram_add(void) #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE .max_write_zeroes_sectors = UINT_MAX, #endif + .features = BLK_FEAT_STABLE_WRITES | + BLK_FEAT_SYNCHRONOUS, }; struct zram *zram; int ret, device_id; @@ -2245,10 +2247,6 @@ static int zram_add(void) /* Actual capacity set using sysfs (/sys/block/zram<id>/disksize */ set_capacity(zram->disk, 0); - /* zram devices sort of resembles non-rotational disks */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); - blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk; diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index 5b6805d87fcf..dd3c0626c72d 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -382,7 +382,7 @@ static int btintel_pcie_recv_frame(struct btintel_pcie_data *data, /* The first 4 bytes indicates the Intel PCIe specific packet type */ pdata = skb_pull_data(skb, BTINTEL_PCIE_HCI_TYPE_LEN); - if (!data) { + if (!pdata) { bt_dev_err(hdev, "Corrupted packet received"); ret = -EILSEQ; goto exit_error; diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 9d0c7e278114..9bfa9a6ad56c 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -281,7 +281,7 @@ static u8 crc8_table[CRC8_TABLE_SIZE]; /* Default configurations */ #define DEFAULT_H2C_WAKEUP_MODE WAKEUP_METHOD_BREAK -#define DEFAULT_PS_MODE PS_MODE_DISABLE +#define DEFAULT_PS_MODE PS_MODE_ENABLE #define FW_INIT_BAUDRATE HCI_NXP_PRI_BAUDRATE static struct sk_buff *nxp_drv_send_cmd(struct hci_dev *hdev, u16 opcode, diff --git a/drivers/bluetooth/hci_bcm4377.c b/drivers/bluetooth/hci_bcm4377.c index 0c2f15235b4c..d90858ea2fe5 100644 --- a/drivers/bluetooth/hci_bcm4377.c +++ b/drivers/bluetooth/hci_bcm4377.c @@ -495,6 +495,10 @@ struct bcm4377_data; * extended scanning * broken_mws_transport_config: Set to true if the chip erroneously claims to * support MWS Transport Configuration + * broken_le_ext_adv_report_phy: Set to true if this chip stuffs flags inside + * reserved bits of Primary/Secondary_PHY inside + * LE Extended Advertising Report events which + * have to be ignored * send_calibration: Optional callback to send calibration data * 
send_ptb: Callback to send "PTB" regulatory/calibration data */ @@ -513,6 +517,7 @@ struct bcm4377_hw { unsigned long broken_ext_scan : 1; unsigned long broken_mws_transport_config : 1; unsigned long broken_le_coded : 1; + unsigned long broken_le_ext_adv_report_phy : 1; int (*send_calibration)(struct bcm4377_data *bcm4377); int (*send_ptb)(struct bcm4377_data *bcm4377, @@ -716,7 +721,7 @@ static void bcm4377_handle_ack(struct bcm4377_data *bcm4377, ring->events[msgid] = NULL; } - bitmap_release_region(ring->msgids, msgid, ring->n_entries); + bitmap_release_region(ring->msgids, msgid, 0); unlock: spin_unlock_irqrestore(&ring->lock, flags); @@ -2373,6 +2378,8 @@ static int bcm4377_probe(struct pci_dev *pdev, const struct pci_device_id *id) set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks); if (bcm4377->hw->broken_le_coded) set_bit(HCI_QUIRK_BROKEN_LE_CODED, &hdev->quirks); + if (bcm4377->hw->broken_le_ext_adv_report_phy) + set_bit(HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, &hdev->quirks); pci_set_drvdata(pdev, bcm4377); hci_set_drvdata(hdev, bcm4377); @@ -2477,6 +2484,7 @@ static const struct bcm4377_hw bcm4377_hw_variants[] = { .clear_pciecfg_subsystem_ctrl_bit19 = true, .broken_mws_transport_config = true, .broken_le_coded = true, + .broken_le_ext_adv_report_phy = true, .send_calibration = bcm4387_send_calibration, .send_ptb = bcm4378_send_ptb, }, diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 0c9c9ee56592..9a0bc86f9aac 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -2450,15 +2450,27 @@ static void qca_serdev_shutdown(struct device *dev) struct qca_serdev *qcadev = serdev_device_get_drvdata(serdev); struct hci_uart *hu = &qcadev->serdev_hu; struct hci_dev *hdev = hu->hdev; - struct qca_data *qca = hu->priv; const u8 ibs_wake_cmd[] = { 0xFD }; const u8 edl_reset_soc_cmd[] = { 0x01, 0x00, 0xFC, 0x01, 0x05 }; if (qcadev->btsoc_type == QCA_QCA6390) { - if (test_bit(QCA_BT_OFF, &qca->flags) || - !test_bit(HCI_RUNNING, &hdev->flags)) + /* The purpose of sending the VSC is to reset the SoC into an + * initial state that ensures the next hdev->setup() succeeds. + * If HCI_QUIRK_NON_PERSISTENT_SETUP is set, hdev->setup() can + * do its job regardless of the SoC state, so there is no need + * to send the VSC. + * If HCI_SETUP is set, hdev->setup() was never invoked and the + * SoC is already in the initial state, so the VSC is not + * needed either. + */ + if (test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks) || + hci_dev_test_flag(hdev, HCI_SETUP)) return; + /* The serdev must be in the open state when control flow + * arrives here; this also fixes the use-after-free caused by + * flushing or writing to the serdev after it has been closed. 
+ */ serdev_device_write_flush(serdev); ret = serdev_device_write_buf(serdev, ibs_wake_cmd, sizeof(ibs_wake_cmd)); diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 20c90ebb3a3f..49e4829b7264 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3708,4 +3708,5 @@ static void __exit cdrom_exit(void) module_init(cdrom_init); module_exit(cdrom_exit); +MODULE_DESCRIPTION("Uniform CD-ROM driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index eefdd422ad8e..71cfe7a85913 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -744,6 +744,7 @@ static int probe_gdrom(struct platform_device *devptr) .max_segments = 1, /* set a large max size to get most from DMA */ .max_segment_size = 0x40000, + .features = BLK_FEAT_ROTATIONAL, }; int err; diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index d51fc8321d41..da32e8ed0830 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -269,8 +269,13 @@ hpet_read(struct file *file, char __user *buf, size_t count, loff_t * ppos) if (!devp->hd_ireqfreq) return -EIO; - if (count < sizeof(unsigned long)) - return -EINVAL; + if (in_compat_syscall()) { + if (count < sizeof(compat_ulong_t)) + return -EINVAL; + } else { + if (count < sizeof(unsigned long)) + return -EINVAL; + } add_wait_queue(&devp->hd_waitqueue, &wait); @@ -294,9 +299,16 @@ hpet_read(struct file *file, char __user *buf, size_t count, loff_t * ppos) schedule(); } - retval = put_user(data, (unsigned long __user *)buf); - if (!retval) - retval = sizeof(unsigned long); + if (in_compat_syscall()) { + retval = put_user(data, (compat_ulong_t __user *)buf); + if (!retval) + retval = sizeof(compat_ulong_t); + } else { + retval = put_user(data, (unsigned long __user *)buf); + if (!retval) + retval = sizeof(unsigned long); + } + out: __set_current_state(TASK_RUNNING); remove_wait_queue(&devp->hd_waitqueue, &wait); @@ -651,12 +663,24 @@ struct compat_hpet_info { unsigned short hi_timer; }; +/* 32-bit types would lead to different command codes which should be + * translated into 64-bit ones before being passed to hpet_ioctl_common + */ +#define COMPAT_HPET_INFO _IOR('h', 0x03, struct compat_hpet_info) +#define COMPAT_HPET_IRQFREQ _IOW('h', 0x6, compat_ulong_t) + static long hpet_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct hpet_info info; int err; + if (cmd == COMPAT_HPET_INFO) + cmd = HPET_INFO; + + if (cmd == COMPAT_HPET_IRQFREQ) + cmd = HPET_IRQFREQ; + mutex_lock(&hpet_mutex); err = hpet_ioctl_common(file->private_data, cmd, arg, &info); mutex_unlock(&hpet_mutex); diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile index 4c695b0388f3..9bb142c75243 100644 --- a/drivers/char/tpm/Makefile +++ b/drivers/char/tpm/Makefile @@ -16,8 +16,8 @@ tpm-y += eventlog/common.o tpm-y += eventlog/tpm1.o tpm-y += eventlog/tpm2.o tpm-y += tpm-buf.o +tpm-y += tpm2-sessions.o -tpm-$(CONFIG_TCG_TPM2_HMAC) += tpm2-sessions.o tpm-$(CONFIG_ACPI) += tpm_ppi.o eventlog/acpi.o tpm-$(CONFIG_EFI) += eventlog/efi.o tpm-$(CONFIG_OF) += eventlog/of.o diff --git a/drivers/char/tpm/eventlog/common.c b/drivers/char/tpm/eventlog/common.c index 639c3f395a5a..4c0bbba64ee5 100644 --- a/drivers/char/tpm/eventlog/common.c +++ b/drivers/char/tpm/eventlog/common.c @@ -47,6 +47,8 @@ static int tpm_bios_measurements_open(struct inode *inode, if (!err) { seq = file->private_data; seq->private = chip; + } else { + put_device(&chip->dev); } return err; diff --git a/drivers/char/tpm/tpm2-sessions.c 
b/drivers/char/tpm/tpm2-sessions.c index 907ac9956a78..2281d55df545 100644 --- a/drivers/char/tpm/tpm2-sessions.c +++ b/drivers/char/tpm/tpm2-sessions.c @@ -83,9 +83,6 @@ #define AES_KEY_BYTES AES_KEYSIZE_128 #define AES_KEY_BITS (AES_KEY_BYTES*8) -static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, - u32 *handle, u8 *name); - /* * This is the structure that carries all the auth information (like * session handle, nonces, session key and auth) from use to use it is @@ -148,6 +145,7 @@ struct tpm2_auth { u8 name[AUTH_MAX_NAMES][2 + SHA512_DIGEST_SIZE]; }; +#ifdef CONFIG_TCG_TPM2_HMAC /* * Name Size based on TPM algorithm (assumes no hash bigger than 255) */ @@ -163,6 +161,226 @@ static u8 name_size(const u8 *name) return size_map[alg] + 2; } +static int tpm2_parse_read_public(char *name, struct tpm_buf *buf) +{ + struct tpm_header *head = (struct tpm_header *)buf->data; + off_t offset = TPM_HEADER_SIZE; + u32 tot_len = be32_to_cpu(head->length); + u32 val; + + /* we're starting after the header so adjust the length */ + tot_len -= TPM_HEADER_SIZE; + + /* skip public */ + val = tpm_buf_read_u16(buf, &offset); + if (val > tot_len) + return -EINVAL; + offset += val; + /* name */ + val = tpm_buf_read_u16(buf, &offset); + if (val != name_size(&buf->data[offset])) + return -EINVAL; + memcpy(name, &buf->data[offset], val); + /* forget the rest */ + return 0; +} + +static int tpm2_read_public(struct tpm_chip *chip, u32 handle, char *name) +{ + struct tpm_buf buf; + int rc; + + rc = tpm_buf_init(&buf, TPM2_ST_NO_SESSIONS, TPM2_CC_READ_PUBLIC); + if (rc) + return rc; + + tpm_buf_append_u32(&buf, handle); + rc = tpm_transmit_cmd(chip, &buf, 0, "read public"); + if (rc == TPM2_RC_SUCCESS) + rc = tpm2_parse_read_public(name, &buf); + + tpm_buf_destroy(&buf); + + return rc; +} +#endif /* CONFIG_TCG_TPM2_HMAC */ + +/** + * tpm_buf_append_name() - add a handle area to the buffer + * @chip: the TPM chip structure + * @buf: The buffer to be appended + * @handle: The handle to be appended + * @name: The name of the handle (may be NULL) + * + * In order to compute session HMACs, we need to know the names of the + * objects pointed to by the handles. For most objects, this is simply + * the actual 4 byte handle or an empty buf (in these cases @name + * should be NULL) but for volatile objects, permanent objects and NV + * areas, the name is defined as the hash (according to the name + * algorithm which should be set to sha256) of the public area to + * which the two byte algorithm id has been appended. For these + * objects, the @name pointer should point to this. If a name is + * required but @name is NULL, then TPM2_ReadPublic() will be called + * on the handle to obtain the name. + * + * As with most tpm_buf operations, success is assumed because failure + * will be caused by an incorrect programming model and indicated by a + * kernel message. 
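For reference, the TPM2 object name described in the comment above is a two-byte big-endian algorithm identifier followed by the digest of the object's public area, which is exactly the length calculation name_size() performs. Below is a standalone sketch of the same calculation (illustrative only, not part of the patch; the ALG_* values are the usual TCG algorithm registry identifiers).

/* Illustrative sketch, not part of the patch. */
#include <stdint.h>
#include <stdio.h>

/* TCG algorithm registry identifiers (big-endian in the name buffer). */
#define ALG_SHA1    0x0004
#define ALG_SHA256  0x000B
#define ALG_SHA384  0x000C
#define ALG_SHA512  0x000D

/*
 * A TPM2 object name is a 2-byte algorithm id followed by the digest of
 * the object's public area, so its total length is 2 + digest size.
 */
static size_t example_name_size(const uint8_t *name)
{
    uint16_t alg = (uint16_t)((name[0] << 8) | name[1]);

    switch (alg) {
    case ALG_SHA1:   return 2 + 20;
    case ALG_SHA256: return 2 + 32;
    case ALG_SHA384: return 2 + 48;
    case ALG_SHA512: return 2 + 64;
    default:         return 0;  /* unknown hash, treat as invalid */
    }
}

int main(void)
{
    uint8_t sha256_name[34] = { 0x00, 0x0B };   /* followed by 32 digest bytes */

    printf("expected name length: %zu\n", example_name_size(sha256_name));
    return 0;
}

This is the check tpm2_parse_read_public() applies to the name field returned by TPM2_ReadPublic().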
+ */ +void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf, + u32 handle, u8 *name) +{ +#ifdef CONFIG_TCG_TPM2_HMAC + enum tpm2_mso_type mso = tpm2_handle_mso(handle); + struct tpm2_auth *auth; + int slot; +#endif + + if (!tpm2_chip_auth(chip)) { + tpm_buf_append_u32(buf, handle); + /* count the number of handles in the upper bits of flags */ + buf->handles++; + return; + } + +#ifdef CONFIG_TCG_TPM2_HMAC + slot = (tpm_buf_length(buf) - TPM_HEADER_SIZE) / 4; + if (slot >= AUTH_MAX_NAMES) { + dev_err(&chip->dev, "TPM: too many handles\n"); + return; + } + auth = chip->auth; + WARN(auth->session != tpm_buf_length(buf), + "name added in wrong place\n"); + tpm_buf_append_u32(buf, handle); + auth->session += 4; + + if (mso == TPM2_MSO_PERSISTENT || + mso == TPM2_MSO_VOLATILE || + mso == TPM2_MSO_NVRAM) { + if (!name) + tpm2_read_public(chip, handle, auth->name[slot]); + } else { + if (name) + dev_err(&chip->dev, "TPM: Handle does not require name but one is specified\n"); + } + + auth->name_h[slot] = handle; + if (name) + memcpy(auth->name[slot], name, name_size(name)); +#endif +} +EXPORT_SYMBOL_GPL(tpm_buf_append_name); + +/** + * tpm_buf_append_hmac_session() - Append a TPM session element + * @chip: the TPM chip structure + * @buf: The buffer to be appended + * @attributes: The session attributes + * @passphrase: The session authority (NULL if none) + * @passphrase_len: The length of the session authority (0 if none) + * + * This fills in a session structure in the TPM command buffer, except + * for the HMAC which cannot be computed until the command buffer is + * complete. The type of session is controlled by the @attributes, + * the main ones of which are TPM2_SA_CONTINUE_SESSION which means the + * session won't terminate after tpm_buf_check_hmac_response(), + * TPM2_SA_DECRYPT which means this buffers first parameter should be + * encrypted with a session key and TPM2_SA_ENCRYPT, which means the + * response buffer's first parameter needs to be decrypted (confusing, + * but the defines are written from the point of view of the TPM). + * + * Any session appended by this command must be finalized by calling + * tpm_buf_fill_hmac_session() otherwise the HMAC will be incorrect + * and the TPM will reject the command. + * + * As with most tpm_buf operations, success is assumed because failure + * will be caused by an incorrect programming model and indicated by a + * kernel message. 
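The fixed 9 bytes added in the non-HMAC branch below are the size of a plain password authorization area without its passphrase: a 4-byte TPM2_RS_PW handle, a 2-byte empty nonce size, a 1-byte attributes field and a 2-byte passphrase length. A minimal userspace sketch of that layout (illustrative only, not part of the patch):

/* Illustrative sketch, not part of the patch. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TPM2_RS_PW 0x40000009u  /* password authorization handle */

static size_t put_be32(uint8_t *p, uint32_t v)
{
    p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
    return 4;
}

static size_t put_be16(uint8_t *p, uint16_t v)
{
    p[0] = v >> 8; p[1] = v;
    return 2;
}

/*
 * A plain password session element is:
 *   4 bytes  auth handle (TPM2_RS_PW)
 *   2 bytes  nonce size (0, no nonce)
 *   1 byte   session attributes (0)
 *   2 bytes  passphrase size
 *   n bytes  passphrase
 * i.e. 9 bytes of fixed overhead plus the passphrase itself.
 */
static size_t pack_password_auth(uint8_t *out, const uint8_t *pw, uint16_t pw_len)
{
    size_t off = 0;

    off += put_be32(out + off, TPM2_RS_PW);
    off += put_be16(out + off, 0);      /* empty nonce */
    out[off++] = 0;                     /* attributes */
    off += put_be16(out + off, pw_len);
    memcpy(out + off, pw, pw_len);
    return off + pw_len;
}

int main(void)
{
    uint8_t buf[64];
    size_t len = pack_password_auth(buf, (const uint8_t *)"secret", 6);

    printf("auth area length: %zu (9 + 6)\n", len);
    return 0;
}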
+ */ +void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, + u8 attributes, u8 *passphrase, + int passphrase_len) +{ +#ifdef CONFIG_TCG_TPM2_HMAC + u8 nonce[SHA256_DIGEST_SIZE]; + struct tpm2_auth *auth; + u32 len; +#endif + + if (!tpm2_chip_auth(chip)) { + /* offset tells us where the sessions area begins */ + int offset = buf->handles * 4 + TPM_HEADER_SIZE; + u32 len = 9 + passphrase_len; + + if (tpm_buf_length(buf) != offset) { + /* not the first session so update the existing length */ + len += get_unaligned_be32(&buf->data[offset]); + put_unaligned_be32(len, &buf->data[offset]); + } else { + tpm_buf_append_u32(buf, len); + } + /* auth handle */ + tpm_buf_append_u32(buf, TPM2_RS_PW); + /* nonce */ + tpm_buf_append_u16(buf, 0); + /* attributes */ + tpm_buf_append_u8(buf, 0); + /* passphrase */ + tpm_buf_append_u16(buf, passphrase_len); + tpm_buf_append(buf, passphrase, passphrase_len); + return; + } + +#ifdef CONFIG_TCG_TPM2_HMAC + /* + * The Architecture Guide requires us to strip trailing zeros + * before computing the HMAC + */ + while (passphrase && passphrase_len > 0 && passphrase[passphrase_len - 1] == '\0') + passphrase_len--; + + auth = chip->auth; + auth->attrs = attributes; + auth->passphrase_len = passphrase_len; + if (passphrase_len) + memcpy(auth->passphrase, passphrase, passphrase_len); + + if (auth->session != tpm_buf_length(buf)) { + /* we're not the first session */ + len = get_unaligned_be32(&buf->data[auth->session]); + if (4 + len + auth->session != tpm_buf_length(buf)) { + WARN(1, "session length mismatch, cannot append"); + return; + } + + /* add our new session */ + len += 9 + 2 * SHA256_DIGEST_SIZE; + put_unaligned_be32(len, &buf->data[auth->session]); + } else { + tpm_buf_append_u32(buf, 9 + 2 * SHA256_DIGEST_SIZE); + } + + /* random number for our nonce */ + get_random_bytes(nonce, sizeof(nonce)); + memcpy(auth->our_nonce, nonce, sizeof(nonce)); + tpm_buf_append_u32(buf, auth->handle); + /* our new nonce */ + tpm_buf_append_u16(buf, SHA256_DIGEST_SIZE); + tpm_buf_append(buf, nonce, SHA256_DIGEST_SIZE); + tpm_buf_append_u8(buf, auth->attrs); + /* and put a placeholder for the hmac */ + tpm_buf_append_u16(buf, SHA256_DIGEST_SIZE); + tpm_buf_append(buf, nonce, SHA256_DIGEST_SIZE); +#endif +} +EXPORT_SYMBOL_GPL(tpm_buf_append_hmac_session); + +#ifdef CONFIG_TCG_TPM2_HMAC + +static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, + u32 *handle, u8 *name); + /* * It turns out the crypto hmac(sha256) is hard for us to consume * because it assumes a fixed key and the TPM seems to change the key @@ -344,82 +562,6 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip) } /** - * tpm_buf_append_hmac_session() - Append a TPM session element - * @chip: the TPM chip structure - * @buf: The buffer to be appended - * @attributes: The session attributes - * @passphrase: The session authority (NULL if none) - * @passphrase_len: The length of the session authority (0 if none) - * - * This fills in a session structure in the TPM command buffer, except - * for the HMAC which cannot be computed until the command buffer is - * complete. 
The type of session is controlled by the @attributes, - * the main ones of which are TPM2_SA_CONTINUE_SESSION which means the - * session won't terminate after tpm_buf_check_hmac_response(), - * TPM2_SA_DECRYPT which means this buffers first parameter should be - * encrypted with a session key and TPM2_SA_ENCRYPT, which means the - * response buffer's first parameter needs to be decrypted (confusing, - * but the defines are written from the point of view of the TPM). - * - * Any session appended by this command must be finalized by calling - * tpm_buf_fill_hmac_session() otherwise the HMAC will be incorrect - * and the TPM will reject the command. - * - * As with most tpm_buf operations, success is assumed because failure - * will be caused by an incorrect programming model and indicated by a - * kernel message. - */ -void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, - u8 attributes, u8 *passphrase, - int passphrase_len) -{ - u8 nonce[SHA256_DIGEST_SIZE]; - u32 len; - struct tpm2_auth *auth = chip->auth; - - /* - * The Architecture Guide requires us to strip trailing zeros - * before computing the HMAC - */ - while (passphrase && passphrase_len > 0 - && passphrase[passphrase_len - 1] == '\0') - passphrase_len--; - - auth->attrs = attributes; - auth->passphrase_len = passphrase_len; - if (passphrase_len) - memcpy(auth->passphrase, passphrase, passphrase_len); - - if (auth->session != tpm_buf_length(buf)) { - /* we're not the first session */ - len = get_unaligned_be32(&buf->data[auth->session]); - if (4 + len + auth->session != tpm_buf_length(buf)) { - WARN(1, "session length mismatch, cannot append"); - return; - } - - /* add our new session */ - len += 9 + 2 * SHA256_DIGEST_SIZE; - put_unaligned_be32(len, &buf->data[auth->session]); - } else { - tpm_buf_append_u32(buf, 9 + 2 * SHA256_DIGEST_SIZE); - } - - /* random number for our nonce */ - get_random_bytes(nonce, sizeof(nonce)); - memcpy(auth->our_nonce, nonce, sizeof(nonce)); - tpm_buf_append_u32(buf, auth->handle); - /* our new nonce */ - tpm_buf_append_u16(buf, SHA256_DIGEST_SIZE); - tpm_buf_append(buf, nonce, SHA256_DIGEST_SIZE); - tpm_buf_append_u8(buf, auth->attrs); - /* and put a placeholder for the hmac */ - tpm_buf_append_u16(buf, SHA256_DIGEST_SIZE); - tpm_buf_append(buf, nonce, SHA256_DIGEST_SIZE); -} -EXPORT_SYMBOL(tpm_buf_append_hmac_session); - -/** * tpm_buf_fill_hmac_session() - finalize the session HMAC * @chip: the TPM chip structure * @buf: The buffer to be appended @@ -449,6 +591,9 @@ void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf) u8 cphash[SHA256_DIGEST_SIZE]; struct sha256_state sctx; + if (!auth) + return; + /* save the command code in BE format */ auth->ordinal = head->ordinal; @@ -567,104 +712,6 @@ void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf) } EXPORT_SYMBOL(tpm_buf_fill_hmac_session); -static int tpm2_parse_read_public(char *name, struct tpm_buf *buf) -{ - struct tpm_header *head = (struct tpm_header *)buf->data; - off_t offset = TPM_HEADER_SIZE; - u32 tot_len = be32_to_cpu(head->length); - u32 val; - - /* we're starting after the header so adjust the length */ - tot_len -= TPM_HEADER_SIZE; - - /* skip public */ - val = tpm_buf_read_u16(buf, &offset); - if (val > tot_len) - return -EINVAL; - offset += val; - /* name */ - val = tpm_buf_read_u16(buf, &offset); - if (val != name_size(&buf->data[offset])) - return -EINVAL; - memcpy(name, &buf->data[offset], val); - /* forget the rest */ - return 0; -} - -static int 
tpm2_read_public(struct tpm_chip *chip, u32 handle, char *name) -{ - struct tpm_buf buf; - int rc; - - rc = tpm_buf_init(&buf, TPM2_ST_NO_SESSIONS, TPM2_CC_READ_PUBLIC); - if (rc) - return rc; - - tpm_buf_append_u32(&buf, handle); - rc = tpm_transmit_cmd(chip, &buf, 0, "read public"); - if (rc == TPM2_RC_SUCCESS) - rc = tpm2_parse_read_public(name, &buf); - - tpm_buf_destroy(&buf); - - return rc; -} - -/** - * tpm_buf_append_name() - add a handle area to the buffer - * @chip: the TPM chip structure - * @buf: The buffer to be appended - * @handle: The handle to be appended - * @name: The name of the handle (may be NULL) - * - * In order to compute session HMACs, we need to know the names of the - * objects pointed to by the handles. For most objects, this is simply - * the actual 4 byte handle or an empty buf (in these cases @name - * should be NULL) but for volatile objects, permanent objects and NV - * areas, the name is defined as the hash (according to the name - * algorithm which should be set to sha256) of the public area to - * which the two byte algorithm id has been appended. For these - * objects, the @name pointer should point to this. If a name is - * required but @name is NULL, then TPM2_ReadPublic() will be called - * on the handle to obtain the name. - * - * As with most tpm_buf operations, success is assumed because failure - * will be caused by an incorrect programming model and indicated by a - * kernel message. - */ -void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf, - u32 handle, u8 *name) -{ - enum tpm2_mso_type mso = tpm2_handle_mso(handle); - struct tpm2_auth *auth = chip->auth; - int slot; - - slot = (tpm_buf_length(buf) - TPM_HEADER_SIZE)/4; - if (slot >= AUTH_MAX_NAMES) { - dev_err(&chip->dev, "TPM: too many handles\n"); - return; - } - WARN(auth->session != tpm_buf_length(buf), - "name added in wrong place\n"); - tpm_buf_append_u32(buf, handle); - auth->session += 4; - - if (mso == TPM2_MSO_PERSISTENT || - mso == TPM2_MSO_VOLATILE || - mso == TPM2_MSO_NVRAM) { - if (!name) - tpm2_read_public(chip, handle, auth->name[slot]); - } else { - if (name) - dev_err(&chip->dev, "TPM: Handle does not require name but one is specified\n"); - } - - auth->name_h[slot] = handle; - if (name) - memcpy(auth->name[slot], name, name_size(name)); -} -EXPORT_SYMBOL(tpm_buf_append_name); - /** * tpm_buf_check_hmac_response() - check the TPM return HMAC for correctness * @chip: the TPM chip structure @@ -705,6 +752,9 @@ int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf, u32 cc = be32_to_cpu(auth->ordinal); int parm_len, len, i, handles; + if (!auth) + return rc; + if (auth->session >= TPM_HEADER_SIZE) { WARN(1, "tpm session not filled correctly\n"); goto out; @@ -824,8 +874,13 @@ EXPORT_SYMBOL(tpm_buf_check_hmac_response); */ void tpm2_end_auth_session(struct tpm_chip *chip) { - tpm2_flush_context(chip, chip->auth->handle); - memzero_explicit(chip->auth, sizeof(*chip->auth)); + struct tpm2_auth *auth = chip->auth; + + if (!auth) + return; + + tpm2_flush_context(chip, auth->handle); + memzero_explicit(auth, sizeof(*auth)); } EXPORT_SYMBOL(tpm2_end_auth_session); @@ -907,6 +962,11 @@ int tpm2_start_auth_session(struct tpm_chip *chip) int rc; u32 null_key; + if (!auth) { + dev_warn_once(&chip->dev, "auth session is not active\n"); + return 0; + } + rc = tpm2_load_null(chip, &null_key); if (rc) goto out; @@ -1301,3 +1361,4 @@ int tpm2_sessions_init(struct tpm_chip *chip) return rc; } +#endif /* CONFIG_TCG_TPM2_HMAC */ diff --git 
a/drivers/char/tpm/tpm_tis_spi_main.c b/drivers/char/tpm/tpm_tis_spi_main.c index c9eca24bbad4..61b42c83ced8 100644 --- a/drivers/char/tpm/tpm_tis_spi_main.c +++ b/drivers/char/tpm/tpm_tis_spi_main.c @@ -318,6 +318,7 @@ static void tpm_tis_spi_remove(struct spi_device *dev) } static const struct spi_device_id tpm_tis_spi_id[] = { + { "attpm20p", (unsigned long)tpm_tis_spi_probe }, { "st33htpm-spi", (unsigned long)tpm_tis_spi_probe }, { "slb9670", (unsigned long)tpm_tis_spi_probe }, { "tpm_tis_spi", (unsigned long)tpm_tis_spi_probe }, diff --git a/drivers/clk/mediatek/clk-mt8183-mfgcfg.c b/drivers/clk/mediatek/clk-mt8183-mfgcfg.c index ba504e19d420..62d876e150e1 100644 --- a/drivers/clk/mediatek/clk-mt8183-mfgcfg.c +++ b/drivers/clk/mediatek/clk-mt8183-mfgcfg.c @@ -29,6 +29,7 @@ static const struct mtk_gate mfg_clks[] = { static const struct mtk_clk_desc mfg_desc = { .clks = mfg_clks, .num_clks = ARRAY_SIZE(mfg_clks), + .need_runtime_pm = true, }; static const struct of_device_id of_match_clk_mt8183_mfg[] = { diff --git a/drivers/clk/mediatek/clk-mtk.c b/drivers/clk/mediatek/clk-mtk.c index bd37ab4d1a9b..ba1d1c495bc2 100644 --- a/drivers/clk/mediatek/clk-mtk.c +++ b/drivers/clk/mediatek/clk-mtk.c @@ -496,14 +496,16 @@ static int __mtk_clk_simple_probe(struct platform_device *pdev, } - devm_pm_runtime_enable(&pdev->dev); - /* - * Do a pm_runtime_resume_and_get() to workaround a possible - * deadlock between clk_register() and the genpd framework. - */ - r = pm_runtime_resume_and_get(&pdev->dev); - if (r) - return r; + if (mcd->need_runtime_pm) { + devm_pm_runtime_enable(&pdev->dev); + /* + * Do a pm_runtime_resume_and_get() to workaround a possible + * deadlock between clk_register() and the genpd framework. + */ + r = pm_runtime_resume_and_get(&pdev->dev); + if (r) + return r; + } /* Calculate how many clk_hw_onecell_data entries to allocate */ num_clks = mcd->num_clks + mcd->num_composite_clks; @@ -585,7 +587,8 @@ static int __mtk_clk_simple_probe(struct platform_device *pdev, goto unregister_clks; } - pm_runtime_put(&pdev->dev); + if (mcd->need_runtime_pm) + pm_runtime_put(&pdev->dev); return r; @@ -618,7 +621,8 @@ free_base: if (mcd->shared_io && base) iounmap(base); - pm_runtime_put(&pdev->dev); + if (mcd->need_runtime_pm) + pm_runtime_put(&pdev->dev); return r; } diff --git a/drivers/clk/mediatek/clk-mtk.h b/drivers/clk/mediatek/clk-mtk.h index 22096501a60a..c17fe1c2d732 100644 --- a/drivers/clk/mediatek/clk-mtk.h +++ b/drivers/clk/mediatek/clk-mtk.h @@ -237,6 +237,8 @@ struct mtk_clk_desc { int (*clk_notifier_func)(struct device *dev, struct clk *clk); unsigned int mfg_clk_idx; + + bool need_runtime_pm; }; int mtk_clk_pdev_probe(struct platform_device *pdev); diff --git a/drivers/clk/qcom/apss-ipq-pll.c b/drivers/clk/qcom/apss-ipq-pll.c index 5f7f537e4ecb..e8632db2c542 100644 --- a/drivers/clk/qcom/apss-ipq-pll.c +++ b/drivers/clk/qcom/apss-ipq-pll.c @@ -70,7 +70,6 @@ static struct clk_alpha_pll ipq_pll_stromer_plus = { static const struct alpha_pll_config ipq5018_pll_config = { .l = 0x2a, .config_ctl_val = 0x4001075b, - .config_ctl_hi_val = 0x304, .main_output_mask = BIT(0), .aux_output_mask = BIT(1), .early_output_mask = BIT(3), @@ -84,7 +83,6 @@ static const struct alpha_pll_config ipq5018_pll_config = { static const struct alpha_pll_config ipq5332_pll_config = { .l = 0x2d, .config_ctl_val = 0x4001075b, - .config_ctl_hi_val = 0x304, .main_output_mask = BIT(0), .aux_output_mask = BIT(1), .early_output_mask = BIT(3), diff --git a/drivers/clk/qcom/clk-alpha-pll.c 
b/drivers/clk/qcom/clk-alpha-pll.c index d4227909d1fe..c51647e37df8 100644 --- a/drivers/clk/qcom/clk-alpha-pll.c +++ b/drivers/clk/qcom/clk-alpha-pll.c @@ -2574,6 +2574,9 @@ static int clk_alpha_pll_stromer_plus_set_rate(struct clk_hw *hw, regmap_write(pll->clkr.regmap, PLL_ALPHA_VAL_U(pll), a >> ALPHA_BITWIDTH); + regmap_update_bits(pll->clkr.regmap, PLL_USER_CTL(pll), + PLL_ALPHA_EN, PLL_ALPHA_EN); + regmap_write(pll->clkr.regmap, PLL_MODE(pll), PLL_BYPASSNL); /* Wait five micro seconds or more */ diff --git a/drivers/clk/qcom/gcc-ipq9574.c b/drivers/clk/qcom/gcc-ipq9574.c index 0a3f846695b8..f8b9a1e93bef 100644 --- a/drivers/clk/qcom/gcc-ipq9574.c +++ b/drivers/clk/qcom/gcc-ipq9574.c @@ -2140,9 +2140,10 @@ static struct clk_rcg2 pcnoc_bfdcd_clk_src = { static struct clk_branch gcc_crypto_axi_clk = { .halt_reg = 0x16010, + .halt_check = BRANCH_HALT_VOTED, .clkr = { - .enable_reg = 0x16010, - .enable_mask = BIT(0), + .enable_reg = 0xb004, + .enable_mask = BIT(15), .hw.init = &(const struct clk_init_data) { .name = "gcc_crypto_axi_clk", .parent_hws = (const struct clk_hw *[]) { @@ -2156,9 +2157,10 @@ static struct clk_branch gcc_crypto_axi_clk = { static struct clk_branch gcc_crypto_ahb_clk = { .halt_reg = 0x16014, + .halt_check = BRANCH_HALT_VOTED, .clkr = { - .enable_reg = 0x16014, - .enable_mask = BIT(0), + .enable_reg = 0xb004, + .enable_mask = BIT(16), .hw.init = &(const struct clk_init_data) { .name = "gcc_crypto_ahb_clk", .parent_hws = (const struct clk_hw *[]) { diff --git a/drivers/clk/qcom/gcc-sm6350.c b/drivers/clk/qcom/gcc-sm6350.c index cf4a7b6e0b23..0559a33faf00 100644 --- a/drivers/clk/qcom/gcc-sm6350.c +++ b/drivers/clk/qcom/gcc-sm6350.c @@ -100,8 +100,8 @@ static struct clk_alpha_pll gpll6 = { .enable_mask = BIT(6), .hw.init = &(struct clk_init_data){ .name = "gpll6", - .parent_hws = (const struct clk_hw*[]){ - &gpll0.clkr.hw, + .parent_data = &(const struct clk_parent_data){ + .fw_name = "bi_tcxo", }, .num_parents = 1, .ops = &clk_alpha_pll_fixed_fabia_ops, @@ -124,7 +124,7 @@ static struct clk_alpha_pll_postdiv gpll6_out_even = { .clkr.hw.init = &(struct clk_init_data){ .name = "gpll6_out_even", .parent_hws = (const struct clk_hw*[]){ - &gpll0.clkr.hw, + &gpll6.clkr.hw, }, .num_parents = 1, .ops = &clk_alpha_pll_postdiv_fabia_ops, @@ -139,8 +139,8 @@ static struct clk_alpha_pll gpll7 = { .enable_mask = BIT(7), .hw.init = &(struct clk_init_data){ .name = "gpll7", - .parent_hws = (const struct clk_hw*[]){ - &gpll0.clkr.hw, + .parent_data = &(const struct clk_parent_data){ + .fw_name = "bi_tcxo", }, .num_parents = 1, .ops = &clk_alpha_pll_fixed_fabia_ops, diff --git a/drivers/clk/sunxi-ng/ccu_common.c b/drivers/clk/sunxi-ng/ccu_common.c index ac0091b4ce24..be375ce0149c 100644 --- a/drivers/clk/sunxi-ng/ccu_common.c +++ b/drivers/clk/sunxi-ng/ccu_common.c @@ -132,7 +132,6 @@ static int sunxi_ccu_probe(struct sunxi_ccu *ccu, struct device *dev, for (i = 0; i < desc->hw_clks->num ; i++) { struct clk_hw *hw = desc->hw_clks->hws[i]; - struct ccu_common *common = hw_to_ccu_common(hw); const char *name; if (!hw) @@ -147,14 +146,21 @@ static int sunxi_ccu_probe(struct sunxi_ccu *ccu, struct device *dev, pr_err("Couldn't register clock %d - %s\n", i, name); goto err_clk_unreg; } + } + + for (i = 0; i < desc->num_ccu_clks; i++) { + struct ccu_common *cclk = desc->ccu_clks[i]; + + if (!cclk) + continue; - if (common->max_rate) - clk_hw_set_rate_range(hw, common->min_rate, - common->max_rate); + if (cclk->max_rate) + clk_hw_set_rate_range(&cclk->hw, cclk->min_rate, + cclk->max_rate); 
else - WARN(common->min_rate, + WARN(cclk->min_rate, "No max_rate, ignoring min_rate of clock %d - %s\n", - i, name); + i, clk_hw_get_name(&cclk->hw)); } ret = of_clk_add_hw_provider(node, of_clk_hw_onecell_get, diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 34faa0320ece..95dd4660b5b6 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -134,6 +134,16 @@ config RDA_TIMER help Enables the support for the RDA Micro timer driver. +config REALTEK_OTTO_TIMER + bool "Clocksource/timer for the Realtek Otto platform" if COMPILE_TEST + select TIMER_OF + help + This driver adds support for the timers found in the Realtek RTL83xx + and RTL93xx SoCs series. This includes chips such as RTL8380, RTL8381 + and RTL832, as well as chips from the RTL839x series, such as RTL8390 + RT8391, RTL8392, RTL8393 and RTL8396 and chips of the RTL930x series + such as RTL9301, RTL9302 or RTL9303. + config SUN4I_TIMER bool "Sun4i timer driver" if COMPILE_TEST depends on HAS_IOMEM diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 4bb856e4df55..22743785299e 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_MILBEAUT_TIMER) += timer-milbeaut.o obj-$(CONFIG_SPRD_TIMER) += timer-sprd.o obj-$(CONFIG_NPCM7XX_TIMER) += timer-npcm7xx.o obj-$(CONFIG_RDA_TIMER) += timer-rda.o +obj-$(CONFIG_REALTEK_OTTO_TIMER) += timer-rtl-otto.o obj-$(CONFIG_ARC_TIMERS) += arc_timer.o obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer.o diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 5bb43cc1a8df..aeafc74181f0 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -1556,7 +1556,7 @@ static int __init arch_timer_mem_frame_register(struct arch_timer_mem_frame *frame) { void __iomem *base; - int ret, irq = 0; + int ret, irq; if (arch_timer_mem_use_virtual) irq = frame->virt_irq; diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index ab1c8c2b66b8..a05cfaab5f84 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -343,7 +343,7 @@ static int __init global_timer_of_register(struct device_node *np) { struct clk *gt_clk; static unsigned long gt_clk_rate; - int err = 0; + int err; /* * In A9 r2p0 the comparators for each processor with the global timer diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c index b3ae38f36720..110347707ff9 100644 --- a/drivers/clocksource/mips-gic-timer.c +++ b/drivers/clocksource/mips-gic-timer.c @@ -19,6 +19,7 @@ static DEFINE_PER_CPU(struct clock_event_device, gic_clockevent_device); static int gic_timer_irq; static unsigned int gic_frequency; +static unsigned int gic_count_width; static bool __read_mostly gic_clock_unstable; static void gic_clocksource_unstable(char *reason); @@ -186,18 +187,21 @@ static void gic_clocksource_unstable(char *reason) static int __init __gic_clocksource_init(void) { - unsigned int count_width; int ret; /* Set clocksource mask. 
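As a rough model of the counter-width and rating calculations that follow (illustrative only, not part of the patch; it assumes the COUNTBITS field occupies bits 27:24 of the GIC config register and shows the stable-clock case where the base rating is 300):

/* Illustrative sketch, not part of the patch. */
#include <stdint.h>
#include <stdio.h>

/* Assumption: the COUNTBITS field sits in bits 27:24 of the GIC config. */
#define COUNTBITS_SHIFT 24
#define COUNTBITS_MASK  (0xFu << COUNTBITS_SHIFT)

int main(void)
{
    uint32_t gic_config = 0x1u << COUNTBITS_SHIFT;  /* sample register value */
    unsigned int width = ((gic_config & COUNTBITS_MASK) >> COUNTBITS_SHIFT) * 4 + 32;
    uint64_t mask = (width >= 64) ? ~0ULL : ((1ULL << width) - 1);
    unsigned int freq = 50000000;                   /* sample 50 MHz counter */
    unsigned int boost = freq / 10000000;           /* clamp(freq / 10 MHz, 0, 99) */
    unsigned int rating;

    if (boost > 99)
        boost = 99;
    rating = 300 + boost;                           /* stable-clock base rating */

    printf("count width: %u bits, mask: 0x%llx, rating: %u\n",
           width, (unsigned long long)mask, rating);
    return 0;
}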
*/ - count_width = read_gic_config() & GIC_CONFIG_COUNTBITS; - count_width >>= __ffs(GIC_CONFIG_COUNTBITS); - count_width *= 4; - count_width += 32; - gic_clocksource.mask = CLOCKSOURCE_MASK(count_width); + gic_count_width = read_gic_config() & GIC_CONFIG_COUNTBITS; + gic_count_width >>= __ffs(GIC_CONFIG_COUNTBITS); + gic_count_width *= 4; + gic_count_width += 32; + gic_clocksource.mask = CLOCKSOURCE_MASK(gic_count_width); /* Calculate a somewhat reasonable rating value. */ - gic_clocksource.rating = 200 + gic_frequency / 10000000; + if (mips_cm_revision() >= CM_REV_CM3 || !IS_ENABLED(CONFIG_CPU_FREQ)) + gic_clocksource.rating = 300; /* Good when frequecy is stable */ + else + gic_clocksource.rating = 200; + gic_clocksource.rating += clamp(gic_frequency / 10000000, 0, 99); ret = clocksource_register_hz(&gic_clocksource, gic_frequency); if (ret < 0) @@ -260,7 +264,7 @@ static int __init gic_clocksource_of_init(struct device_node *node) if (mips_cm_revision() >= CM_REV_CM3 || !IS_ENABLED(CONFIG_CPU_FREQ)) { sched_clock_register(mips_cm_is64 ? gic_read_count_64 : gic_read_count_2x32, - 64, gic_frequency); + gic_count_width, gic_frequency); } return 0; diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 26919556ef5f..b72b36e0abed 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -528,6 +528,7 @@ static void sh_cmt_set_next(struct sh_cmt_channel *ch, unsigned long delta) static irqreturn_t sh_cmt_interrupt(int irq, void *dev_id) { struct sh_cmt_channel *ch = dev_id; + unsigned long flags; /* clear flags */ sh_cmt_write_cmcsr(ch, sh_cmt_read_cmcsr(ch) & @@ -558,6 +559,8 @@ static irqreturn_t sh_cmt_interrupt(int irq, void *dev_id) ch->flags &= ~FLAG_SKIPEVENT; + raw_spin_lock_irqsave(&ch->lock, flags); + if (ch->flags & FLAG_REPROGRAM) { ch->flags &= ~FLAG_REPROGRAM; sh_cmt_clock_event_program_verify(ch, 1); @@ -570,6 +573,8 @@ static irqreturn_t sh_cmt_interrupt(int irq, void *dev_id) ch->flags &= ~FLAG_IRQCONTEXT; + raw_spin_unlock_irqrestore(&ch->lock, flags); + return IRQ_HANDLED; } @@ -780,12 +785,18 @@ static int sh_cmt_clock_event_next(unsigned long delta, struct clock_event_device *ced) { struct sh_cmt_channel *ch = ced_to_sh_cmt(ced); + unsigned long flags; BUG_ON(!clockevent_state_oneshot(ced)); + + raw_spin_lock_irqsave(&ch->lock, flags); + if (likely(ch->flags & FLAG_IRQCONTEXT)) ch->next_match_value = delta - 1; else - sh_cmt_set_next(ch, delta - 1); + __sh_cmt_set_next(ch, delta - 1); + + raw_spin_unlock_irqrestore(&ch->lock, flags); return 0; } diff --git a/drivers/clocksource/timer-rtl-otto.c b/drivers/clocksource/timer-rtl-otto.c new file mode 100644 index 000000000000..8a3068b36e75 --- /dev/null +++ b/drivers/clocksource/timer-rtl-otto.c @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/clk.h> +#include <linux/clockchips.h> +#include <linux/cpu.h> +#include <linux/cpuhotplug.h> +#include <linux/cpumask.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/jiffies.h> +#include <linux/printk.h> +#include <linux/sched_clock.h> +#include "timer-of.h" + +#define RTTM_DATA 0x0 +#define RTTM_CNT 0x4 +#define RTTM_CTRL 0x8 +#define RTTM_INT 0xc + +#define RTTM_CTRL_ENABLE BIT(28) +#define RTTM_INT_PENDING BIT(16) +#define RTTM_INT_ENABLE BIT(20) + +/* + * The Otto platform provides multiple 28 bit timers/counters with the following + * operating logic. If enabled the timer counts up. 
Per timer one can set a + * maximum counter value as an end marker. If end marker is reached the timer + * fires an interrupt. If the timer "overflows" by reaching the end marker or + * by adding 1 to 0x0fffffff the counter is reset to 0. When this happens and + * the timer is in operating mode COUNTER it stops. In mode TIMER it will + * continue to count up. + */ +#define RTTM_CTRL_COUNTER 0 +#define RTTM_CTRL_TIMER BIT(24) + +#define RTTM_BIT_COUNT 28 +#define RTTM_MIN_DELTA 8 +#define RTTM_MAX_DELTA CLOCKSOURCE_MASK(28) + +/* + * Timers are derived from the LXB clock frequency. Usually this is a fixed + * multiple of the 25 MHz oscillator. The 930X SOC is an exception from that. + * Its LXB clock has only dividers and uses the switch PLL of 2.45 GHz as its + * base. The only meaningful frequencies we can achieve from that are 175.000 + * MHz and 153.125 MHz. The greatest common divisor of all explained possible + * speeds is 3125000. Pin the timers to this 3.125 MHz reference frequency. + */ +#define RTTM_TICKS_PER_SEC 3125000 + +struct rttm_cs { + struct timer_of to; + struct clocksource cs; +}; + +/* Simple internal register functions */ +static inline void rttm_set_counter(void __iomem *base, unsigned int counter) +{ + iowrite32(counter, base + RTTM_CNT); +} + +static inline unsigned int rttm_get_counter(void __iomem *base) +{ + return ioread32(base + RTTM_CNT); +} + +static inline void rttm_set_period(void __iomem *base, unsigned int period) +{ + iowrite32(period, base + RTTM_DATA); +} + +static inline void rttm_disable_timer(void __iomem *base) +{ + iowrite32(0, base + RTTM_CTRL); +} + +static inline void rttm_enable_timer(void __iomem *base, u32 mode, u32 divisor) +{ + iowrite32(RTTM_CTRL_ENABLE | mode | divisor, base + RTTM_CTRL); +} + +static inline void rttm_ack_irq(void __iomem *base) +{ + iowrite32(ioread32(base + RTTM_INT) | RTTM_INT_PENDING, base + RTTM_INT); +} + +static inline void rttm_enable_irq(void __iomem *base) +{ + iowrite32(RTTM_INT_ENABLE, base + RTTM_INT); +} + +static inline void rttm_disable_irq(void __iomem *base) +{ + iowrite32(0, base + RTTM_INT); +} + +/* Aggregated control functions for kernel clock framework */ +#define RTTM_DEBUG(base) \ + pr_debug("------------- %d %p\n", \ + smp_processor_id(), base) + +static irqreturn_t rttm_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *clkevt = dev_id; + struct timer_of *to = to_timer_of(clkevt); + + rttm_ack_irq(to->of_base.base); + RTTM_DEBUG(to->of_base.base); + clkevt->event_handler(clkevt); + + return IRQ_HANDLED; +} + +static void rttm_stop_timer(void __iomem *base) +{ + rttm_disable_timer(base); + rttm_ack_irq(base); +} + +static void rttm_start_timer(struct timer_of *to, u32 mode) +{ + rttm_set_counter(to->of_base.base, 0); + rttm_enable_timer(to->of_base.base, mode, to->of_clk.rate / RTTM_TICKS_PER_SEC); +} + +static int rttm_next_event(unsigned long delta, struct clock_event_device *clkevt) +{ + struct timer_of *to = to_timer_of(clkevt); + + RTTM_DEBUG(to->of_base.base); + rttm_stop_timer(to->of_base.base); + rttm_set_period(to->of_base.base, delta); + rttm_start_timer(to, RTTM_CTRL_COUNTER); + + return 0; +} + +static int rttm_state_oneshot(struct clock_event_device *clkevt) +{ + struct timer_of *to = to_timer_of(clkevt); + + RTTM_DEBUG(to->of_base.base); + rttm_stop_timer(to->of_base.base); + rttm_set_period(to->of_base.base, RTTM_TICKS_PER_SEC / HZ); + rttm_start_timer(to, RTTM_CTRL_COUNTER); + + return 0; +} + +static int rttm_state_periodic(struct clock_event_device *clkevt) 
+{ + struct timer_of *to = to_timer_of(clkevt); + + RTTM_DEBUG(to->of_base.base); + rttm_stop_timer(to->of_base.base); + rttm_set_period(to->of_base.base, RTTM_TICKS_PER_SEC / HZ); + rttm_start_timer(to, RTTM_CTRL_TIMER); + + return 0; +} + +static int rttm_state_shutdown(struct clock_event_device *clkevt) +{ + struct timer_of *to = to_timer_of(clkevt); + + RTTM_DEBUG(to->of_base.base); + rttm_stop_timer(to->of_base.base); + + return 0; +} + +static void rttm_setup_timer(void __iomem *base) +{ + RTTM_DEBUG(base); + rttm_stop_timer(base); + rttm_set_period(base, 0); +} + +static u64 rttm_read_clocksource(struct clocksource *cs) +{ + struct rttm_cs *rcs = container_of(cs, struct rttm_cs, cs); + + return rttm_get_counter(rcs->to.of_base.base); +} + +/* Module initialization part. */ +static DEFINE_PER_CPU(struct timer_of, rttm_to) = { + .flags = TIMER_OF_BASE | TIMER_OF_CLOCK | TIMER_OF_IRQ, + .of_irq = { + .flags = IRQF_PERCPU | IRQF_TIMER, + .handler = rttm_timer_interrupt, + }, + .clkevt = { + .rating = 400, + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_state_periodic = rttm_state_periodic, + .set_state_shutdown = rttm_state_shutdown, + .set_state_oneshot = rttm_state_oneshot, + .set_next_event = rttm_next_event + }, +}; + +static int rttm_enable_clocksource(struct clocksource *cs) +{ + struct rttm_cs *rcs = container_of(cs, struct rttm_cs, cs); + + rttm_disable_irq(rcs->to.of_base.base); + rttm_setup_timer(rcs->to.of_base.base); + rttm_enable_timer(rcs->to.of_base.base, RTTM_CTRL_TIMER, + rcs->to.of_clk.rate / RTTM_TICKS_PER_SEC); + + return 0; +} + +struct rttm_cs rttm_cs = { + .to = { + .flags = TIMER_OF_BASE | TIMER_OF_CLOCK, + }, + .cs = { + .name = "realtek_otto_timer", + .rating = 400, + .mask = CLOCKSOURCE_MASK(RTTM_BIT_COUNT), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .read = rttm_read_clocksource, + } +}; + +static u64 notrace rttm_read_clock(void) +{ + return rttm_get_counter(rttm_cs.to.of_base.base); +} + +static int rttm_cpu_starting(unsigned int cpu) +{ + struct timer_of *to = per_cpu_ptr(&rttm_to, cpu); + + RTTM_DEBUG(to->of_base.base); + to->clkevt.cpumask = cpumask_of(cpu); + irq_force_affinity(to->of_irq.irq, to->clkevt.cpumask); + clockevents_config_and_register(&to->clkevt, RTTM_TICKS_PER_SEC, + RTTM_MIN_DELTA, RTTM_MAX_DELTA); + rttm_enable_irq(to->of_base.base); + + return 0; +} + +static int __init rttm_probe(struct device_node *np) +{ + unsigned int cpu, cpu_rollback; + struct timer_of *to; + unsigned int clkidx = num_possible_cpus(); + + /* Use the first n timers as per CPU clock event generators */ + for_each_possible_cpu(cpu) { + to = per_cpu_ptr(&rttm_to, cpu); + to->of_irq.index = to->of_base.index = cpu; + if (timer_of_init(np, to)) { + pr_err("setup of timer %d failed\n", cpu); + goto rollback; + } + rttm_setup_timer(to->of_base.base); + } + + /* Activate the n'th + 1 timer as a stable CPU clocksource. 
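The 3.125 MHz reference frequency discussed at the top of this driver can be checked directly: it is the greatest common divisor of the achievable LXB rates, and the divisor programmed into the control register is simply the clock rate divided by RTTM_TICKS_PER_SEC. A small sketch (illustrative only, not part of the patch; 200 MHz stands in for an arbitrary 25 MHz multiple):

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>

static unsigned long gcd(unsigned long a, unsigned long b)
{
    while (b) {
        unsigned long t = a % b;

        a = b;
        b = t;
    }
    return a;
}

int main(void)
{
    /* Sample LXB rates: one 25 MHz multiple plus the two 930X-only rates. */
    unsigned long rates[] = { 200000000, 175000000, 153125000 };
    unsigned long ref = rates[0];
    unsigned int i;

    for (i = 1; i < sizeof(rates) / sizeof(rates[0]); i++)
        ref = gcd(ref, rates[i]);

    printf("common reference: %lu Hz\n", ref);          /* 3125000 */
    for (i = 0; i < sizeof(rates) / sizeof(rates[0]); i++)
        printf("lxb %lu Hz -> divisor %lu\n", rates[i], rates[i] / ref);
    return 0;
}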
*/ + to = &rttm_cs.to; + to->of_base.index = clkidx; + timer_of_init(np, to); + if (rttm_cs.to.of_base.base && rttm_cs.to.of_clk.rate) { + rttm_enable_clocksource(&rttm_cs.cs); + clocksource_register_hz(&rttm_cs.cs, RTTM_TICKS_PER_SEC); + sched_clock_register(rttm_read_clock, RTTM_BIT_COUNT, RTTM_TICKS_PER_SEC); + } else + pr_err(" setup of timer %d as clocksource failed", clkidx); + + return cpuhp_setup_state(CPUHP_AP_REALTEK_TIMER_STARTING, + "timer/realtek:online", + rttm_cpu_starting, NULL); +rollback: + pr_err("timer registration failed\n"); + for_each_possible_cpu(cpu_rollback) { + if (cpu_rollback == cpu) + break; + to = per_cpu_ptr(&rttm_to, cpu_rollback); + timer_of_cleanup(to); + } + + return -EINVAL; +} + +TIMER_OF_DECLARE(otto_timer, "realtek,otto-timer", rttm_probe); diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 37f1cdf46d29..4ac3a35dcd98 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -890,8 +890,10 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) pr_warn(FW_WARN "P-state 0 is not max freq\n"); - if (acpi_cpufreq_driver.set_boost) + if (acpi_cpufreq_driver.set_boost) { set_boost(policy, acpi_cpufreq_driver.boost_enabled); + policy->boost_enabled = acpi_cpufreq_driver.boost_enabled; + } return result; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a45aac17c20f..9e5060b27864 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1431,7 +1431,8 @@ static int cpufreq_online(unsigned int cpu) } /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ - policy->boost_enabled = cpufreq_boost_enabled() && policy_has_boost_freq(policy); + if (cpufreq_boost_enabled() && policy_has_boost_freq(policy)) + policy->boost_enabled = true; /* * The initialization has succeeded and the policy is online. diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 784843fa2a22..3df10517a327 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -52,6 +52,14 @@ int devm_cxl_add_passthrough_decoder(struct cxl_port *port) struct cxl_dport *dport = NULL; int single_port_map[1]; unsigned long index; + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + + /* + * Capability checks are moot for passthrough decoders, support + * any and all possibilities. 
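In this scheme, bit n of iw_cap_mask means that n-way interleave is supported, so the all-ones value used for passthrough decoders permits every way, while parse_hdm_decoder_caps() below builds the mask from the decoder's capability register. A standalone sketch of how those capability bits become a set of supported ways (illustrative only, not part of the patch; the two capability flags are sample inputs):

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>

int main(void)
{
    /*
     * Bit n set in iw_cap_mask means "n-way interleave supported".
     * Every decoder supports 1/2/4/8 way; 3/6/12 and 16 way are only
     * added when the matching capability bits are present.
     */
    unsigned long iw_cap_mask = (1UL << 1) | (1UL << 2) | (1UL << 4) | (1UL << 8);
    int have_3_6_12_way = 1;    /* sample capability flags */
    int have_16_way = 0;
    int iw;

    if (have_3_6_12_way)
        iw_cap_mask |= (1UL << 3) | (1UL << 6) | (1UL << 12);
    if (have_16_way)
        iw_cap_mask |= 1UL << 16;

    printf("supported interleave ways:");
    for (iw = 1; iw <= 16; iw++)
        if (iw_cap_mask & (1UL << iw))
            printf(" %d", iw);
    printf("\n");
    return 0;
}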
+ */ + cxlhdm->interleave_mask = ~0U; + cxlhdm->iw_cap_mask = ~0UL; cxlsd = cxl_switch_decoder_alloc(port, 1); if (IS_ERR(cxlsd)) @@ -79,6 +87,11 @@ static void parse_hdm_decoder_caps(struct cxl_hdm *cxlhdm) cxlhdm->interleave_mask |= GENMASK(11, 8); if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_14_12, hdm_cap)) cxlhdm->interleave_mask |= GENMASK(14, 12); + cxlhdm->iw_cap_mask = BIT(1) | BIT(2) | BIT(4) | BIT(8); + if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY, hdm_cap)) + cxlhdm->iw_cap_mask |= BIT(3) | BIT(6) | BIT(12); + if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_16_WAY, hdm_cap)) + cxlhdm->iw_cap_mask |= BIT(16); } static bool should_emulate_decoders(struct cxl_endpoint_dvsec_info *info) diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index e69625a8d6a1..c00f3a933164 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -62,10 +62,14 @@ static int match_nvdimm_bridge(struct device *dev, void *data) return is_cxl_nvdimm_bridge(dev); } -struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_memdev *cxlmd) +/** + * cxl_find_nvdimm_bridge() - find a bridge device relative to a port + * @port: any descendant port of an nvdimm-bridge associated + * root-cxl-port + */ +struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_port *port) { - struct cxl_root *cxl_root __free(put_cxl_root) = - find_cxl_root(cxlmd->endpoint); + struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); struct device *dev; if (!cxl_root) @@ -242,18 +246,20 @@ static void cxlmd_release_nvdimm(void *_cxlmd) /** * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm + * @parent_port: parent port for the (to be added) @cxlmd endpoint port * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations * * Return: 0 on success negative error code on failure. 
*/ -int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd) +int devm_cxl_add_nvdimm(struct cxl_port *parent_port, + struct cxl_memdev *cxlmd) { struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_nvdimm *cxl_nvd; struct device *dev; int rc; - cxl_nvb = cxl_find_nvdimm_bridge(cxlmd); + cxl_nvb = cxl_find_nvdimm_bridge(parent_port); if (!cxl_nvb) return -ENODEV; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 3c2b6144be23..538ebd5a64fd 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1101,6 +1101,26 @@ static int cxl_port_attach_region(struct cxl_port *port, } cxld = cxl_rr->decoder; + /* + * the number of targets should not exceed the target_count + * of the decoder + */ + if (is_switch_decoder(&cxld->dev)) { + struct cxl_switch_decoder *cxlsd; + + cxlsd = to_cxl_switch_decoder(&cxld->dev); + if (cxl_rr->nr_targets > cxlsd->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s %s add: %s:%s @ %d overflows targets: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxld->dev), dev_name(&cxlmd->dev), + dev_name(&cxled->cxld.dev), pos, + cxlsd->nr_targets); + rc = -ENXIO; + goto out_erase; + } + } + rc = cxl_rr_ep_add(cxl_rr, cxled); if (rc) { dev_dbg(&cxlr->dev, @@ -1210,6 +1230,50 @@ static int check_last_peer(struct cxl_endpoint_decoder *cxled, return 0; } +static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + unsigned int interleave_mask; + u8 eiw; + u16 eig; + int high_pos, low_pos; + + if (!test_bit(iw, &cxlhdm->iw_cap_mask)) + return -ENXIO; + /* + * Per CXL specification r3.1(8.2.4.20.13 Decoder Protection), + * if eiw < 8: + * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + 8 + eiw] + * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0] + * + * when the eiw is 0, all the bits of HPAOFFSET[51: 0] are used, the + * interleave bits are none. + * + * if eiw >= 8: + * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + eiw] / 3 + * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0] + * + * when the eiw is 8, all the bits of HPAOFFSET[51: 0] are used, the + * interleave bits are none. 
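The formulas above reduce to a contiguous span of host physical address bits, GENMASK(high_pos, low_pos), which must fall within the decoder's interleave capability. Below is a standalone sketch of that calculation (illustrative only, not part of the patch). The eiw/eig encodings follow the CXL specification: log2 of the way count for power-of-two ways, 8/9/10 for 3/6/12 ways, and eig equal to log2 of the granularity minus 8; the helpers here are simplified stand-ins for the real ways_to_eiw()/granularity_to_eig().

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>

/* Simplified stand-in: 0..4 encode 1/2/4/8/16 way, 8/9/10 encode 3/6/12 way. */
static int ways_to_eiw(int iw)
{
    switch (iw) {
    case 1: return 0;
    case 2: return 1;
    case 4: return 2;
    case 8: return 3;
    case 16: return 4;
    case 3: return 8;
    case 6: return 9;
    case 12: return 10;
    default: return -1;
    }
}

/* Simplified stand-in: eig = log2(granularity) - 8, granularity >= 256. */
static int granularity_to_eig(int ig)
{
    int eig = 0;

    while ((256 << eig) < ig)
        eig++;
    return eig;
}

int main(void)
{
    int iw = 4, ig = 1024;      /* sample: 4-way at 1 KiB granularity */
    int eiw = ways_to_eiw(iw);
    int eig = granularity_to_eig(ig);
    int high_pos, low_pos;
    unsigned long mask;

    if (eiw == 0 || eiw == 8) {
        printf("no HPA bits select the interleave target\n");
        return 0;
    }

    high_pos = (eiw > 8) ? eiw + eig - 1 : eiw + eig + 7;
    low_pos = eig + 8;
    /* GENMASK(high_pos, low_pos), assuming high_pos < 63 */
    mask = ((1UL << (high_pos + 1)) - 1) & ~((1UL << low_pos) - 1);

    printf("interleave bits HPA[%d:%d], mask 0x%lx\n", high_pos, low_pos, mask);
    return 0;
}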
+ */ + ways_to_eiw(iw, &eiw); + if (eiw == 0 || eiw == 8) + return 0; + + granularity_to_eig(ig, &eig); + if (eiw > 8) + high_pos = eiw + eig - 1; + else + high_pos = eiw + eig + 7; + low_pos = eig + 8; + interleave_mask = GENMASK(high_pos, low_pos); + if (interleave_mask & ~cxlhdm->interleave_mask) + return -ENXIO; + + return 0; +} + static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) @@ -1360,6 +1424,15 @@ static int cxl_port_setup_targets(struct cxl_port *port, return -ENXIO; } } else { + rc = check_interleave_cap(cxld, iw, ig); + if (rc) { + dev_dbg(&cxlr->dev, + "%s:%s iw: %d ig: %d is not supported\n", + dev_name(port->uport_dev), + dev_name(&port->dev), iw, ig); + return rc; + } + cxld->interleave_ways = iw; cxld->interleave_granularity = ig; cxld->hpa_range = (struct range) { @@ -1796,6 +1869,15 @@ static int cxl_region_attach(struct cxl_region *cxlr, struct cxl_dport *dport; int rc = -ENXIO; + rc = check_interleave_cap(&cxled->cxld, p->interleave_ways, + p->interleave_granularity); + if (rc) { + dev_dbg(&cxlr->dev, "%s iw: %d ig: %d is not supported\n", + dev_name(&cxled->cxld.dev), p->interleave_ways, + p->interleave_granularity); + return rc; + } + if (cxled->mode != cxlr->mode) { dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n", dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode); @@ -2688,22 +2770,33 @@ static int __cxl_dpa_to_region(struct device *dev, void *arg) { struct cxl_dpa_to_region_context *ctx = arg; struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; u64 dpa = ctx->dpa; if (!is_endpoint_decoder(dev)) return 0; cxled = to_cxl_endpoint_decoder(dev); - if (!cxled->dpa_res || !resource_size(cxled->dpa_res)) + if (!cxled || !cxled->dpa_res || !resource_size(cxled->dpa_res)) return 0; if (dpa > cxled->dpa_res->end || dpa < cxled->dpa_res->start) return 0; - dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa, - dev_name(&cxled->cxld.region->dev)); + /* + * Stop the region search (return 1) when an endpoint mapping is + * found. The region may not be fully constructed so offering + * the cxlr in the context structure is not guaranteed. + */ + cxlr = cxled->cxld.region; + if (cxlr) + dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa, + dev_name(&cxlr->dev)); + else + dev_dbg(dev, "dpa:0x%llx mapped in endpoint:%s\n", dpa, + dev_name(dev)); - ctx->cxlr = cxled->cxld.region; + ctx->cxlr = cxlr; return 1; } @@ -2847,7 +2940,7 @@ static int cxl_pmem_region_alloc(struct cxl_region *cxlr) * bridge for one device is the same for all. 
*/ if (i == 0) { - cxl_nvb = cxl_find_nvdimm_bridge(cxlmd); + cxl_nvb = cxl_find_nvdimm_bridge(cxlmd->endpoint); if (!cxl_nvb) return -ENODEV; cxlr->cxl_nvb = cxl_nvb; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 603c0120cff8..a6613a6f8923 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -47,6 +47,8 @@ extern const struct nvdimm_security_ops *cxl_security_ops; #define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) #define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) #define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) +#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) +#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) #define CXL_HDM_DECODER_CTRL_OFFSET 0x4 #define CXL_HDM_DECODER_ENABLE BIT(1) #define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) @@ -855,8 +857,8 @@ struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host, struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev); bool is_cxl_nvdimm(struct device *dev); bool is_cxl_nvdimm_bridge(struct device *dev); -int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd); -struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_memdev *cxlmd); +int devm_cxl_add_nvdimm(struct cxl_port *parent_port, struct cxl_memdev *cxlmd); +struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_port *port); #ifdef CONFIG_CXL_REGION bool is_cxl_pmem_region(struct device *dev); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 19aba81cdf13..af8169ccdbc0 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -395,9 +395,9 @@ enum cxl_devtype { /** * struct cxl_dpa_perf - DPA performance property entry - * @dpa_range - range for DPA address - * @coord - QoS performance data (i.e. latency, bandwidth) - * @qos_class - QoS Class cookies + * @dpa_range: range for DPA address + * @coord: QoS performance data (i.e. latency, bandwidth) + * @qos_class: QoS Class cookies */ struct cxl_dpa_perf { struct range dpa_range; @@ -464,13 +464,14 @@ struct cxl_dev_state { * @active_persistent_bytes: sum of hard + soft persistent * @next_volatile_bytes: volatile capacity change pending device reset * @next_persistent_bytes: persistent capacity change pending device reset + * @ram_perf: performance data entry matched to RAM partition + * @pmem_perf: performance data entry matched to PMEM partition * @event: event log driver state * @poison: poison driver state info * @security: security driver state info * @fw: firmware upload / activation state + * @mbox_wait: RCU wait for mbox send completely * @mbox_send: @dev specific transport for transmitting mailbox commands - * @ram_perf: performance data entry matched to RAM partition - * @pmem_perf: performance data entry matched to PMEM partition * * See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for * details on capacity parameters. 
@@ -851,11 +852,21 @@ static inline void cxl_mem_active_dec(void) int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd); +/** + * struct cxl_hdm - HDM Decoder registers and cached / decoded capabilities + * @regs: mapped registers, see devm_cxl_setup_hdm() + * @decoder_count: number of decoders for this port + * @target_count: for switch decoders, max downstream port targets + * @interleave_mask: interleave granularity capability, see check_interleave_cap() + * @iw_cap_mask: bitmask of supported interleave ways, see check_interleave_cap() + * @port: mapped cxl_port, see devm_cxl_setup_hdm() + */ struct cxl_hdm { struct cxl_component_regs regs; unsigned int decoder_count; unsigned int target_count; unsigned int interleave_mask; + unsigned long iw_cap_mask; struct cxl_port *port; }; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 0c79d9ce877c..2f1b49bfe162 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -152,6 +152,15 @@ static int cxl_mem_probe(struct device *dev) return -ENXIO; } + if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) { + rc = devm_cxl_add_nvdimm(parent_port, cxlmd); + if (rc) { + if (rc == -ENODEV) + dev_info(dev, "PMEM disabled by platform\n"); + return rc; + } + } + if (dport->rch) endpoint_parent = parent_port->uport_dev; else @@ -174,14 +183,6 @@ unlock: if (rc) return rc; - if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) { - rc = devm_cxl_add_nvdimm(cxlmd); - if (rc == -ENODEV) - dev_info(dev, "PMEM disabled by platform\n"); - else - return rc; - } - /* * The kernel may be operating out of CXL memory on this device, * there is no spec defined way to determine whether this device diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c index 0d139e4de37c..8a347b938406 100644 --- a/drivers/firmware/cirrus/cs_dsp.c +++ b/drivers/firmware/cirrus/cs_dsp.c @@ -1107,9 +1107,16 @@ struct cs_dsp_coeff_parsed_coeff { int len; }; -static int cs_dsp_coeff_parse_string(int bytes, const u8 **pos, const u8 **str) +static int cs_dsp_coeff_parse_string(int bytes, const u8 **pos, unsigned int avail, + const u8 **str) { - int length; + int length, total_field_len; + + /* String fields are at least one __le32 */ + if (sizeof(__le32) > avail) { + *pos = NULL; + return 0; + } switch (bytes) { case 1: @@ -1122,10 +1129,16 @@ static int cs_dsp_coeff_parse_string(int bytes, const u8 **pos, const u8 **str) return 0; } + total_field_len = ((length + bytes) + 3) & ~0x03; + if ((unsigned int)total_field_len > avail) { + *pos = NULL; + return 0; + } + if (str) *str = *pos + bytes; - *pos += ((length + bytes) + 3) & ~0x03; + *pos += total_field_len; return length; } @@ -1150,71 +1163,134 @@ static int cs_dsp_coeff_parse_int(int bytes, const u8 **pos) return val; } -static inline void cs_dsp_coeff_parse_alg(struct cs_dsp *dsp, const u8 **data, - struct cs_dsp_coeff_parsed_alg *blk) +static int cs_dsp_coeff_parse_alg(struct cs_dsp *dsp, + const struct wmfw_region *region, + struct cs_dsp_coeff_parsed_alg *blk) { const struct wmfw_adsp_alg_data *raw; + unsigned int data_len = le32_to_cpu(region->len); + unsigned int pos; + const u8 *tmp; + + raw = (const struct wmfw_adsp_alg_data *)region->data; switch (dsp->fw_ver) { case 0: case 1: - raw = (const struct wmfw_adsp_alg_data *)*data; - *data = raw->data; + if (sizeof(*raw) > data_len) + return -EOVERFLOW; blk->id = le32_to_cpu(raw->id); blk->name = raw->name; - blk->name_len = strlen(raw->name); + blk->name_len = strnlen(raw->name, ARRAY_SIZE(raw->name)); blk->ncoeff = 
le32_to_cpu(raw->ncoeff); + + pos = sizeof(*raw); break; default: - blk->id = cs_dsp_coeff_parse_int(sizeof(raw->id), data); - blk->name_len = cs_dsp_coeff_parse_string(sizeof(u8), data, + if (sizeof(raw->id) > data_len) + return -EOVERFLOW; + + tmp = region->data; + blk->id = cs_dsp_coeff_parse_int(sizeof(raw->id), &tmp); + pos = tmp - region->data; + + tmp = ®ion->data[pos]; + blk->name_len = cs_dsp_coeff_parse_string(sizeof(u8), &tmp, data_len - pos, &blk->name); - cs_dsp_coeff_parse_string(sizeof(u16), data, NULL); - blk->ncoeff = cs_dsp_coeff_parse_int(sizeof(raw->ncoeff), data); + if (!tmp) + return -EOVERFLOW; + + pos = tmp - region->data; + cs_dsp_coeff_parse_string(sizeof(u16), &tmp, data_len - pos, NULL); + if (!tmp) + return -EOVERFLOW; + + pos = tmp - region->data; + if (sizeof(raw->ncoeff) > (data_len - pos)) + return -EOVERFLOW; + + blk->ncoeff = cs_dsp_coeff_parse_int(sizeof(raw->ncoeff), &tmp); + pos += sizeof(raw->ncoeff); break; } + if ((int)blk->ncoeff < 0) + return -EOVERFLOW; + cs_dsp_dbg(dsp, "Algorithm ID: %#x\n", blk->id); cs_dsp_dbg(dsp, "Algorithm name: %.*s\n", blk->name_len, blk->name); cs_dsp_dbg(dsp, "# of coefficient descriptors: %#x\n", blk->ncoeff); + + return pos; } -static inline void cs_dsp_coeff_parse_coeff(struct cs_dsp *dsp, const u8 **data, - struct cs_dsp_coeff_parsed_coeff *blk) +static int cs_dsp_coeff_parse_coeff(struct cs_dsp *dsp, + const struct wmfw_region *region, + unsigned int pos, + struct cs_dsp_coeff_parsed_coeff *blk) { const struct wmfw_adsp_coeff_data *raw; + unsigned int data_len = le32_to_cpu(region->len); + unsigned int blk_len, blk_end_pos; const u8 *tmp; - int length; + + raw = (const struct wmfw_adsp_coeff_data *)®ion->data[pos]; + if (sizeof(raw->hdr) > (data_len - pos)) + return -EOVERFLOW; + + blk_len = le32_to_cpu(raw->hdr.size); + if (blk_len > S32_MAX) + return -EOVERFLOW; + + if (blk_len > (data_len - pos - sizeof(raw->hdr))) + return -EOVERFLOW; + + blk_end_pos = pos + sizeof(raw->hdr) + blk_len; + + blk->offset = le16_to_cpu(raw->hdr.offset); + blk->mem_type = le16_to_cpu(raw->hdr.type); switch (dsp->fw_ver) { case 0: case 1: - raw = (const struct wmfw_adsp_coeff_data *)*data; - *data = *data + sizeof(raw->hdr) + le32_to_cpu(raw->hdr.size); + if (sizeof(*raw) > (data_len - pos)) + return -EOVERFLOW; - blk->offset = le16_to_cpu(raw->hdr.offset); - blk->mem_type = le16_to_cpu(raw->hdr.type); blk->name = raw->name; - blk->name_len = strlen(raw->name); + blk->name_len = strnlen(raw->name, ARRAY_SIZE(raw->name)); blk->ctl_type = le16_to_cpu(raw->ctl_type); blk->flags = le16_to_cpu(raw->flags); blk->len = le32_to_cpu(raw->len); break; default: - tmp = *data; - blk->offset = cs_dsp_coeff_parse_int(sizeof(raw->hdr.offset), &tmp); - blk->mem_type = cs_dsp_coeff_parse_int(sizeof(raw->hdr.type), &tmp); - length = cs_dsp_coeff_parse_int(sizeof(raw->hdr.size), &tmp); - blk->name_len = cs_dsp_coeff_parse_string(sizeof(u8), &tmp, + pos += sizeof(raw->hdr); + tmp = ®ion->data[pos]; + blk->name_len = cs_dsp_coeff_parse_string(sizeof(u8), &tmp, data_len - pos, &blk->name); - cs_dsp_coeff_parse_string(sizeof(u8), &tmp, NULL); - cs_dsp_coeff_parse_string(sizeof(u16), &tmp, NULL); + if (!tmp) + return -EOVERFLOW; + + pos = tmp - region->data; + cs_dsp_coeff_parse_string(sizeof(u8), &tmp, data_len - pos, NULL); + if (!tmp) + return -EOVERFLOW; + + pos = tmp - region->data; + cs_dsp_coeff_parse_string(sizeof(u16), &tmp, data_len - pos, NULL); + if (!tmp) + return -EOVERFLOW; + + pos = tmp - region->data; + if (sizeof(raw->ctl_type) + 
sizeof(raw->flags) + sizeof(raw->len) > + (data_len - pos)) + return -EOVERFLOW; + blk->ctl_type = cs_dsp_coeff_parse_int(sizeof(raw->ctl_type), &tmp); + pos += sizeof(raw->ctl_type); blk->flags = cs_dsp_coeff_parse_int(sizeof(raw->flags), &tmp); + pos += sizeof(raw->flags); blk->len = cs_dsp_coeff_parse_int(sizeof(raw->len), &tmp); - - *data = *data + sizeof(raw->hdr) + length; break; } @@ -1224,6 +1300,8 @@ static inline void cs_dsp_coeff_parse_coeff(struct cs_dsp *dsp, const u8 **data, cs_dsp_dbg(dsp, "\tCoefficient flags: %#x\n", blk->flags); cs_dsp_dbg(dsp, "\tALSA control type: %#x\n", blk->ctl_type); cs_dsp_dbg(dsp, "\tALSA control len: %#x\n", blk->len); + + return blk_end_pos; } static int cs_dsp_check_coeff_flags(struct cs_dsp *dsp, @@ -1247,12 +1325,16 @@ static int cs_dsp_parse_coeff(struct cs_dsp *dsp, struct cs_dsp_alg_region alg_region = {}; struct cs_dsp_coeff_parsed_alg alg_blk; struct cs_dsp_coeff_parsed_coeff coeff_blk; - const u8 *data = region->data; - int i, ret; + int i, pos, ret; + + pos = cs_dsp_coeff_parse_alg(dsp, region, &alg_blk); + if (pos < 0) + return pos; - cs_dsp_coeff_parse_alg(dsp, &data, &alg_blk); for (i = 0; i < alg_blk.ncoeff; i++) { - cs_dsp_coeff_parse_coeff(dsp, &data, &coeff_blk); + pos = cs_dsp_coeff_parse_coeff(dsp, region, pos, &coeff_blk); + if (pos < 0) + return pos; switch (coeff_blk.ctl_type) { case WMFW_CTL_TYPE_BYTES: @@ -1321,6 +1403,10 @@ static unsigned int cs_dsp_adsp1_parse_sizes(struct cs_dsp *dsp, const struct wmfw_adsp1_sizes *adsp1_sizes; adsp1_sizes = (void *)&firmware->data[pos]; + if (sizeof(*adsp1_sizes) > firmware->size - pos) { + cs_dsp_err(dsp, "%s: file truncated\n", file); + return 0; + } cs_dsp_dbg(dsp, "%s: %d DM, %d PM, %d ZM\n", file, le32_to_cpu(adsp1_sizes->dm), le32_to_cpu(adsp1_sizes->pm), @@ -1337,6 +1423,10 @@ static unsigned int cs_dsp_adsp2_parse_sizes(struct cs_dsp *dsp, const struct wmfw_adsp2_sizes *adsp2_sizes; adsp2_sizes = (void *)&firmware->data[pos]; + if (sizeof(*adsp2_sizes) > firmware->size - pos) { + cs_dsp_err(dsp, "%s: file truncated\n", file); + return 0; + } cs_dsp_dbg(dsp, "%s: %d XM, %d YM %d PM, %d ZM\n", file, le32_to_cpu(adsp2_sizes->xm), le32_to_cpu(adsp2_sizes->ym), @@ -1376,7 +1466,6 @@ static int cs_dsp_load(struct cs_dsp *dsp, const struct firmware *firmware, struct regmap *regmap = dsp->regmap; unsigned int pos = 0; const struct wmfw_header *header; - const struct wmfw_adsp1_sizes *adsp1_sizes; const struct wmfw_footer *footer; const struct wmfw_region *region; const struct cs_dsp_region *mem; @@ -1392,10 +1481,8 @@ static int cs_dsp_load(struct cs_dsp *dsp, const struct firmware *firmware, ret = -EINVAL; - pos = sizeof(*header) + sizeof(*adsp1_sizes) + sizeof(*footer); - if (pos >= firmware->size) { - cs_dsp_err(dsp, "%s: file too short, %zu bytes\n", - file, firmware->size); + if (sizeof(*header) >= firmware->size) { + ret = -EOVERFLOW; goto out_fw; } @@ -1423,22 +1510,36 @@ static int cs_dsp_load(struct cs_dsp *dsp, const struct firmware *firmware, pos = sizeof(*header); pos = dsp->ops->parse_sizes(dsp, file, pos, firmware); + if ((pos == 0) || (sizeof(*footer) > firmware->size - pos)) { + ret = -EOVERFLOW; + goto out_fw; + } footer = (void *)&firmware->data[pos]; pos += sizeof(*footer); if (le32_to_cpu(header->len) != pos) { - cs_dsp_err(dsp, "%s: unexpected header length %d\n", - file, le32_to_cpu(header->len)); + ret = -EOVERFLOW; goto out_fw; } cs_dsp_dbg(dsp, "%s: timestamp %llu\n", file, le64_to_cpu(footer->timestamp)); - while (pos < firmware->size && - sizeof(*region) < 
firmware->size - pos) { + while (pos < firmware->size) { + /* Is there enough data for a complete block header? */ + if (sizeof(*region) > firmware->size - pos) { + ret = -EOVERFLOW; + goto out_fw; + } + region = (void *)&(firmware->data[pos]); + + if (le32_to_cpu(region->len) > firmware->size - pos - sizeof(*region)) { + ret = -EOVERFLOW; + goto out_fw; + } + region_name = "Unknown"; reg = 0; text = NULL; @@ -1495,16 +1596,6 @@ static int cs_dsp_load(struct cs_dsp *dsp, const struct firmware *firmware, regions, le32_to_cpu(region->len), offset, region_name); - if (le32_to_cpu(region->len) > - firmware->size - pos - sizeof(*region)) { - cs_dsp_err(dsp, - "%s.%d: %s region len %d bytes exceeds file length %zu\n", - file, regions, region_name, - le32_to_cpu(region->len), firmware->size); - ret = -EINVAL; - goto out_fw; - } - if (text) { memcpy(text, region->data, le32_to_cpu(region->len)); cs_dsp_info(dsp, "%s: %s\n", file, text); @@ -1555,6 +1646,9 @@ out_fw: cs_dsp_buf_free(&buf_list); kfree(text); + if (ret == -EOVERFLOW) + cs_dsp_err(dsp, "%s: file content overflows file data\n", file); + return ret; } @@ -2122,10 +2216,20 @@ static int cs_dsp_load_coeff(struct cs_dsp *dsp, const struct firmware *firmware pos = le32_to_cpu(hdr->len); blocks = 0; - while (pos < firmware->size && - sizeof(*blk) < firmware->size - pos) { + while (pos < firmware->size) { + /* Is there enough data for a complete block header? */ + if (sizeof(*blk) > firmware->size - pos) { + ret = -EOVERFLOW; + goto out_fw; + } + blk = (void *)(&firmware->data[pos]); + if (le32_to_cpu(blk->len) > firmware->size - pos - sizeof(*blk)) { + ret = -EOVERFLOW; + goto out_fw; + } + type = le16_to_cpu(blk->type); offset = le16_to_cpu(blk->offset); version = le32_to_cpu(blk->ver) >> 8; @@ -2222,17 +2326,6 @@ static int cs_dsp_load_coeff(struct cs_dsp *dsp, const struct firmware *firmware } if (reg) { - if (le32_to_cpu(blk->len) > - firmware->size - pos - sizeof(*blk)) { - cs_dsp_err(dsp, - "%s.%d: %s region len %d bytes exceeds file length %zu\n", - file, blocks, region_name, - le32_to_cpu(blk->len), - firmware->size); - ret = -EINVAL; - goto out_fw; - } - buf = cs_dsp_buf_alloc(blk->data, le32_to_cpu(blk->len), &buf_list); @@ -2272,6 +2365,10 @@ out_fw: regmap_async_complete(regmap); cs_dsp_buf_free(&buf_list); kfree(text); + + if (ret == -EOVERFLOW) + cs_dsp_err(dsp, "%s: file content overflows file data\n", file); + return ret; } diff --git a/drivers/firmware/sysfb.c b/drivers/firmware/sysfb.c index 880ffcb50088..921f61507ae8 100644 --- a/drivers/firmware/sysfb.c +++ b/drivers/firmware/sysfb.c @@ -101,8 +101,10 @@ static __init struct device *sysfb_parent_dev(const struct screen_info *si) if (IS_ERR(pdev)) { return ERR_CAST(pdev); } else if (pdev) { - if (!sysfb_pci_dev_is_enabled(pdev)) + if (!sysfb_pci_dev_is_enabled(pdev)) { + pci_dev_put(pdev); return ERR_PTR(-ENODEV); + } return &pdev->dev; } @@ -137,7 +139,7 @@ static __init int sysfb_init(void) if (compatible) { pd = sysfb_create_simplefb(si, &mode, parent); if (!IS_ERR(pd)) - goto unlock_mutex; + goto put_device; } /* if the FB is incompatible, create a legacy framebuffer device */ @@ -155,7 +157,7 @@ static __init int sysfb_init(void) pd = platform_device_alloc(name, 0); if (!pd) { ret = -ENOMEM; - goto unlock_mutex; + goto put_device; } pd->dev.parent = parent; @@ -170,9 +172,11 @@ static __init int sysfb_init(void) if (ret) goto err; - goto unlock_mutex; + goto put_device; err: platform_device_put(pd); +put_device: + put_device(parent); unlock_mutex: 
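The sysfb fix above is purely about reference counting: the parent device returned by sysfb_parent_dev() carries a reference (and the PCI reference is now dropped when the device turns out to be disabled), so sysfb_init() gains a put_device label that every exit path after the lookup funnels through. A minimal userspace model of the same get/put balance follows; the types and helpers are invented for illustration.

#include <stdio.h>

struct dev { int refs; };

static struct dev *dev_get(struct dev *d) { if (d) d->refs++; return d; }
static void dev_put(struct dev *d)        { if (d) d->refs--; }

/* Whatever happens after the lookup, exactly one dev_put() balances the
 * reference taken by dev_get(), including on the success path. */
static int example_init(struct dev *parent_template, int fail_step)
{
	struct dev *parent = dev_get(parent_template);
	int ret = 0;

	if (fail_step == 1) { ret = -1; goto put_parent; }   /* early error */
	if (fail_step == 2) { ret = -2; goto put_parent; }   /* later error */

put_parent:
	dev_put(parent);
	return ret;
}

int main(void)
{
	struct dev d = { .refs = 1 };

	example_init(&d, 0);
	example_init(&d, 2);
	printf("refs = %d\n", d.refs);   /* stays balanced at 1 */
	return 0;
}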
mutex_unlock(&disable_lock); return ret; diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index 71e1af7c2184..d89e78f0ead3 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -619,8 +619,6 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev, ret = gpiochip_get_ngpios(gc, dev); if (ret) gc->ngpio = gc->bgpio_bits; - else - gc->bgpio_bits = roundup_pow_of_two(round_up(gc->ngpio, 8)); ret = bgpio_setup_io(gc, dat, set, clr, flags); if (ret) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index d75f6ee37028..89d5e64cf68b 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -203,6 +203,24 @@ static void of_gpio_try_fixup_polarity(const struct device_node *np, */ { "qi,lb60", "rb-gpios", true }, #endif +#if IS_ENABLED(CONFIG_PCI_LANTIQ) + /* + * According to the PCI specification, the RST# pin is an + * active-low signal. However, most of the device trees that + * have been widely used for a long time incorrectly describe + * reset GPIO as active-high, and were also using wrong name + * for the property. + */ + { "lantiq,pci-xway", "gpio-reset", false }, +#endif +#if IS_ENABLED(CONFIG_TOUCHSCREEN_TSC2005) + /* + * DTS for Nokia N900 incorrectly specified "active high" + * polarity for the reset line, while the chip actually + * treats it as "active low". + */ + { "ti,tsc2005", "reset-gpios", false }, +#endif }; unsigned int i; @@ -504,9 +522,9 @@ static struct gpio_desc *of_find_gpio_rename(struct device_node *np, { "reset", "reset-n-io", "marvell,nfc-uart" }, { "reset", "reset-n-io", "mrvl,nfc-uart" }, #endif -#if !IS_ENABLED(CONFIG_PCI_LANTIQ) +#if IS_ENABLED(CONFIG_PCI_LANTIQ) /* MIPS Lantiq PCI */ - { "reset", "gpios-reset", "lantiq,pci-xway" }, + { "reset", "gpio-reset", "lantiq,pci-xway" }, #endif /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index ec888fc6ead8..916b6b8cf7d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1093,6 +1093,21 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) unsigned int i; int r; + /* + * We can't use gang submit on with reserved VMIDs when the VM changes + * can't be invalidated by more than one engine at the same time. + */ + if (p->gang_size > 1 && !p->adev->vm_manager.concurrent_flush) { + for (i = 0; i < p->gang_size; ++i) { + struct drm_sched_entity *entity = p->entities[i]; + struct drm_gpu_scheduler *sched = entity->rq->sched; + struct amdgpu_ring *ring = to_amdgpu_ring(sched); + + if (amdgpu_vmid_uses_reserved(vm, ring->vm_hub)) + return -EINVAL; + } + } + r = amdgpu_vm_clear_freed(adev, vm, NULL); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index 3d7fcdeaf8cf..e8f6e4dbc5a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -406,7 +406,7 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring, if (r || !idle) goto error; - if (vm->reserved_vmid[vmhub] || (enforce_isolation && (vmhub == AMDGPU_GFXHUB(0)))) { + if (amdgpu_vmid_uses_reserved(vm, vmhub)) { r = amdgpu_vmid_grab_reserved(vm, ring, job, &id, fence); if (r || !id) goto error; @@ -456,6 +456,19 @@ error: return r; } +/* + * amdgpu_vmid_uses_reserved - check if a VM will use a reserved VMID + * @vm: the VM to check + * @vmhub: the VMHUB which will be used + * + * Returns: True if the VM will use a reserved VMID. 
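The gpiolib-of entries added above extend a quirk table that overrides polarities which long-shipped device trees describe incorrectly. The lookup itself is not part of this hunk; the sketch below is only a guess at the general shape of such a table walk, with made-up type and helper names.

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Illustrative quirk entry: not the kernel's struct, just the same idea. */
struct polarity_quirk {
	const char *compatible;   /* DT compatible string of the node     */
	const char *propname;     /* GPIO property named in the DT        */
	bool active_high;         /* polarity the hardware actually uses  */
};

static const struct polarity_quirk quirks[] = {
	{ "lantiq,pci-xway", "gpio-reset",  false },
	{ "ti,tsc2005",      "reset-gpios", false },
};

/* Return true and set *active_high if a quirk overrides the DT polarity. */
static bool fixup_polarity(const char *compatible, const char *propname,
			   bool *active_high)
{
	for (size_t i = 0; i < sizeof(quirks) / sizeof(quirks[0]); i++) {
		if (!strcmp(quirks[i].compatible, compatible) &&
		    !strcmp(quirks[i].propname, propname)) {
			*active_high = quirks[i].active_high;
			return true;
		}
	}
	return false;
}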
+ */ +bool amdgpu_vmid_uses_reserved(struct amdgpu_vm *vm, unsigned int vmhub) +{ + return vm->reserved_vmid[vmhub] || + (enforce_isolation && (vmhub == AMDGPU_GFXHUB(0))); +} + int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev, unsigned vmhub) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h index fa8c42c83d5d..240fa6751260 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h @@ -78,6 +78,7 @@ void amdgpu_pasid_free_delayed(struct dma_resv *resv, bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev, struct amdgpu_vmid *id); +bool amdgpu_vmid_uses_reserved(struct amdgpu_vm *vm, unsigned int vmhub); int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev, unsigned vmhub); void amdgpu_vmid_free_reserved(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index e9ac20bed0f2..a622aca8c649 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -11181,6 +11181,49 @@ static bool parse_edid_cea(struct amdgpu_dm_connector *aconnector, return ret; } +static void parse_edid_displayid_vrr(struct drm_connector *connector, + struct edid *edid) +{ + u8 *edid_ext = NULL; + int i; + int j = 0; + u16 min_vfreq; + u16 max_vfreq; + + if (edid == NULL || edid->extensions == 0) + return; + + /* Find DisplayID extension */ + for (i = 0; i < edid->extensions; i++) { + edid_ext = (void *)(edid + (i + 1)); + if (edid_ext[0] == DISPLAYID_EXT) + break; + } + + if (edid_ext == NULL) + return; + + while (j < EDID_LENGTH) { + /* Get dynamic video timing range from DisplayID if available */ + if (EDID_LENGTH - j > 13 && edid_ext[j] == 0x25 && + (edid_ext[j+1] & 0xFE) == 0 && (edid_ext[j+2] == 9)) { + min_vfreq = edid_ext[j+9]; + if (edid_ext[j+1] & 7) + max_vfreq = edid_ext[j+10] + ((edid_ext[j+11] & 3) << 8); + else + max_vfreq = edid_ext[j+10]; + + if (max_vfreq && min_vfreq) { + connector->display_info.monitor_range.max_vfreq = max_vfreq; + connector->display_info.monitor_range.min_vfreq = min_vfreq; + + return; + } + } + j++; + } +} + static int parse_amd_vsdb(struct amdgpu_dm_connector *aconnector, struct edid *edid, struct amdgpu_hdmi_vsdb_info *vsdb_info) { @@ -11302,6 +11345,11 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, if (!adev->dm.freesync_module) goto update; + /* Some eDP panels only have the refresh rate range info in DisplayID */ + if ((connector->display_info.monitor_range.min_vfreq == 0 || + connector->display_info.monitor_range.max_vfreq == 0)) + parse_edid_displayid_vrr(connector, edid); + if (edid && (sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT || sink->sink_signal == SIGNAL_TYPE_EDP)) { bool edid_check_required = false; @@ -11309,9 +11357,11 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, if (is_dp_capable_without_timing_msa(adev->dm.dc, amdgpu_dm_connector)) { if (edid->features & DRM_EDID_FEATURE_CONTINUOUS_FREQ) { - freesync_capable = true; amdgpu_dm_connector->min_vfreq = connector->display_info.monitor_range.min_vfreq; amdgpu_dm_connector->max_vfreq = connector->display_info.monitor_range.max_vfreq; + if (amdgpu_dm_connector->max_vfreq - + amdgpu_dm_connector->min_vfreq > 10) + freesync_capable = true; } else { edid_check_required = edid->version > 1 || (edid->version == 1 && diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c 
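parse_edid_displayid_vrr() above scans EDID extension blocks for a DisplayID data block with tag 0x25 and a 9-byte payload, then rebuilds the maximum refresh rate from a low byte plus two high bits when the flags byte says the extended range is valid. The standalone decoder below repeats that arithmetic over a fabricated byte sequence; the sample values are invented purely for illustration.

#include <stdint.h>
#include <stdio.h>

/* Decode min/max vertical refresh from a DisplayID range-limits payload,
 * mirroring the offsets used above: b[0] = tag (0x25), b[1] = flags,
 * b[2] = payload length (9), b[9] = min Hz, b[10] = max Hz low byte,
 * b[11] bits 1:0 = max Hz high bits. */
static void decode_vrr(const uint8_t *b, uint16_t *min_hz, uint16_t *max_hz)
{
	*min_hz = b[9];
	if (b[1] & 7)
		*max_hz = b[10] + ((b[11] & 3) << 8);
	else
		*max_hz = b[10];
}

int main(void)
{
	/* invented example: extended bits valid, min 48 Hz,
	 * max = 0x2C + (0x01 << 8) = 300 Hz */
	const uint8_t blk[12] = { 0x25, 0x01, 0x09, 0, 0, 0, 0, 0, 0,
				  48, 0x2C, 0x01 };
	uint16_t min_hz, max_hz;

	decode_vrr(blk, &min_hz, &max_hz);
	printf("VRR range: %u-%u Hz\n", min_hz, max_hz);  /* 48-300 Hz */
	return 0;
}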
b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c index 6c84b0fa40f4..0782a34689a0 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/display_mode_vba_32.c @@ -3364,6 +3364,9 @@ void dml32_ModeSupportAndSystemConfigurationFull(struct display_mode_lib *mode_l &mode_lib->vba.UrgentBurstFactorLumaPre[k], &mode_lib->vba.UrgentBurstFactorChromaPre[k], &mode_lib->vba.NotUrgentLatencyHidingPre[k]); + + v->cursor_bw_pre[k] = mode_lib->vba.NumberOfCursors[k] * mode_lib->vba.CursorWidth[k][0] * mode_lib->vba.CursorBPP[k][0] / + 8.0 / (mode_lib->vba.HTotal[k] / mode_lib->vba.PixelClock[k]) * v->VRatioPreY[i][j][k]; } { diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c index a41812598ce8..8ecc972dbffd 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c @@ -234,6 +234,7 @@ void dml2_init_socbb_params(struct dml2_context *dml2, const struct dc *in_dc, s out->round_trip_ping_latency_dcfclk_cycles = 106; out->smn_latency_us = 2; out->dispclk_dppclk_vco_speed_mhz = 3600; + out->pct_ideal_dram_bw_after_urgent_pixel_only = 65.0; break; } diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_utils.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_utils.c index 0f8b3336e26d..cbd1c1f26b7a 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_utils.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_utils.c @@ -294,7 +294,7 @@ void dml2_calculate_rq_and_dlg_params(const struct dc *dc, struct dc_state *cont context->bw_ctx.bw.dcn.clk.dcfclk_deep_sleep_khz = (unsigned int)in_ctx->v20.dml_core_ctx.mp.DCFCLKDeepSleep * 1000; context->bw_ctx.bw.dcn.clk.dppclk_khz = 0; - if (in_ctx->v20.dml_core_ctx.ms.support.FCLKChangeSupport[in_ctx->v20.scratch.mode_support_params.out_lowest_state_idx] == dml_fclock_change_unsupported) + if (in_ctx->v20.dml_core_ctx.ms.support.FCLKChangeSupport[0] == dml_fclock_change_unsupported) context->bw_ctx.bw.dcn.clk.fclk_p_state_change_support = false; else context->bw_ctx.bw.dcn.clk.fclk_p_state_change_support = true; diff --git a/drivers/gpu/drm/amd/include/atomfirmware.h b/drivers/gpu/drm/amd/include/atomfirmware.h index 571691837200..09cbc3afd6d8 100644 --- a/drivers/gpu/drm/amd/include/atomfirmware.h +++ b/drivers/gpu/drm/amd/include/atomfirmware.h @@ -734,7 +734,7 @@ struct atom_gpio_pin_lut_v2_1 { struct atom_common_table_header table_header; /*the real number of this included in the structure is calcualted by using the (whole structure size - the header size)/size of atom_gpio_pin_lut */ - struct atom_gpio_pin_assignment gpio_pin[8]; + struct atom_gpio_pin_assignment gpio_pin[]; }; diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511.h b/drivers/gpu/drm/bridge/adv7511/adv7511.h index ea271f62b214..ec0b7f3d889c 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511.h +++ b/drivers/gpu/drm/bridge/adv7511/adv7511.h @@ -401,7 +401,7 @@ struct adv7511 { #ifdef CONFIG_DRM_I2C_ADV7511_CEC int adv7511_cec_init(struct device *dev, struct adv7511 *adv7511); -void adv7511_cec_irq_process(struct adv7511 *adv7511, unsigned int irq1); +int adv7511_cec_irq_process(struct adv7511 *adv7511, unsigned int irq1); #else static inline int adv7511_cec_init(struct device *dev, struct adv7511 *adv7511) { diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_cec.c b/drivers/gpu/drm/bridge/adv7511/adv7511_cec.c index 44451a9658a3..2e9c88a2b5ed 100644 --- 
a/drivers/gpu/drm/bridge/adv7511/adv7511_cec.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_cec.c @@ -119,7 +119,7 @@ static void adv7511_cec_rx(struct adv7511 *adv7511, int rx_buf) cec_received_msg(adv7511->cec_adap, &msg); } -void adv7511_cec_irq_process(struct adv7511 *adv7511, unsigned int irq1) +int adv7511_cec_irq_process(struct adv7511 *adv7511, unsigned int irq1) { unsigned int offset = adv7511->info->reg_cec_offset; const u32 irq_tx_mask = ADV7511_INT1_CEC_TX_READY | @@ -131,16 +131,19 @@ void adv7511_cec_irq_process(struct adv7511 *adv7511, unsigned int irq1) unsigned int rx_status; int rx_order[3] = { -1, -1, -1 }; int i; + int irq_status = IRQ_NONE; - if (irq1 & irq_tx_mask) + if (irq1 & irq_tx_mask) { adv_cec_tx_raw_status(adv7511, irq1); + irq_status = IRQ_HANDLED; + } if (!(irq1 & irq_rx_mask)) - return; + return irq_status; if (regmap_read(adv7511->regmap_cec, ADV7511_REG_CEC_RX_STATUS + offset, &rx_status)) - return; + return irq_status; /* * ADV7511_REG_CEC_RX_STATUS[5:0] contains the reception order of RX @@ -172,6 +175,8 @@ void adv7511_cec_irq_process(struct adv7511 *adv7511, unsigned int irq1) adv7511_cec_rx(adv7511, rx_buf); } + + return IRQ_HANDLED; } static int adv7511_cec_adap_enable(struct cec_adapter *adap, bool enable) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c index 66ccb61e2a66..c8d2c4a157b2 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c @@ -469,6 +469,8 @@ static int adv7511_irq_process(struct adv7511 *adv7511, bool process_hpd) { unsigned int irq0, irq1; int ret; + int cec_status = IRQ_NONE; + int irq_status = IRQ_NONE; ret = regmap_read(adv7511->regmap, ADV7511_REG_INT(0), &irq0); if (ret < 0) @@ -478,29 +480,31 @@ static int adv7511_irq_process(struct adv7511 *adv7511, bool process_hpd) if (ret < 0) return ret; - /* If there is no IRQ to handle, exit indicating no IRQ data */ - if (!(irq0 & (ADV7511_INT0_HPD | ADV7511_INT0_EDID_READY)) && - !(irq1 & ADV7511_INT1_DDC_ERROR)) - return -ENODATA; - regmap_write(adv7511->regmap, ADV7511_REG_INT(0), irq0); regmap_write(adv7511->regmap, ADV7511_REG_INT(1), irq1); - if (process_hpd && irq0 & ADV7511_INT0_HPD && adv7511->bridge.encoder) + if (process_hpd && irq0 & ADV7511_INT0_HPD && adv7511->bridge.encoder) { schedule_work(&adv7511->hpd_work); + irq_status = IRQ_HANDLED; + } if (irq0 & ADV7511_INT0_EDID_READY || irq1 & ADV7511_INT1_DDC_ERROR) { adv7511->edid_read = true; if (adv7511->i2c_main->irq) wake_up_all(&adv7511->wq); + irq_status = IRQ_HANDLED; } #ifdef CONFIG_DRM_I2C_ADV7511_CEC - adv7511_cec_irq_process(adv7511, irq1); + cec_status = adv7511_cec_irq_process(adv7511, irq1); #endif - return 0; + /* If there is no IRQ to handle, exit indicating no IRQ data */ + if (irq_status == IRQ_HANDLED || cec_status == IRQ_HANDLED) + return IRQ_HANDLED; + + return IRQ_NONE; } static irqreturn_t adv7511_irq_handler(int irq, void *devid) @@ -509,7 +513,7 @@ static irqreturn_t adv7511_irq_handler(int irq, void *devid) int ret; ret = adv7511_irq_process(adv7511, true); - return ret < 0 ? IRQ_NONE : IRQ_HANDLED; + return ret < 0 ? 
IRQ_NONE : ret; } /* ----------------------------------------------------------------------------- diff --git a/drivers/gpu/drm/drm_fbdev_generic.c b/drivers/gpu/drm/drm_fbdev_generic.c index 97e579c33d84..1e200d815e1a 100644 --- a/drivers/gpu/drm/drm_fbdev_generic.c +++ b/drivers/gpu/drm/drm_fbdev_generic.c @@ -84,7 +84,8 @@ static int drm_fbdev_generic_helper_fb_probe(struct drm_fb_helper *fb_helper, sizes->surface_width, sizes->surface_height, sizes->surface_bpp); - format = drm_mode_legacy_fb_format(sizes->surface_bpp, sizes->surface_depth); + format = drm_driver_legacy_fb_format(dev, sizes->surface_bpp, + sizes->surface_depth); buffer = drm_client_framebuffer_create(client, sizes->surface_width, sizes->surface_height, format); if (IS_ERR(buffer)) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index 2166208a961d..3860a8ce1e2d 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -420,13 +420,20 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Galaxy Book 10.6"), }, .driver_data = (void *)&lcd1280x1920_rightside_up, - }, { /* Valve Steam Deck */ + }, { /* Valve Steam Deck (Jupiter) */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Valve"), DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Jupiter"), DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "1"), }, .driver_data = (void *)&lcd800x1280_rightside_up, + }, { /* Valve Steam Deck (Galileo) */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Valve"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Galileo"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "1"), + }, + .driver_data = (void *)&lcd800x1280_rightside_up, }, { /* VIOS LTH17 */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "VIOS"), diff --git a/drivers/gpu/drm/gma500/cdv_intel_lvds.c b/drivers/gpu/drm/gma500/cdv_intel_lvds.c index f08a6803dc18..3adc2c9ab72d 100644 --- a/drivers/gpu/drm/gma500/cdv_intel_lvds.c +++ b/drivers/gpu/drm/gma500/cdv_intel_lvds.c @@ -311,6 +311,9 @@ static int cdv_intel_lvds_get_modes(struct drm_connector *connector) if (mode_dev->panel_fixed_mode != NULL) { struct drm_display_mode *mode = drm_mode_duplicate(dev, mode_dev->panel_fixed_mode); + if (!mode) + return 0; + drm_mode_probed_add(connector, mode); return 1; } diff --git a/drivers/gpu/drm/gma500/psb_intel_lvds.c b/drivers/gpu/drm/gma500/psb_intel_lvds.c index 8486de230ec9..8d1be94a443b 100644 --- a/drivers/gpu/drm/gma500/psb_intel_lvds.c +++ b/drivers/gpu/drm/gma500/psb_intel_lvds.c @@ -504,6 +504,9 @@ static int psb_intel_lvds_get_modes(struct drm_connector *connector) if (mode_dev->panel_fixed_mode != NULL) { struct drm_display_mode *mode = drm_mode_duplicate(dev, mode_dev->panel_fixed_mode); + if (!mode) + return 0; + drm_mode_probed_add(connector, mode); return 1; } diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 3c3fc53376ce..6bff169fa8d4 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -2088,6 +2088,9 @@ icl_program_mg_dp_mode(struct intel_digital_port *dig_port, u32 ln0, ln1, pin_assignment; u8 width; + if (DISPLAY_VER(dev_priv) >= 14) + return; + if (!intel_encoder_is_tc(&dig_port->base) || intel_tc_port_in_tbt_alt_mode(dig_port)) return; diff --git a/drivers/gpu/drm/meson/meson_drv.c b/drivers/gpu/drm/meson/meson_drv.c index 17a5cca007e2..4bd0baa2a4f5 100644 --- a/drivers/gpu/drm/meson/meson_drv.c +++ b/drivers/gpu/drm/meson/meson_drv.c @@ -250,29 +250,20 
@@ static int meson_drv_bind_master(struct device *dev, bool has_components) if (ret) goto free_drm; ret = meson_canvas_alloc(priv->canvas, &priv->canvas_id_vd1_0); - if (ret) { - meson_canvas_free(priv->canvas, priv->canvas_id_osd1); - goto free_drm; - } + if (ret) + goto free_canvas_osd1; ret = meson_canvas_alloc(priv->canvas, &priv->canvas_id_vd1_1); - if (ret) { - meson_canvas_free(priv->canvas, priv->canvas_id_osd1); - meson_canvas_free(priv->canvas, priv->canvas_id_vd1_0); - goto free_drm; - } + if (ret) + goto free_canvas_vd1_0; ret = meson_canvas_alloc(priv->canvas, &priv->canvas_id_vd1_2); - if (ret) { - meson_canvas_free(priv->canvas, priv->canvas_id_osd1); - meson_canvas_free(priv->canvas, priv->canvas_id_vd1_0); - meson_canvas_free(priv->canvas, priv->canvas_id_vd1_1); - goto free_drm; - } + if (ret) + goto free_canvas_vd1_1; priv->vsync_irq = platform_get_irq(pdev, 0); ret = drm_vblank_init(drm, 1); if (ret) - goto free_drm; + goto free_canvas_vd1_2; /* Assign limits per soc revision/package */ for (i = 0 ; i < ARRAY_SIZE(meson_drm_soc_attrs) ; ++i) { @@ -288,11 +279,11 @@ static int meson_drv_bind_master(struct device *dev, bool has_components) */ ret = drm_aperture_remove_framebuffers(&meson_driver); if (ret) - goto free_drm; + goto free_canvas_vd1_2; ret = drmm_mode_config_init(drm); if (ret) - goto free_drm; + goto free_canvas_vd1_2; drm->mode_config.max_width = 3840; drm->mode_config.max_height = 2160; drm->mode_config.funcs = &meson_mode_config_funcs; @@ -307,7 +298,7 @@ static int meson_drv_bind_master(struct device *dev, bool has_components) if (priv->afbcd.ops) { ret = priv->afbcd.ops->init(priv); if (ret) - goto free_drm; + goto free_canvas_vd1_2; } /* Encoder Initialization */ @@ -371,6 +362,14 @@ uninstall_irq: exit_afbcd: if (priv->afbcd.ops) priv->afbcd.ops->exit(priv); +free_canvas_vd1_2: + meson_canvas_free(priv->canvas, priv->canvas_id_vd1_2); +free_canvas_vd1_1: + meson_canvas_free(priv->canvas, priv->canvas_id_vd1_1); +free_canvas_vd1_0: + meson_canvas_free(priv->canvas, priv->canvas_id_vd1_0); +free_canvas_osd1: + meson_canvas_free(priv->canvas, priv->canvas_id_osd1); free_drm: drm_dev_put(drm); diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c b/drivers/gpu/drm/nouveau/nouveau_connector.c index 856b3ef5edb8..0c71d761d378 100644 --- a/drivers/gpu/drm/nouveau/nouveau_connector.c +++ b/drivers/gpu/drm/nouveau/nouveau_connector.c @@ -1001,6 +1001,9 @@ nouveau_connector_get_modes(struct drm_connector *connector) struct drm_display_mode *mode; mode = drm_mode_duplicate(dev, nv_connector->native_mode); + if (!mode) + return 0; + drm_mode_probed_add(connector, mode); ret = 1; } diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c index b8a84f26b3ef..b5e7b919f241 100644 --- a/drivers/gpu/drm/panthor/panthor_drv.c +++ b/drivers/gpu/drm/panthor/panthor_drv.c @@ -86,15 +86,15 @@ panthor_get_uobj_array(const struct drm_panthor_obj_array *in, u32 min_stride, int ret = 0; void *out_alloc; + if (!in->count) + return NULL; + /* User stride must be at least the minimum object size, otherwise it might * lack useful information. 
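The meson_drv_bind_master() rework above replaces repeated inline meson_canvas_free() calls with a single unwind ladder: each allocation gets a matching label, and any later failure jumps to the label that releases everything acquired so far, in reverse order. A compact model of that error-handling shape, with placeholder resource functions, is shown below.

#include <stdio.h>

/* Placeholder acquire/release pairs standing in for the canvas allocations. */
static int  acquire(const char *name) { printf("acquire %s\n", name); return 0; }
static void release(const char *name) { printf("release %s\n", name); }

static int bind_example(void)
{
	int ret;

	ret = acquire("osd1");
	if (ret)
		goto out;
	ret = acquire("vd1_0");
	if (ret)
		goto free_osd1;
	ret = acquire("vd1_1");
	if (ret)
		goto free_vd1_0;

	return 0;			/* success: keep everything */

	/* unwind in reverse order of acquisition */
free_vd1_0:
	release("vd1_0");
free_osd1:
	release("osd1");
out:
	return ret;
}

int main(void) { return bind_example(); }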
*/ if (in->stride < min_stride) return ERR_PTR(-EINVAL); - if (!in->count) - return NULL; - out_alloc = kvmalloc_array(in->count, obj_size, GFP_KERNEL); if (!out_alloc) return ERR_PTR(-ENOMEM); diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c index 79ffcbc41d78..9a0ff48f7061 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.c +++ b/drivers/gpu/drm/panthor/panthor_sched.c @@ -459,6 +459,16 @@ struct panthor_queue { atomic64_t seqno; /** + * @last_fence: Fence of the last submitted job. + * + * We return this fence when we get an empty command stream. + * This way, we are guaranteed that all earlier jobs have completed + * when drm_sched_job::s_fence::finished without having to feed + * the CS ring buffer with a dummy job that only signals the fence. + */ + struct dma_fence *last_fence; + + /** * @in_flight_jobs: List containing all in-flight jobs. * * Used to keep track and signal panthor_job::done_fence when the @@ -829,6 +839,9 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue * panthor_kernel_bo_destroy(queue->ringbuf); panthor_kernel_bo_destroy(queue->iface.mem); + /* Release the last_fence we were holding, if any. */ + dma_fence_put(queue->fence_ctx.last_fence); + kfree(queue); } @@ -2784,9 +2797,6 @@ static void group_sync_upd_work(struct work_struct *work) spin_lock(&queue->fence_ctx.lock); list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) { - if (!job->call_info.size) - continue; - if (syncobj->seqno < job->done_fence->seqno) break; @@ -2865,11 +2875,14 @@ queue_run_job(struct drm_sched_job *sched_job) static_assert(sizeof(call_instrs) % 64 == 0, "call_instrs is not aligned on a cacheline"); - /* Stream size is zero, nothing to do => return a NULL fence and let - * drm_sched signal the parent. + /* Stream size is zero, nothing to do except making sure all previously + * submitted jobs are done before we signal the + * drm_sched_job::s_fence::finished fence. */ - if (!job->call_info.size) - return NULL; + if (!job->call_info.size) { + job->done_fence = dma_fence_get(queue->fence_ctx.last_fence); + return dma_fence_get(job->done_fence); + } ret = pm_runtime_resume_and_get(ptdev->base.dev); if (drm_WARN_ON(&ptdev->base, ret)) @@ -2928,6 +2941,10 @@ queue_run_job(struct drm_sched_job *sched_job) } } + /* Update the last fence. */ + dma_fence_put(queue->fence_ctx.last_fence); + queue->fence_ctx.last_fence = dma_fence_get(job->done_fence); + done_fence = dma_fence_get(job->done_fence); out_unlock: @@ -3378,10 +3395,15 @@ panthor_job_create(struct panthor_file *pfile, goto err_put_job; } - job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL); - if (!job->done_fence) { - ret = -ENOMEM; - goto err_put_job; + /* Empty command streams don't need a fence, they'll pick the one from + * the previously submitted job. 
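The panthor change above keeps a reference to the fence of the last real job on each queue, so an empty command stream can simply hand that fence back: the scheduler then only considers the empty job finished once all earlier work on the queue has signalled, without pushing a dummy job into the ring. Below is a stripped-down model of that "hold the last fence, reuse it for empty submissions" idea, using a toy refcounted fence type rather than struct dma_fence.

#include <stddef.h>
#include <stdlib.h>

/* Toy refcounted fence standing in for struct dma_fence. */
struct fence { int refs; };

struct fence *fence_get(struct fence *f) { if (f) f->refs++; return f; }
void fence_put(struct fence *f)
{
	if (f && --f->refs == 0)
		free(f);
}

struct queue { struct fence *last_fence; };

/* Submit a job: an empty stream reuses the previous job's fence, a real
 * job installs its own fence as the new last_fence. Returns the fence the
 * caller should wait on, with its own reference (may be NULL if the queue
 * has never run a real job, matching the behaviour described above). */
struct fence *queue_submit(struct queue *q, size_t stream_size,
			   struct fence *job_fence)
{
	if (!stream_size)
		return fence_get(q->last_fence);

	fence_put(q->last_fence);
	q->last_fence = fence_get(job_fence);
	return fence_get(job_fence);
}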
+ */ + if (job->call_info.size) { + job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL); + if (!job->done_fence) { + ret = -ENOMEM; + goto err_put_job; + } } ret = drm_sched_job_init(&job->base, diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 2ef201a072f1..e66a230331ee 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -642,7 +642,7 @@ static void radeon_gem_va_update_vm(struct radeon_device *rdev, if (r) goto error_unlock; - if (bo_va->it.start) + if (bo_va->it.start && bo_va->bo) r = radeon_vm_bo_update(rdev, bo_va, bo_va->bo->tbo.resource); error_unlock: diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 6396dece0db1..2427be8bc97f 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -346,6 +346,7 @@ static void ttm_bo_release(struct kref *kref) if (!dma_resv_test_signaled(bo->base.resv, DMA_RESV_USAGE_BOOKKEEP) || (want_init_on_free() && (bo->ttm != NULL)) || + bo->type == ttm_bo_type_sg || !dma_resv_trylock(bo->base.resv)) { /* The BO is not idle, resurrect it for delayed destroy */ ttm_bo_flush_all_fences(bo); diff --git a/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c b/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c index d46f87a039f2..b3d3c065dd9d 100644 --- a/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c +++ b/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c @@ -159,12 +159,16 @@ void intel_hdcp_gsc_fini(struct xe_device *xe) { struct intel_hdcp_gsc_message *hdcp_message = xe->display.hdcp.hdcp_message; + struct i915_hdcp_arbiter *arb = xe->display.hdcp.arbiter; - if (!hdcp_message) - return; + if (hdcp_message) { + xe_bo_unpin_map_no_vm(hdcp_message->hdcp_bo); + kfree(hdcp_message); + xe->display.hdcp.hdcp_message = NULL; + } - xe_bo_unpin_map_no_vm(hdcp_message->hdcp_bo); - kfree(hdcp_message); + kfree(arb); + xe->display.hdcp.arbiter = NULL; } static int xe_gsc_send_sync(struct xe_device *xe, diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index bc1f794e3e61..b6f3a43d637f 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -317,7 +317,7 @@ static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo, struct xe_device *xe = xe_bo_device(bo); struct xe_ttm_tt *tt; unsigned long extra_pages; - enum ttm_caching caching; + enum ttm_caching caching = ttm_cached; int err; tt = kzalloc(sizeof(*tt), GFP_KERNEL); @@ -331,26 +331,35 @@ static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo, extra_pages = DIV_ROUND_UP(xe_device_ccs_bytes(xe, bo->size), PAGE_SIZE); - switch (bo->cpu_caching) { - case DRM_XE_GEM_CPU_CACHING_WC: - caching = ttm_write_combined; - break; - default: - caching = ttm_cached; - break; - } - - WARN_ON((bo->flags & XE_BO_FLAG_USER) && !bo->cpu_caching); - /* - * Display scanout is always non-coherent with the CPU cache. - * - * For Xe_LPG and beyond, PPGTT PTE lookups are also non-coherent and - * require a CPU:WC mapping. + * DGFX system memory is always WB / ttm_cached, since + * other caching modes are only supported on x86. DGFX + * GPU system memory accesses are always coherent with the + * CPU. 
*/ - if ((!bo->cpu_caching && bo->flags & XE_BO_FLAG_SCANOUT) || - (xe->info.graphics_verx100 >= 1270 && bo->flags & XE_BO_FLAG_PAGETABLE)) - caching = ttm_write_combined; + if (!IS_DGFX(xe)) { + switch (bo->cpu_caching) { + case DRM_XE_GEM_CPU_CACHING_WC: + caching = ttm_write_combined; + break; + default: + caching = ttm_cached; + break; + } + + WARN_ON((bo->flags & XE_BO_FLAG_USER) && !bo->cpu_caching); + + /* + * Display scanout is always non-coherent with the CPU cache. + * + * For Xe_LPG and beyond, PPGTT PTE lookups are also + * non-coherent and require a CPU:WC mapping. + */ + if ((!bo->cpu_caching && bo->flags & XE_BO_FLAG_SCANOUT) || + (xe->info.graphics_verx100 >= 1270 && + bo->flags & XE_BO_FLAG_PAGETABLE)) + caching = ttm_write_combined; + } err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, extra_pages); if (err) { diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h index 86422e113d39..10450f1fbbde 100644 --- a/drivers/gpu/drm/xe/xe_bo_types.h +++ b/drivers/gpu/drm/xe/xe_bo_types.h @@ -66,7 +66,8 @@ struct xe_bo { /** * @cpu_caching: CPU caching mode. Currently only used for userspace - * objects. + * objects. Exceptions are system memory on DGFX, which is always + * WB. */ u16 cpu_caching; diff --git a/drivers/gpu/drm/xe/xe_gt_mcr.c b/drivers/gpu/drm/xe/xe_gt_mcr.c index 577bd7043740..0443e07880a0 100644 --- a/drivers/gpu/drm/xe/xe_gt_mcr.c +++ b/drivers/gpu/drm/xe/xe_gt_mcr.c @@ -342,7 +342,7 @@ static void init_steering_oaddrm(struct xe_gt *gt) else gt->steering[OADDRM].group_target = 1; - gt->steering[DSS].instance_target = 0; /* unused */ + gt->steering[OADDRM].instance_target = 0; /* unused */ } static void init_steering_sqidi_psmi(struct xe_gt *gt) @@ -357,8 +357,8 @@ static void init_steering_sqidi_psmi(struct xe_gt *gt) static void init_steering_inst0(struct xe_gt *gt) { - gt->steering[DSS].group_target = 0; /* unused */ - gt->steering[DSS].instance_target = 0; /* unused */ + gt->steering[INSTANCE0].group_target = 0; /* unused */ + gt->steering[INSTANCE0].instance_target = 0; /* unused */ } static const struct { diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 65e5a3f4c340..198f5c2189cb 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1334,7 +1334,7 @@ xe_migrate_update_pgtables(struct xe_migrate *m, GFP_KERNEL, true, 0); if (IS_ERR(sa_bo)) { err = PTR_ERR(sa_bo); - goto err; + goto err_bb; } ppgtt_ofs = NUM_KERNEL_PDE + @@ -1385,7 +1385,7 @@ xe_migrate_update_pgtables(struct xe_migrate *m, update_idx); if (IS_ERR(job)) { err = PTR_ERR(job); - goto err_bb; + goto err_sa; } /* Wait on BO move */ @@ -1434,12 +1434,12 @@ xe_migrate_update_pgtables(struct xe_migrate *m, err_job: xe_sched_job_put(job); +err_sa: + drm_suballoc_free(sa_bo, NULL); err_bb: if (!q) mutex_unlock(&m->job_mutex); xe_bb_free(bb, NULL); -err: - drm_suballoc_free(sa_bo, NULL); return ERR_PTR(err); } diff --git a/drivers/i2c/Kconfig b/drivers/i2c/Kconfig index 9388823bb0bb..44710267d669 100644 --- a/drivers/i2c/Kconfig +++ b/drivers/i2c/Kconfig @@ -135,7 +135,7 @@ config I2C_SLAVE_EEPROM Documentation/i2c/slave-eeprom-backend.rst for further details. config I2C_SLAVE_TESTUNIT - tristate "I2C eeprom testunit driver" + tristate "I2C testunit driver" help This backend can be used to trigger test cases for I2C bus masters which require a remote device with certain capabilities, e.g. 
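The xe_ttm_tt_create() hunk above narrows the caching selection: on discrete GPUs system memory is always kept write-back, and only on integrated parts do a userspace WC hint, a scanout buffer, or Xe_LPG+ page tables force a write-combined mapping. The helper below restates that decision tree in isolation; the enum, flag, and parameter names are local stand-ins, not the driver's types.

#include <stdbool.h>
#include <stdint.h>

enum caching { CACHED, WRITE_COMBINED };     /* stand-ins for ttm_caching */

#define FLAG_SCANOUT   (1u << 0)             /* illustrative flag bits */
#define FLAG_PAGETABLE (1u << 1)

enum cpu_caching_hint { HINT_NONE = 0, HINT_WB, HINT_WC };

enum caching pick_caching(bool is_dgfx, enum cpu_caching_hint hint,
			  uint32_t flags, unsigned int graphics_verx100)
{
	if (is_dgfx)
		return CACHED;          /* DGFX system memory is always WB */

	if (hint == HINT_WC)
		return WRITE_COMBINED;  /* explicit userspace request */

	/* kernel-owned BOs (no hint): scanout is never CPU-cached, and
	 * Xe_LPG+ page-table lookups need a CPU:WC mapping as well */
	if ((hint == HINT_NONE && (flags & FLAG_SCANOUT)) ||
	    (graphics_verx100 >= 1270 && (flags & FLAG_PAGETABLE)))
		return WRITE_COMBINED;

	return CACHED;
}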
diff --git a/drivers/i2c/busses/i2c-pnx.c b/drivers/i2c/busses/i2c-pnx.c index a12525b3186b..f448505d5468 100644 --- a/drivers/i2c/busses/i2c-pnx.c +++ b/drivers/i2c/busses/i2c-pnx.c @@ -15,7 +15,6 @@ #include <linux/ioport.h> #include <linux/delay.h> #include <linux/i2c.h> -#include <linux/timer.h> #include <linux/completion.h> #include <linux/platform_device.h> #include <linux/io.h> @@ -32,7 +31,6 @@ struct i2c_pnx_mif { int ret; /* Return value */ int mode; /* Interface mode */ struct completion complete; /* I/O completion */ - struct timer_list timer; /* Timeout */ u8 * buf; /* Data buffer */ int len; /* Length of data buffer */ int order; /* RX Bytes to order via TX */ @@ -117,24 +115,6 @@ static inline int wait_reset(struct i2c_pnx_algo_data *data) return (timeout <= 0); } -static inline void i2c_pnx_arm_timer(struct i2c_pnx_algo_data *alg_data) -{ - struct timer_list *timer = &alg_data->mif.timer; - unsigned long expires = msecs_to_jiffies(alg_data->timeout); - - if (expires <= 1) - expires = 2; - - del_timer_sync(timer); - - dev_dbg(&alg_data->adapter.dev, "Timer armed at %lu plus %lu jiffies.\n", - jiffies, expires); - - timer->expires = jiffies + expires; - - add_timer(timer); -} - /** * i2c_pnx_start - start a device * @slave_addr: slave address @@ -259,8 +239,6 @@ static int i2c_pnx_master_xmit(struct i2c_pnx_algo_data *alg_data) ~(mcntrl_afie | mcntrl_naie | mcntrl_drmie), I2C_REG_CTL(alg_data)); - del_timer_sync(&alg_data->mif.timer); - dev_dbg(&alg_data->adapter.dev, "%s(): Waking up xfer routine.\n", __func__); @@ -276,8 +254,6 @@ static int i2c_pnx_master_xmit(struct i2c_pnx_algo_data *alg_data) ~(mcntrl_afie | mcntrl_naie | mcntrl_drmie), I2C_REG_CTL(alg_data)); - /* Stop timer. */ - del_timer_sync(&alg_data->mif.timer); dev_dbg(&alg_data->adapter.dev, "%s(): Waking up xfer routine after zero-xfer.\n", __func__); @@ -364,8 +340,6 @@ static int i2c_pnx_master_rcv(struct i2c_pnx_algo_data *alg_data) mcntrl_drmie | mcntrl_daie); iowrite32(ctl, I2C_REG_CTL(alg_data)); - /* Kill timer. */ - del_timer_sync(&alg_data->mif.timer); complete(&alg_data->mif.complete); } } @@ -400,8 +374,6 @@ static irqreturn_t i2c_pnx_interrupt(int irq, void *dev_id) mcntrl_drmie); iowrite32(ctl, I2C_REG_CTL(alg_data)); - /* Stop timer, to prevent timeout. */ - del_timer_sync(&alg_data->mif.timer); complete(&alg_data->mif.complete); } else if (stat & mstatus_nai) { /* Slave did not acknowledge, generate a STOP */ @@ -419,8 +391,6 @@ static irqreturn_t i2c_pnx_interrupt(int irq, void *dev_id) /* Our return value. */ alg_data->mif.ret = -EIO; - /* Stop timer, to prevent timeout. 
*/ - del_timer_sync(&alg_data->mif.timer); complete(&alg_data->mif.complete); } else { /* @@ -453,9 +423,8 @@ static irqreturn_t i2c_pnx_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static void i2c_pnx_timeout(struct timer_list *t) +static void i2c_pnx_timeout(struct i2c_pnx_algo_data *alg_data) { - struct i2c_pnx_algo_data *alg_data = from_timer(alg_data, t, mif.timer); u32 ctl; dev_err(&alg_data->adapter.dev, @@ -472,7 +441,6 @@ static void i2c_pnx_timeout(struct timer_list *t) iowrite32(ctl, I2C_REG_CTL(alg_data)); wait_reset(alg_data); alg_data->mif.ret = -EIO; - complete(&alg_data->mif.complete); } static inline void bus_reset_if_active(struct i2c_pnx_algo_data *alg_data) @@ -514,6 +482,7 @@ i2c_pnx_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) struct i2c_msg *pmsg; int rc = 0, completed = 0, i; struct i2c_pnx_algo_data *alg_data = adap->algo_data; + unsigned long time_left; u32 stat; dev_dbg(&alg_data->adapter.dev, @@ -548,7 +517,6 @@ i2c_pnx_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) dev_dbg(&alg_data->adapter.dev, "%s(): mode %d, %d bytes\n", __func__, alg_data->mif.mode, alg_data->mif.len); - i2c_pnx_arm_timer(alg_data); /* initialize the completion var */ init_completion(&alg_data->mif.complete); @@ -564,7 +532,10 @@ i2c_pnx_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) break; /* Wait for completion */ - wait_for_completion(&alg_data->mif.complete); + time_left = wait_for_completion_timeout(&alg_data->mif.complete, + alg_data->timeout); + if (time_left == 0) + i2c_pnx_timeout(alg_data); if (!(rc = alg_data->mif.ret)) completed++; @@ -653,7 +624,10 @@ static int i2c_pnx_probe(struct platform_device *pdev) alg_data->adapter.algo_data = alg_data; alg_data->adapter.nr = pdev->id; - alg_data->timeout = I2C_PNX_TIMEOUT_DEFAULT; + alg_data->timeout = msecs_to_jiffies(I2C_PNX_TIMEOUT_DEFAULT); + if (alg_data->timeout <= 1) + alg_data->timeout = 2; + #ifdef CONFIG_OF alg_data->adapter.dev.of_node = of_node_get(pdev->dev.of_node); if (pdev->dev.of_node) { @@ -673,8 +647,6 @@ static int i2c_pnx_probe(struct platform_device *pdev) if (IS_ERR(alg_data->clk)) return PTR_ERR(alg_data->clk); - timer_setup(&alg_data->mif.timer, i2c_pnx_timeout, 0); - snprintf(alg_data->adapter.name, sizeof(alg_data->adapter.name), "%s", pdev->name); diff --git a/drivers/i2c/busses/i2c-rcar.c b/drivers/i2c/busses/i2c-rcar.c index 828aa2ea0fe4..185a5d60f101 100644 --- a/drivers/i2c/busses/i2c-rcar.c +++ b/drivers/i2c/busses/i2c-rcar.c @@ -257,6 +257,14 @@ static void rcar_i2c_init(struct rcar_i2c_priv *priv) } } +static void rcar_i2c_reset_slave(struct rcar_i2c_priv *priv) +{ + rcar_i2c_write(priv, ICSIER, 0); + rcar_i2c_write(priv, ICSSR, 0); + rcar_i2c_write(priv, ICSCR, SDBS); + rcar_i2c_write(priv, ICSAR, 0); /* Gen2: must be 0 if not using slave */ +} + static int rcar_i2c_bus_barrier(struct rcar_i2c_priv *priv) { int ret; @@ -875,6 +883,10 @@ static int rcar_i2c_do_reset(struct rcar_i2c_priv *priv) { int ret; + /* Don't reset if a slave instance is currently running */ + if (priv->slave) + return -EISCONN; + ret = reset_control_reset(priv->rstc); if (ret) return ret; @@ -903,10 +915,10 @@ static int rcar_i2c_master_xfer(struct i2c_adapter *adap, /* Gen3+ needs a reset. 
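The i2c-pnx conversion above drops the driver's private watchdog timer entirely: the transfer path now waits with wait_for_completion_timeout(), and the recovery code (the old timer callback body, now a plain helper) runs inline when the wait expires, with the timeout pre-converted to jiffies at probe time and clamped to a minimum of two jiffies. As a rough userspace analogue of "wait with a deadline, recover on expiry", the sketch below uses a POSIX condition variable in place of a kernel completion; it is a model of the pattern, not of the driver.

#include <errno.h>
#include <pthread.h>
#include <time.h>

/* Minimal stand-in for a kernel completion: a flag plus a condvar. */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	int             done;
};

void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

/* Returns 1 if completed, 0 on timeout (mirroring the convention of
 * wait_for_completion_timeout() returning 0 when the wait expires). */
int wait_for_completion_ms(struct completion *c, int ms)
{
	struct timespec ts;
	int ret = 1;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec  += ms / 1000;
	ts.tv_nsec += (ms % 1000) * 1000000L;
	if (ts.tv_nsec >= 1000000000L) {
		ts.tv_sec++;
		ts.tv_nsec -= 1000000000L;
	}

	pthread_mutex_lock(&c->lock);
	while (!c->done && ret)
		if (pthread_cond_timedwait(&c->cond, &c->lock, &ts) == ETIMEDOUT)
			ret = 0;	/* caller runs its recovery path here */
	pthread_mutex_unlock(&c->lock);
	return ret;
}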
That also allows RXDMA once */ if (priv->devtype >= I2C_RCAR_GEN3) { - priv->flags &= ~ID_P_NO_RXDMA; ret = rcar_i2c_do_reset(priv); if (ret) goto out; + priv->flags &= ~ID_P_NO_RXDMA; } rcar_i2c_init(priv); @@ -1033,11 +1045,8 @@ static int rcar_unreg_slave(struct i2c_client *slave) /* ensure no irq is running before clearing ptr */ disable_irq(priv->irq); - rcar_i2c_write(priv, ICSIER, 0); - rcar_i2c_write(priv, ICSSR, 0); + rcar_i2c_reset_slave(priv); enable_irq(priv->irq); - rcar_i2c_write(priv, ICSCR, SDBS); - rcar_i2c_write(priv, ICSAR, 0); /* Gen2: must be 0 if not using slave */ priv->slave = NULL; @@ -1152,7 +1161,9 @@ static int rcar_i2c_probe(struct platform_device *pdev) goto out_pm_disable; } - rcar_i2c_write(priv, ICSAR, 0); /* Gen2: must be 0 if not using slave */ + /* Bring hardware to known state */ + rcar_i2c_init(priv); + rcar_i2c_reset_slave(priv); if (priv->devtype < I2C_RCAR_GEN3) { irqflags |= IRQF_NO_THREAD; @@ -1168,6 +1179,7 @@ static int rcar_i2c_probe(struct platform_device *pdev) if (of_property_read_bool(dev->of_node, "smbus")) priv->flags |= ID_P_HOST_NOTIFY; + /* R-Car Gen3+ needs a reset before every transfer */ if (priv->devtype >= I2C_RCAR_GEN3) { priv->rstc = devm_reset_control_get_exclusive(&pdev->dev, NULL); if (IS_ERR(priv->rstc)) { @@ -1178,6 +1190,9 @@ static int rcar_i2c_probe(struct platform_device *pdev) ret = reset_control_status(priv->rstc); if (ret < 0) goto out_pm_put; + + /* hard reset disturbs HostNotify local target, so disable it */ + priv->flags &= ~ID_P_HOST_NOTIFY; } ret = platform_get_irq(pdev, 0); diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index db0d1ac82910..7e7b15440832 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -1067,6 +1067,7 @@ EXPORT_SYMBOL(i2c_find_device_by_fwnode); static const struct i2c_device_id dummy_id[] = { { "dummy", 0 }, + { "smbus_host_notify", 0 }, { }, }; diff --git a/drivers/i2c/i2c-slave-testunit.c b/drivers/i2c/i2c-slave-testunit.c index ca43e98cae1b..23a11e4e9256 100644 --- a/drivers/i2c/i2c-slave-testunit.c +++ b/drivers/i2c/i2c-slave-testunit.c @@ -118,6 +118,13 @@ static int i2c_slave_testunit_slave_cb(struct i2c_client *client, queue_delayed_work(system_long_wq, &tu->worker, msecs_to_jiffies(10 * tu->regs[TU_REG_DELAY])); } + + /* + * Reset reg_idx to avoid that work gets queued again in case of + * STOP after a following read message. But do not clear TU regs + * here because we still need them in the workqueue! + */ + tu->reg_idx = 0; break; case I2C_SLAVE_WRITE_REQUESTED: diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c index 16de57846bd9..2e84776f4fbd 100644 --- a/drivers/iio/industrialio-trigger.c +++ b/drivers/iio/industrialio-trigger.c @@ -315,7 +315,7 @@ int iio_trigger_attach_poll_func(struct iio_trigger *trig, * this is the case if the IIO device and the trigger device share the * same parent device. 
*/ - if (iio_validate_own_trigger(pf->indio_dev, trig)) + if (!iio_validate_own_trigger(pf->indio_dev, trig)) trig->attached_own_device = true; return ret; diff --git a/drivers/iio/light/apds9306.c b/drivers/iio/light/apds9306.c index d6627b3e6000..66a063ea3db4 100644 --- a/drivers/iio/light/apds9306.c +++ b/drivers/iio/light/apds9306.c @@ -583,8 +583,8 @@ static int apds9306_intg_time_set(struct apds9306_data *data, int val2) return ret; intg_old = iio_gts_find_int_time_by_sel(&data->gts, intg_time_idx); - if (ret < 0) - return ret; + if (intg_old < 0) + return intg_old; if (intg_old == val2) return 0; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 4d11fc664cb0..b5d6ef430b86 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -897,7 +897,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, sector_t sectors, struct block_device *cached_bdev, const struct block_device_operations *ops) { - struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, SIZE_MAX / sizeof(atomic_t)); struct queue_limits lim = { @@ -909,6 +908,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, .io_min = block_size, .logical_block_size = block_size, .physical_block_size = block_size, + .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA, }; uint64_t n; int idx; @@ -974,13 +974,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->minors = BCACHE_MINORS; d->disk->fops = ops; d->disk->private_data = d; - - q = d->disk->queue; - - blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); - - blk_queue_write_cache(q, true, true); - return 0; out_bioset_exit: @@ -1423,8 +1416,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) } if (bdev_io_opt(dc->bdev)) - dc->partial_stripes_expensive = - q->limits.raid_partial_stripes_expensive; + dc->partial_stripes_expensive = !!(q->limits.features & + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE); ret = bcache_device_init(&dc->disk, block_size, bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 16884b585053..2d8dd9283ff4 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3403,7 +3403,6 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits) limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; limits->discard_granularity = origin_limits->discard_granularity; limits->discard_alignment = origin_limits->discard_alignment; - limits->discard_misaligned = origin_limits->discard_misaligned; } static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index ad79b52ffc14..b4384a8b13e3 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -2059,7 +2059,6 @@ static void set_discard_limits(struct clone *clone, struct queue_limits *limits) limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors; limits->discard_granularity = dest_limits->discard_granularity; limits->discard_alignment = dest_limits->discard_alignment; - limits->discard_misaligned = dest_limits->discard_misaligned; limits->max_discard_segments = dest_limits->max_discard_segments; } diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 08700bfc3e23..14a44c0f8286 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -206,7 +206,6 @@ struct dm_table { 
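The bcache hunk above belongs to the block-layer rework visible throughout this series: instead of toggling QUEUE_FLAG_* bits and calling blk_queue_write_cache() after the disk exists, the driver now declares its capabilities up front in the queue_limits it passes in (here BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA) and hands the whole structure to the block layer when the disk is set up. A rough sketch of that "describe capabilities declaratively, apply once" shape follows; the flag names and structure are illustrative, not the block layer's definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative feature bits, mirroring the BLK_FEAT_* idea. */
#define FEAT_WRITE_CACHE (1u << 0)
#define FEAT_FUA         (1u << 1)

struct limits {
	unsigned int logical_block_size;
	uint32_t features;
};

/* The driver fills in one limits structure describing the device... */
static struct limits describe_device(unsigned int block_size)
{
	return (struct limits){
		.logical_block_size = block_size,
		.features = FEAT_WRITE_CACHE | FEAT_FUA,
	};
}

/* ...and the midlayer applies it in one step rather than the driver
 * poking individual flags after the queue already exists. */
static void apply_limits(const struct limits *lim)
{
	printf("lbs=%u wc=%d fua=%d\n", lim->logical_block_size,
	       !!(lim->features & FEAT_WRITE_CACHE),
	       !!(lim->features & FEAT_FUA));
}

int main(void)
{
	struct limits lim = describe_device(4096);

	apply_limits(&lim);
	return 0;
}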
bool integrity_supported:1; bool singleton:1; - unsigned integrity_added:1; /* * Indicates the rw permissions for the new logical device. This diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1b7a97cc3779..6c013ceb0e5f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1176,8 +1176,8 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); struct mapped_device *md = dm_table_get_md(ti->table); - /* From now we require underlying device with our integrity profile */ - if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { + /* We require an underlying device with non-PI metadata */ + if (!bi || bi->csum_type != BLK_INTEGRITY_CSUM_NONE) { ti->error = "Integrity profile not supported."; return -EINVAL; } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 417fddebe367..2a89f8eb4713 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -350,25 +350,6 @@ static struct kmem_cache *journal_io_cache; #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) #endif -static void dm_integrity_prepare(struct request *rq) -{ -} - -static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes) -{ -} - -/* - * DM Integrity profile, protection is performed layer above (dm-crypt) - */ -static const struct blk_integrity_profile dm_integrity_profile = { - .name = "DM-DIF-EXT-TAG", - .generate_fn = NULL, - .verify_fn = NULL, - .prepare_fn = dm_integrity_prepare, - .complete_fn = dm_integrity_complete, -}; - static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); static void integrity_bio_wait(struct work_struct *w); static void dm_integrity_dtr(struct dm_target *ti); @@ -3494,6 +3475,17 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim limits->dma_alignment = limits->logical_block_size - 1; limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; } + + if (!ic->internal_hash) { + struct blk_integrity *bi = &limits->integrity; + + memset(bi, 0, sizeof(*bi)); + bi->tuple_size = ic->tag_size; + bi->tag_size = bi->tuple_size; + bi->interval_exp = + ic->sb->log2_sectors_per_block + SECTOR_SHIFT; + } + limits->max_integrity_segments = USHRT_MAX; } @@ -3650,20 +3642,6 @@ try_smaller_buffer: return 0; } -static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) -{ - struct gendisk *disk = dm_disk(dm_table_get_md(ti->table)); - struct blk_integrity bi; - - memset(&bi, 0, sizeof(bi)); - bi.profile = &dm_integrity_profile; - bi.tuple_size = ic->tag_size; - bi.tag_size = bi.tuple_size; - bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; - - blk_integrity_register(disk, &bi); -} - static void dm_integrity_free_page_list(struct page_list *pl) { unsigned int i; @@ -4649,9 +4627,6 @@ try_smaller_buffer: } } - if (!ic->internal_hash) - dm_integrity_set(ti, ic); - ti->num_flush_bios = 1; ti->flush_supported = true; if (ic->discard) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index abe88d1e6735..052c00c1eb15 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3542,7 +3542,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, recovery = rs->md.recovery; state = decipher_sync_action(mddev, recovery); progress = rs_get_progress(rs, recovery, state, resync_max_sectors); - resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? 
+ resync_mismatches = mddev->last_sync_action == ACTION_CHECK ? atomic64_read(&mddev->resync_mismatches) : 0; /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index b2d5246cff21..92d18dcdb3e9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -425,6 +425,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, q->limits.logical_block_size, q->limits.alignment_offset, (unsigned long long) start << SECTOR_SHIFT); + + /* + * Only stack the integrity profile if the target doesn't have native + * integrity support. + */ + if (!dm_target_has_integrity(ti->type)) + queue_limits_stack_integrity_bdev(limits, bdev); return 0; } @@ -572,6 +579,12 @@ int dm_split_args(int *argc, char ***argvp, char *input) return 0; } +static void dm_set_stacking_limits(struct queue_limits *limits) +{ + blk_set_stacking_limits(limits); + limits->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL; +} + /* * Impose necessary and sufficient conditions on a devices's table such * that any incoming bio which respects its logical_block_size can be @@ -610,7 +623,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *t, for (i = 0; i < t->num_targets; i++) { ti = dm_table_get_target(t, i); - blk_set_stacking_limits(&ti_limits); + dm_set_stacking_limits(&ti_limits); /* combine all target devices' limits */ if (ti->type->iterate_devices) @@ -702,9 +715,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->immutable_target_type = ti->type; } - if (dm_target_has_integrity(ti->type)) - t->integrity_added = 1; - ti->table = t; ti->begin = start; ti->len = len; @@ -1014,14 +1024,13 @@ bool dm_table_request_based(struct dm_table *t) return __table_type_request_based(dm_table_get_type(t)); } -static bool dm_table_supports_poll(struct dm_table *t); - static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { enum dm_queue_mode type = dm_table_get_type(t); unsigned int per_io_data_size = 0, front_pad, io_front_pad; unsigned int min_pool_size = 0, pool_size; struct dm_md_mempools *pools; + unsigned int bioset_flags = 0; if (unlikely(type == DM_TYPE_NONE)) { DMERR("no table type is set, can't allocate mempools"); @@ -1038,6 +1047,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * goto init_bs; } + if (md->queue->limits.features & BLK_FEAT_POLL) + bioset_flags |= BIOSET_PERCPU_CACHE; + for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1050,8 +1062,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; - if (bioset_init(&pools->io_bs, pool_size, io_front_pad, - dm_table_supports_poll(t) ? BIOSET_PERCPU_CACHE : 0)) + if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags)) goto out_free_pools; if (t->integrity_supported && bioset_integrity_create(&pools->io_bs, pool_size)) @@ -1119,99 +1130,6 @@ static int dm_table_build_index(struct dm_table *t) return r; } -static bool integrity_profile_exists(struct gendisk *disk) -{ - return !!blk_get_integrity(disk); -} - -/* - * Get a disk whose integrity profile reflects the table's profile. - * Returns NULL if integrity support was inconsistent or unavailable. 
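The dm-table changes above seed the stacked limits with BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL and stack each underlying device's integrity metadata through queue_limits_stack_integrity_bdev(), replacing the gendisk-based profile comparison that the following hunks remove. The sketch below shows only the "start permissive, then AND in every component" shape of that stacking; in the real code integrity is a full blk_integrity profile comparison, not the single boolean used here, and the flag names are stand-ins.

#include <stdbool.h>
#include <stdint.h>

#define FEAT_NOWAIT (1u << 0)       /* illustrative stand-ins for BLK_FEAT_* */
#define FEAT_POLL   (1u << 1)

struct limits { uint32_t features; bool integrity; };

/* Start from the most permissive state... */
void set_stacking_limits(struct limits *lim)
{
	lim->features = FEAT_NOWAIT | FEAT_POLL;
	lim->integrity = true;
}

/* ...then fold in each underlying device: any device that lacks a
 * capability removes it from the stacked result, so the final limits
 * only advertise what every component can honour. */
void stack_one_device(struct limits *top, const struct limits *dev)
{
	top->features &= dev->features;
	top->integrity = top->integrity && dev->integrity;
}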
- */ -static struct gendisk *dm_table_get_integrity_disk(struct dm_table *t) -{ - struct list_head *devices = dm_table_get_devices(t); - struct dm_dev_internal *dd = NULL; - struct gendisk *prev_disk = NULL, *template_disk = NULL; - - for (unsigned int i = 0; i < t->num_targets; i++) { - struct dm_target *ti = dm_table_get_target(t, i); - - if (!dm_target_passes_integrity(ti->type)) - goto no_integrity; - } - - list_for_each_entry(dd, devices, list) { - template_disk = dd->dm_dev->bdev->bd_disk; - if (!integrity_profile_exists(template_disk)) - goto no_integrity; - else if (prev_disk && - blk_integrity_compare(prev_disk, template_disk) < 0) - goto no_integrity; - prev_disk = template_disk; - } - - return template_disk; - -no_integrity: - if (prev_disk) - DMWARN("%s: integrity not set: %s and %s profile mismatch", - dm_device_name(t->md), - prev_disk->disk_name, - template_disk->disk_name); - return NULL; -} - -/* - * Register the mapped device for blk_integrity support if the - * underlying devices have an integrity profile. But all devices may - * not have matching profiles (checking all devices isn't reliable - * during table load because this table may use other DM device(s) which - * must be resumed before they will have an initialized integity - * profile). Consequently, stacked DM devices force a 2 stage integrity - * profile validation: First pass during table load, final pass during - * resume. - */ -static int dm_table_register_integrity(struct dm_table *t) -{ - struct mapped_device *md = t->md; - struct gendisk *template_disk = NULL; - - /* If target handles integrity itself do not register it here. */ - if (t->integrity_added) - return 0; - - template_disk = dm_table_get_integrity_disk(t); - if (!template_disk) - return 0; - - if (!integrity_profile_exists(dm_disk(md))) { - t->integrity_supported = true; - /* - * Register integrity profile during table load; we can do - * this because the final profile must match during resume. - */ - blk_integrity_register(dm_disk(md), - blk_get_integrity(template_disk)); - return 0; - } - - /* - * If DM device already has an initialized integrity - * profile the new profile should not conflict. - */ - if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { - DMERR("%s: conflict with existing integrity profile: %s profile mismatch", - dm_device_name(t->md), - template_disk->disk_name); - return 1; - } - - /* Preserve existing integrity profile */ - t->integrity_supported = true; - return 0; -} - #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct dm_crypto_profile { @@ -1423,12 +1341,6 @@ int dm_table_complete(struct dm_table *t) return r; } - r = dm_table_register_integrity(t); - if (r) { - DMERR("could not register integrity profile."); - return r; - } - r = dm_table_construct_crypto_profile(t); if (r) { DMERR("could not construct crypto profile."); @@ -1493,14 +1405,6 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } -static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags); -} - /* * type->iterate_devices() should be called when the sanity check needs to * iterate and check all underlying data devices. 
iterate_devices() will @@ -1548,19 +1452,6 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev, return 0; } -static bool dm_table_supports_poll(struct dm_table *t) -{ - for (unsigned int i = 0; i < t->num_targets; i++) { - struct dm_target *ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - ti->type->iterate_devices(ti, device_not_poll_capable, NULL)) - return false; - } - - return true; -} - /* * Check whether a table has no data devices attached using each * target's iterate_devices method. @@ -1686,12 +1577,20 @@ int dm_calculate_queue_limits(struct dm_table *t, unsigned int zone_sectors = 0; bool zoned = false; - blk_set_stacking_limits(limits); + dm_set_stacking_limits(limits); + + t->integrity_supported = true; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_passes_integrity(ti->type)) + t->integrity_supported = false; + } for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); - blk_set_stacking_limits(&ti_limits); + dm_set_stacking_limits(&ti_limits); if (!ti->type->iterate_devices) { /* Set I/O hints portion of queue limits */ @@ -1706,12 +1605,12 @@ int dm_calculate_queue_limits(struct dm_table *t, ti->type->iterate_devices(ti, dm_set_device_limits, &ti_limits); - if (!zoned && ti_limits.zoned) { + if (!zoned && (ti_limits.features & BLK_FEAT_ZONED)) { /* * After stacking all limits, validate all devices * in table support this zoned model and zone sectors. */ - zoned = ti_limits.zoned; + zoned = (ti_limits.features & BLK_FEAT_ZONED); zone_sectors = ti_limits.chunk_sectors; } @@ -1738,6 +1637,18 @@ combine_limits: dm_device_name(t->md), (unsigned long long) ti->begin, (unsigned long long) ti->len); + + if (t->integrity_supported || + dm_target_has_integrity(ti->type)) { + if (!queue_limits_stack_integrity(limits, &ti_limits)) { + DMWARN("%s: adding target device (start sect %llu len %llu) " + "disabled integrity support due to incompatibility", + dm_device_name(t->md), + (unsigned long long) ti->begin, + (unsigned long long) ti->len); + t->integrity_supported = false; + } + } } /* @@ -1747,12 +1658,12 @@ combine_limits: * zoned model on host-managed zoned block devices. * BUT... */ - if (limits->zoned) { + if (limits->features & BLK_FEAT_ZONED) { /* * ...IF the above limits stacking determined a zoned model * validate that all of the table's devices conform to it. */ - zoned = limits->zoned; + zoned = limits->features & BLK_FEAT_ZONED; zone_sectors = limits->chunk_sectors; } if (validate_hardware_zoned(t, zoned, zone_sectors)) @@ -1762,63 +1673,15 @@ combine_limits: } /* - * Verify that all devices have an integrity profile that matches the - * DM device's registered integrity profile. If the profiles don't - * match then unregister the DM device's integrity profile. + * Check if a target requires flush support even if none of the underlying + * devices need it (e.g. to persist target-specific metadata). */ -static void dm_table_verify_integrity(struct dm_table *t) -{ - struct gendisk *template_disk = NULL; - - if (t->integrity_added) - return; - - if (t->integrity_supported) { - /* - * Verify that the original integrity profile - * matches all the devices in this table. 
- */ - template_disk = dm_table_get_integrity_disk(t); - if (template_disk && - blk_integrity_compare(dm_disk(t->md), template_disk) >= 0) - return; - } - - if (integrity_profile_exists(dm_disk(t->md))) { - DMWARN("%s: unable to establish an integrity profile", - dm_device_name(t->md)); - blk_integrity_unregister(dm_disk(t->md)); - } -} - -static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static bool dm_table_supports_flush(struct dm_table *t) { - unsigned long flush = (unsigned long) data; - struct request_queue *q = bdev_get_queue(dev->bdev); - - return (q->queue_flags & flush); -} - -static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) -{ - /* - * Require at least one underlying device to support flushes. - * t->devices includes internal dm devices such as mirror logs - * so we need to use iterate_devices here, which targets - * supporting flushes must provide. - */ for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); - if (!ti->num_flush_bios) - continue; - - if (ti->flush_supported) - return true; - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_flush_capable, (void *) flush)) + if (ti->num_flush_bios && ti->flush_supported) return true; } @@ -1839,20 +1702,6 @@ static int device_dax_write_cache_enabled(struct dm_target *ti, return false; } -static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - return !bdev_nonrot(dev->bdev); -} - -static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return !blk_queue_add_random(q); -} - static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1877,12 +1726,6 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t) return true; } -static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - return !bdev_nowait(dev->bdev); -} - static bool dm_table_supports_nowait(struct dm_table *t) { for (unsigned int i = 0; i < t->num_targets; i++) { @@ -1890,10 +1733,6 @@ static bool dm_table_supports_nowait(struct dm_table *t) if (!dm_target_supports_nowait(ti->type)) return false; - - if (!ti->type->iterate_devices || - ti->type->iterate_devices(ti, device_not_nowait_capable, NULL)) - return false; } return true; @@ -1950,29 +1789,25 @@ static bool dm_table_supports_secure_erase(struct dm_table *t) return true; } -static int device_requires_stable_pages(struct dm_target *ti, - struct dm_dev *dev, sector_t start, - sector_t len, void *data) -{ - return bdev_stable_writes(dev->bdev); -} - int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { - bool wc = false, fua = false; int r; - if (dm_table_supports_nowait(t)) - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); + if (!dm_table_supports_nowait(t)) + limits->features &= ~BLK_FEAT_NOWAIT; + + /* + * The current polling impementation does not support request based + * stacking. 
+ */ + if (!__table_type_bio_based(t->type)) + limits->features &= ~BLK_FEAT_POLL; if (!dm_table_supports_discards(t)) { limits->max_hw_discard_sectors = 0; limits->discard_granularity = 0; limits->discard_alignment = 0; - limits->discard_misaligned = 0; } if (!dm_table_supports_write_zeroes(t)) @@ -1981,58 +1816,22 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_secure_erase(t)) limits->max_secure_erase_sectors = 0; - if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { - wc = true; - if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) - fua = true; - } - blk_queue_write_cache(q, wc, fua); + if (dm_table_supports_flush(t)) + limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; if (dm_table_supports_dax(t, device_not_dax_capable)) { - blk_queue_flag_set(QUEUE_FLAG_DAX, q); + limits->features |= BLK_FEAT_DAX; if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) set_dax_synchronous(t->md->dax_dev); } else - blk_queue_flag_clear(QUEUE_FLAG_DAX, q); + limits->features &= ~BLK_FEAT_DAX; if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) dax_write_cache(t->md->dax_dev, true); - /* Ensure that all underlying devices are non-rotational. */ - if (dm_table_any_dev_attr(t, device_is_rotational, NULL)) - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - - dm_table_verify_integrity(t); - - /* - * Some devices don't use blk_integrity but still want stable pages - * because they do their own checksumming. - * If any underlying device requires stable pages, a table must require - * them as well. Only targets that support iterate_devices are considered: - * don't want error, zero, etc to require stable pages. - */ - if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL)) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); - else - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); - - /* - * Determine whether or not this queue's I/O timings contribute - * to the entropy pool, Only request-based targets use this. - * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not - * have it set. - */ - if (blk_queue_add_random(q) && - dm_table_any_dev_attr(t, device_is_not_random, NULL)) - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); - - /* - * For a zoned target, setup the zones related queue attributes - * and resources necessary for zone append emulation if necessary. - */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) { + /* For a zoned table, setup the zone related queue attributes. */ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (limits->features & BLK_FEAT_ZONED)) { r = dm_set_zones_restrictions(t, q, limits); if (r) return r; @@ -2042,22 +1841,18 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (r) return r; - dm_update_crypto_profile(q, t); - /* - * Check for request-based device is left to - * dm_mq_init_request_queue()->blk_mq_init_allocated_queue(). - * - * For bio-based device, only set QUEUE_FLAG_POLL when all - * underlying devices supporting polling. + * Now that the limits are set, check the zones mapped by the table + * and setup the resources for zone append emulation if necessary. 
*/ - if (__table_type_bio_based(t->type)) { - if (dm_table_supports_poll(t)) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - else - blk_queue_flag_clear(QUEUE_FLAG_POLL, q); + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (limits->features & BLK_FEAT_ZONED)) { + r = dm_revalidate_zones(t, q); + if (r) + return r; } + dm_update_crypto_profile(q, t); return 0; } diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index b423bec6458b..9d51f72a9d66 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -945,7 +945,7 @@ static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits) * The value is used by dm-thin to determine whether to pass down discards. The block layer * splits large discards on this boundary when this is set. */ - limits->max_discard_sectors = + limits->max_hw_discard_sectors = (vdo->device_config->max_discard_blocks * VDO_SECTORS_PER_BLOCK); /* diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 5d66d916730e..c0d41c36e06e 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -13,8 +13,6 @@ #define DM_MSG_PREFIX "zone" -#define DM_ZONE_INVALID_WP_OFST UINT_MAX - /* * For internal zone reports bypassing the top BIO submission path. */ @@ -146,34 +144,27 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) } /* - * Count conventional zones of a mapped zoned device. If the device - * only has conventional zones, do not expose it as zoned. - */ -static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - unsigned int *nr_conv_zones = data; - - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - (*nr_conv_zones)++; - - return 0; -} - -/* * Revalidate the zones of a mapped device to initialize resource necessary * for zone append emulation. Note that we cannot simply use the block layer * blk_revalidate_disk_zones() function here as the mapped device is suspended * (this is called from __bind() context). */ -static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) +int dm_revalidate_zones(struct dm_table *t, struct request_queue *q) { + struct mapped_device *md = t->md; struct gendisk *disk = md->disk; int ret; + if (!get_capacity(disk)) + return 0; + /* Revalidate only if something changed. */ - if (!disk->nr_zones || disk->nr_zones != md->nr_zones) + if (!disk->nr_zones || disk->nr_zones != md->nr_zones) { + DMINFO("%s using %s zone append", + disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); md->nr_zones = 0; + } if (md->nr_zones) return 0; @@ -220,13 +211,129 @@ static bool dm_table_supports_zone_append(struct dm_table *t) return true; } +struct dm_device_zone_count { + sector_t start; + sector_t len; + unsigned int total_nr_seq_zones; + unsigned int target_nr_seq_zones; +}; + +/* + * Count the total number of and the number of mapped sequential zones of a + * target zoned device. 
+ */ +static int dm_device_count_zones_cb(struct blk_zone *zone, + unsigned int idx, void *data) +{ + struct dm_device_zone_count *zc = data; + + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { + zc->total_nr_seq_zones++; + if (zone->start >= zc->start && + zone->start < zc->start + zc->len) + zc->target_nr_seq_zones++; + } + + return 0; +} + +static int dm_device_count_zones(struct dm_dev *dev, + struct dm_device_zone_count *zc) +{ + int ret; + + ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, + dm_device_count_zones_cb, zc); + if (ret < 0) + return ret; + if (!ret) + return -EIO; + return 0; +} + +struct dm_zone_resource_limits { + unsigned int mapped_nr_seq_zones; + struct queue_limits *lim; + bool reliable_limits; +}; + +static int device_get_zone_resource_limits(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct dm_zone_resource_limits *zlim = data; + struct gendisk *disk = dev->bdev->bd_disk; + unsigned int max_open_zones, max_active_zones; + int ret; + struct dm_device_zone_count zc = { + .start = start, + .len = len, + }; + + /* + * If the target is not the whole device, the device zone resources may + * be shared between different targets. Check this by counting the + * number of mapped sequential zones: if this number is smaller than the + * total number of sequential zones of the target device, then resource + * sharing may happen and the zone limits will not be reliable. + */ + ret = dm_device_count_zones(dev, &zc); + if (ret) { + DMERR("Count %s zones failed %d", disk->disk_name, ret); + return ret; + } + + /* + * If the target does not map any sequential zones, then we do not need + * any zone resource limits. + */ + if (!zc.target_nr_seq_zones) + return 0; + + /* + * If the target does not map all sequential zones, the limits + * will not be reliable and we cannot use REQ_OP_ZONE_RESET_ALL. + */ + if (zc.target_nr_seq_zones < zc.total_nr_seq_zones) { + zlim->reliable_limits = false; + ti->zone_reset_all_supported = false; + } + + /* + * If the target maps less sequential zones than the limit values, then + * we do not have limits for this target. + */ + max_active_zones = disk->queue->limits.max_active_zones; + if (max_active_zones >= zc.target_nr_seq_zones) + max_active_zones = 0; + zlim->lim->max_active_zones = + min_not_zero(max_active_zones, zlim->lim->max_active_zones); + + max_open_zones = disk->queue->limits.max_open_zones; + if (max_open_zones >= zc.target_nr_seq_zones) + max_open_zones = 0; + zlim->lim->max_open_zones = + min_not_zero(max_open_zones, zlim->lim->max_open_zones); + + /* + * Also count the total number of sequential zones for the mapped + * device so that when we are done inspecting all its targets, we are + * able to check if the mapped device actually has any sequential zones. 
+ */ + zlim->mapped_nr_seq_zones += zc.target_nr_seq_zones; + + return 0; +} + int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *lim) { struct mapped_device *md = t->md; struct gendisk *disk = md->disk; - unsigned int nr_conv_zones = 0; - int ret; + struct dm_zone_resource_limits zlim = { + .reliable_limits = true, + .lim = lim, + }; /* * Check if zone append is natively supported, and if not, set the @@ -240,46 +347,63 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, lim->max_zone_append_sectors = 0; } - if (!get_capacity(md->disk)) - return 0; - /* - * Count conventional zones to check that the mapped device will indeed - * have sequential write required zones. + * Determine the max open and max active zone limits for the mapped + * device by inspecting the zone resource limits and the zones mapped + * by each target. */ - md->zone_revalidate_map = t; - ret = dm_blk_report_zones(disk, 0, UINT_MAX, - dm_check_zoned_cb, &nr_conv_zones); - md->zone_revalidate_map = NULL; - if (ret < 0) { - DMERR("Check zoned failed %d", ret); - return ret; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + /* + * Assume that the target can accept REQ_OP_ZONE_RESET_ALL. + * device_get_zone_resource_limits() may adjust this if one of + * the device used by the target does not have all its + * sequential write required zones mapped. + */ + ti->zone_reset_all_supported = true; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, + device_get_zone_resource_limits, &zlim)) { + DMERR("Could not determine %s zone resource limits", + disk->disk_name); + return -ENODEV; + } } /* - * If we only have conventional zones, expose the mapped device + * If we only have conventional zones mapped, expose the mapped device + * as a regular device. */ - if (nr_conv_zones >= ret) { + if (!zlim.mapped_nr_seq_zones) { lim->max_open_zones = 0; lim->max_active_zones = 0; - lim->zoned = false; + lim->max_zone_append_sectors = 0; + lim->zone_write_granularity = 0; + lim->chunk_sectors = 0; + lim->features &= ~BLK_FEAT_ZONED; clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + md->nr_zones = 0; disk->nr_zones = 0; return 0; } - if (!md->disk->nr_zones) { - DMINFO("%s using %s zone append", - md->disk->disk_name, - queue_emulates_zone_append(q) ? "emulated" : "native"); - } - - ret = dm_revalidate_zones(md, t); - if (ret < 0) - return ret; + /* + * Warn once (when the capacity is not yet set) if the mapped device is + * partially using zone resources of the target devices as that leads to + * unreliable limits, i.e. if another mapped device uses the same + * underlying devices, we cannot enforce zone limits to guarantee that + * writing will not lead to errors. Note that we really should return + * an error for such case but there is no easy way to find out if + * another mapped device uses the same underlying zoned devices.
+ */ + if (!get_capacity(disk) && !zlim.reliable_limits) + DMWARN("%s zone resource limits may be unreliable", + disk->disk_name); - if (!static_key_enabled(&zoned_enabled.key)) + if (lim->features & BLK_FEAT_ZONED && + !static_key_enabled(&zoned_enabled.key)) static_branch_enable(&zoned_enabled); return 0; } @@ -306,3 +430,39 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) return; } + +static int dm_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + /* + * For an all-zones reset, ignore conventional, empty, read-only + * and offline zones. + */ + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_READONLY: + case BLK_ZONE_COND_OFFLINE: + return 0; + default: + set_bit(idx, (unsigned long *)data); + return 0; + } +} + +int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, + sector_t sector, unsigned int nr_zones, + unsigned long *need_reset) +{ + int ret; + + ret = dm_blk_do_report_zones(md, t, sector, nr_zones, + dm_zone_need_reset_cb, need_reset); + if (ret != nr_zones) { + DMERR("Get %s zone reset bitmap failed\n", + md->disk->disk_name); + return -EIO; + } + + return 0; +} diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 12236e6f46f3..cd0ee144973f 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1009,7 +1009,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->max_sectors = chunk_sectors; /* We are exposing a drive-managed zoned block device */ - limits->zoned = false; + limits->features &= ~BLK_FEAT_ZONED; } /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 13037d6a6f62..4b1b69e576a5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1188,7 +1188,7 @@ static sector_t __max_io_len(struct dm_target *ti, sector_t sector, return len; return min_t(sector_t, len, min(max_sectors ? 
: queue_max_sectors(ti->table->md->queue), - blk_chunk_sectors_left(target_offset, max_granularity))); + blk_boundary_sectors_left(target_offset, max_granularity))); } static inline sector_t max_io_len(struct dm_target *ti, sector_t sector) @@ -1598,20 +1598,19 @@ static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti, static bool is_abnormal_io(struct bio *bio) { - enum req_op op = bio_op(bio); - - if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) { - switch (op) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - return true; - default: - break; - } + switch (bio_op(bio)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + case REQ_OP_FLUSH: + return false; + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + case REQ_OP_ZONE_RESET_ALL: + return true; + default: + return false; } - - return false; } static blk_status_t __process_abnormal_io(struct clone_info *ci, @@ -1776,6 +1775,119 @@ static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0); } + +static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci, + struct dm_target *ti) +{ + struct bio_list blist = BIO_EMPTY_LIST; + struct mapped_device *md = ci->io->md; + unsigned int zone_sectors = md->disk->queue->limits.chunk_sectors; + unsigned long *need_reset; + unsigned int i, nr_zones, nr_reset; + unsigned int num_bios = 0; + blk_status_t sts = BLK_STS_OK; + sector_t sector = ti->begin; + struct bio *clone; + int ret; + + nr_zones = ti->len >> ilog2(zone_sectors); + need_reset = bitmap_zalloc(nr_zones, GFP_NOIO); + if (!need_reset) + return BLK_STS_RESOURCE; + + ret = dm_zone_get_reset_bitmap(md, ci->map, ti->begin, + nr_zones, need_reset); + if (ret) { + sts = BLK_STS_IOERR; + goto free_bitmap; + } + + /* If we have no zone to reset, we are done. */ + nr_reset = bitmap_weight(need_reset, nr_zones); + if (!nr_reset) + goto free_bitmap; + + atomic_add(nr_zones, &ci->io->io_count); + + for (i = 0; i < nr_zones; i++) { + + if (!test_bit(i, need_reset)) { + sector += zone_sectors; + continue; + } + + if (bio_list_empty(&blist)) { + /* This may take a while, so be nice to others */ + if (num_bios) + cond_resched(); + + /* + * We may need to reset thousands of zones, so let's + * not go crazy with the clone allocation. + */ + alloc_multiple_bios(&blist, ci, ti, min(nr_reset, 32), + NULL, GFP_NOIO); + } + + /* Get a clone and change it to a regular reset operation. 
*/ + clone = bio_list_pop(&blist); + clone->bi_opf &= ~REQ_OP_MASK; + clone->bi_opf |= REQ_OP_ZONE_RESET | REQ_SYNC; + clone->bi_iter.bi_sector = sector; + clone->bi_iter.bi_size = 0; + __map_bio(clone); + + sector += zone_sectors; + num_bios++; + nr_reset--; + } + + WARN_ON_ONCE(!bio_list_empty(&blist)); + atomic_sub(nr_zones - num_bios, &ci->io->io_count); + ci->sector_count = 0; + +free_bitmap: + bitmap_free(need_reset); + + return sts; +} + +static void __send_zone_reset_all_native(struct clone_info *ci, + struct dm_target *ti) +{ + unsigned int bios; + + atomic_add(1, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, 1, NULL, GFP_NOIO); + atomic_sub(1 - bios, &ci->io->io_count); + + ci->sector_count = 0; +} + +static blk_status_t __send_zone_reset_all(struct clone_info *ci) +{ + struct dm_table *t = ci->map; + blk_status_t sts = BLK_STS_OK; + + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (ti->zone_reset_all_supported) { + __send_zone_reset_all_native(ci, ti); + continue; + } + + sts = __send_zone_reset_all_emulated(ci, ti); + if (sts != BLK_STS_OK) + break; + } + + /* Release the reference that alloc_io() took for submission. */ + atomic_sub(1, &ci->io->io_count); + + return sts; +} + #else static inline bool dm_zone_bio_needs_split(struct mapped_device *md, struct bio *bio) @@ -1786,6 +1898,10 @@ static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { return false; } +static blk_status_t __send_zone_reset_all(struct clone_info *ci) +{ + return BLK_STS_NOTSUPP; +} #endif /* @@ -1799,9 +1915,14 @@ static void dm_split_and_process_bio(struct mapped_device *md, blk_status_t error = BLK_STS_OK; bool is_abnormal, need_split; - need_split = is_abnormal = is_abnormal_io(bio); - if (static_branch_unlikely(&zoned_enabled)) - need_split = is_abnormal || dm_zone_bio_needs_split(md, bio); + is_abnormal = is_abnormal_io(bio); + if (static_branch_unlikely(&zoned_enabled)) { + /* Special case REQ_OP_ZONE_RESET_ALL as it cannot be split. 
*/ + need_split = (bio_op(bio) != REQ_OP_ZONE_RESET_ALL) && + (is_abnormal || dm_zone_bio_needs_split(md, bio)); + } else { + need_split = is_abnormal; + } if (unlikely(need_split)) { /* @@ -1842,6 +1963,12 @@ static void dm_split_and_process_bio(struct mapped_device *md, goto out; } + if (static_branch_unlikely(&zoned_enabled) && + (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) { + error = __send_zone_reset_all(&ci); + goto out; + } + error = __split_and_process_bio(&ci); if (error || !ci.sector_count) goto out; @@ -2386,22 +2513,15 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) struct table_device *td; int r; - switch (type) { - case DM_TYPE_REQUEST_BASED: + WARN_ON_ONCE(type == DM_TYPE_NONE); + + if (type == DM_TYPE_REQUEST_BASED) { md->disk->fops = &dm_rq_blk_dops; r = dm_mq_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based dm mapped device"); return r; } - break; - case DM_TYPE_BIO_BASED: - case DM_TYPE_DAX_BIO_BASED: - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, md->queue); - break; - case DM_TYPE_NONE: - WARN_ON_ONCE(true); - break; } r = dm_calculate_queue_limits(t, &limits); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 53ef8207fe2c..cc466ad5cb1d 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -103,12 +103,16 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); */ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *lim); +int dm_revalidate_zones(struct dm_table *t, struct request_queue *q); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_map_bio(struct dm_target_io *io); +int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, + sector_t sector, unsigned int nr_zones, + unsigned long *need_reset); #else #define dm_blk_report_zones NULL static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 0a2d37eb38ef..08232d8dc815 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -227,6 +227,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; + unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) << + PAGE_SHIFT; loff_t sboff, offset = mddev->bitmap_info.offset; sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; @@ -269,11 +271,9 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, if (size == 0) /* bitmap runs in to data */ return -EINVAL; - } else { - /* DATA METADATA BITMAP - no problems */ } - md_super_write(mddev, rdev, sboff + ps, (int) size, page); + md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page); return 0; } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 8e36a0feec09..139fe2019c1d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1570,7 +1570,7 @@ out: return err; } -static struct md_cluster_operations cluster_ops = { +static const struct md_cluster_operations cluster_ops = { .join = join, .leave = leave, .slot_number = slot_number, diff --git a/drivers/md/md.c b/drivers/md/md.c index aff9118ff697..64693913ed18 100644 --- a/drivers/md/md.c 
+++ b/drivers/md/md.c @@ -69,13 +69,23 @@ #include "md-bitmap.h" #include "md-cluster.h" +static const char *action_name[NR_SYNC_ACTIONS] = { + [ACTION_RESYNC] = "resync", + [ACTION_RECOVER] = "recover", + [ACTION_CHECK] = "check", + [ACTION_REPAIR] = "repair", + [ACTION_RESHAPE] = "reshape", + [ACTION_FROZEN] = "frozen", + [ACTION_IDLE] = "idle", +}; + /* pers_list is a list of registered personalities protected by pers_lock. */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); static const struct kobj_type md_ktype; -struct md_cluster_operations *md_cluster_ops; +const struct md_cluster_operations *md_cluster_ops; EXPORT_SYMBOL(md_cluster_ops); static struct module *md_cluster_mod; @@ -479,7 +489,6 @@ int mddev_suspend(struct mddev *mddev, bool interruptible) */ WRITE_ONCE(mddev->suspended, mddev->suspended + 1); - del_timer_sync(&mddev->safemode_timer); /* restrict memory reclaim I/O during raid array is suspend */ mddev->noio_flag = memalloc_noio_save(); @@ -550,13 +559,9 @@ static void md_end_flush(struct bio *bio) rdev_dec_pending(rdev, mddev); - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); - + if (atomic_dec_and_test(&mddev->flush_pending)) /* The pre-request flush has finished */ queue_work(md_wq, &mddev->flush_work); - } } static void md_submit_flush_data(struct work_struct *ws); @@ -587,12 +592,8 @@ static void submit_flushes(struct work_struct *ws) rcu_read_lock(); } rcu_read_unlock(); - if (atomic_dec_and_test(&mddev->flush_pending)) { - /* The pair is percpu_ref_get() from md_flush_request() */ - percpu_ref_put(&mddev->active_io); - + if (atomic_dec_and_test(&mddev->flush_pending)) queue_work(md_wq, &mddev->flush_work); - } } static void md_submit_flush_data(struct work_struct *ws) @@ -617,8 +618,20 @@ static void md_submit_flush_data(struct work_struct *ws) bio_endio(bio); } else { bio->bi_opf &= ~REQ_PREFLUSH; - md_handle_request(mddev, bio); + + /* + * make_request() will never return error here, it only + * returns error in raid5_make_request() by dm-raid. + * Since dm always splits data and flush operation into + * two separate io, io size of flush submitted by dm + * always is 0, make_request() will not be called here. + */ + if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) + bio_io_error(bio); } + + /* The pair is percpu_ref_get() from md_flush_request() */ + percpu_ref_put(&mddev->active_io); } /* @@ -654,24 +667,22 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio) WARN_ON(percpu_ref_is_zero(&mddev->active_io)); percpu_ref_get(&mddev->active_io); mddev->flush_bio = bio; - bio = NULL; - } - spin_unlock_irq(&mddev->lock); - - if (!bio) { + spin_unlock_irq(&mddev->lock); INIT_WORK(&mddev->flush_work, submit_flushes); queue_work(md_wq, &mddev->flush_work); - } else { - /* flush was performed for some other bio while we waited. */ - if (bio->bi_iter.bi_size == 0) - /* an empty barrier - all done */ - bio_endio(bio); - else { - bio->bi_opf &= ~REQ_PREFLUSH; - return false; - } + return true; } - return true; + + /* flush was performed for some other bio while we waited.
*/ + spin_unlock_irq(&mddev->lock); + if (bio->bi_iter.bi_size == 0) { + /* pure flush without data - all done */ + bio_endio(bio); + return true; + } + + bio->bi_opf &= ~REQ_PREFLUSH; + return false; } EXPORT_SYMBOL(md_flush_request); @@ -742,7 +753,6 @@ int mddev_init(struct mddev *mddev) mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); - mutex_init(&mddev->sync_mutex); mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); @@ -758,7 +768,7 @@ int mddev_init(struct mddev *mddev) init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; - mddev->last_sync_action = "none"; + mddev->last_sync_action = ACTION_IDLE; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; @@ -2410,36 +2420,10 @@ static LIST_HEAD(pending_raid_disks); */ int md_integrity_register(struct mddev *mddev) { - struct md_rdev *rdev, *reference = NULL; - if (list_empty(&mddev->disks)) return 0; /* nothing to do */ - if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk)) - return 0; /* shouldn't register, or already is */ - rdev_for_each(rdev, mddev) { - /* skip spares and non-functional disks */ - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->raid_disk < 0) - continue; - if (!reference) { - /* Use the first rdev as the reference */ - reference = rdev; - continue; - } - /* does this rdev's profile match the reference profile? */ - if (blk_integrity_compare(reference->bdev->bd_disk, - rdev->bdev->bd_disk) < 0) - return -EINVAL; - } - if (!reference || !bdev_get_integrity(reference->bdev)) - return 0; - /* - * All component devices are integrity capable and have matching - * profiles, register the common profile for the md device. - */ - blk_integrity_register(mddev->gendisk, - bdev_get_integrity(reference->bdev)); + if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || @@ -2459,32 +2443,6 @@ int md_integrity_register(struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_register); -/* - * Attempt to add an rdev, but only if it is consistent with the current - * integrity profile - */ -int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) -{ - struct blk_integrity *bi_mddev; - - if (mddev_is_dm(mddev)) - return 0; - - bi_mddev = blk_get_integrity(mddev->gendisk); - - if (!bi_mddev) /* nothing to do */ - return 0; - - if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { - pr_err("%s: incompatible integrity profile for %pg\n", - mdname(mddev), rdev->bdev); - return -ENXIO; - } - - return 0; -} -EXPORT_SYMBOL(md_integrity_add_rdev); - static bool rdev_read_only(struct md_rdev *rdev) { return bdev_read_only(rdev->bdev) || @@ -4867,30 +4825,81 @@ out_unlock: static struct md_sysfs_entry md_metadata = __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); -static ssize_t -action_show(struct mddev *mddev, char *page) +enum sync_action md_sync_action(struct mddev *mddev) { - char *type = "idle"; unsigned long recovery = mddev->recovery; + + /* + * frozen has the highest priority, means running sync_thread will be + * stopped immediately, and no new sync_thread can start. 
+ */ if (test_bit(MD_RECOVERY_FROZEN, &recovery)) - type = "frozen"; - else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) - type = "reshape"; - else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) - type = "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &recovery)) - type = "check"; - else - type = "repair"; - } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) - type = "recover"; - else if (mddev->reshape_position != MaxSector) - type = "reshape"; + return ACTION_FROZEN; + + /* + * read-only array can't register sync_thread, and it can only + * add/remove spares. + */ + if (!md_is_rdwr(mddev)) + return ACTION_IDLE; + + /* + * idle means no sync_thread is running, and no new sync_thread is + * requested. + */ + if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && + !test_bit(MD_RECOVERY_NEEDED, &recovery)) + return ACTION_IDLE; + + if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || + mddev->reshape_position != MaxSector) + return ACTION_RESHAPE; + + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + return ACTION_RECOVER; + + if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + /* + * MD_RECOVERY_CHECK must be paired with + * MD_RECOVERY_REQUESTED. + */ + if (test_bit(MD_RECOVERY_CHECK, &recovery)) + return ACTION_CHECK; + if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) + return ACTION_REPAIR; + return ACTION_RESYNC; } - return sprintf(page, "%s\n", type); + + /* + * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no + * sync_action is specified. + */ + return ACTION_IDLE; +} + +enum sync_action md_sync_action_by_name(const char *page) +{ + enum sync_action action; + + for (action = 0; action < NR_SYNC_ACTIONS; ++action) { + if (cmd_match(page, action_name[action])) + return action; + } + + return NR_SYNC_ACTIONS; +} + +const char *md_sync_action_name(enum sync_action action) +{ + return action_name[action]; +} + +static ssize_t +action_show(struct mddev *mddev, char *page) +{ + enum sync_action action = md_sync_action(mddev); + + return sprintf(page, "%s\n", md_sync_action_name(action)); } /** @@ -4899,15 +4908,10 @@ action_show(struct mddev *mddev, char *page) * @locked: if set, reconfig_mutex will still be held after this function * return; if not set, reconfig_mutex will be released after this * function return. - * @check_seq: if set, only wait for curent running sync_thread to stop, noted - * that new sync_thread can still start. 
*/ -static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) +static void stop_sync_thread(struct mddev *mddev, bool locked) { - int sync_seq; - - if (check_seq) - sync_seq = atomic_read(&mddev->sync_seq); + int sync_seq = atomic_read(&mddev->sync_seq); if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { if (!locked) @@ -4928,7 +4932,8 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (check_seq && sync_seq != atomic_read(&mddev->sync_seq))); + (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && + sync_seq != atomic_read(&mddev->sync_seq))); if (locked) mddev_lock_nointr(mddev); @@ -4939,7 +4944,7 @@ void md_idle_sync_thread(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, true); + stop_sync_thread(mddev, true); } EXPORT_SYMBOL_GPL(md_idle_sync_thread); @@ -4948,7 +4953,7 @@ void md_frozen_sync_thread(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); } EXPORT_SYMBOL_GPL(md_frozen_sync_thread); @@ -4963,100 +4968,127 @@ void md_unfrozen_sync_thread(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); -static void idle_sync_thread(struct mddev *mddev) +static int mddev_start_reshape(struct mddev *mddev) { - mutex_lock(&mddev->sync_mutex); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - - if (mddev_lock(mddev)) { - mutex_unlock(&mddev->sync_mutex); - return; - } - - stop_sync_thread(mddev, false, true); - mutex_unlock(&mddev->sync_mutex); -} + int ret; -static void frozen_sync_thread(struct mddev *mddev) -{ - mutex_lock(&mddev->sync_mutex); - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (mddev->pers->start_reshape == NULL) + return -EINVAL; - if (mddev_lock(mddev)) { - mutex_unlock(&mddev->sync_mutex); - return; + if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = mddev->pers->start_reshape(mddev); + if (ret) + return ret; + } else { + /* + * If reshape is still in progress, and md_check_recovery() can + * continue to reshape, don't restart reshape because data can + * be corrupted for raid456. 
+ */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, false, false); - mutex_unlock(&mddev->sync_mutex); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); + return 0; } static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { + int ret; + enum sync_action action; + if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; +retry: + if (work_busy(&mddev->sync_work)) + flush_work(&mddev->sync_work); - if (cmd_match(page, "idle")) - idle_sync_thread(mddev); - else if (cmd_match(page, "frozen")) - frozen_sync_thread(mddev); - else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; - else if (cmd_match(page, "resync")) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else if (cmd_match(page, "recover")) { - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if (cmd_match(page, "reshape")) { - int err; - if (mddev->pers->start_reshape == NULL) - return -EINVAL; - err = mddev_lock(mddev); - if (!err) { - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - err = -EBUSY; - } else if (mddev->reshape_position == MaxSector || - mddev->pers->check_reshape == NULL || - mddev->pers->check_reshape(mddev)) { - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - err = mddev->pers->start_reshape(mddev); - } else { - /* - * If reshape is still in progress, and - * md_check_recovery() can continue to reshape, - * don't restart reshape because data can be - * corrupted for raid456. - */ - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } - mddev_unlock(mddev); + ret = mddev_lock(mddev); + if (ret) + return ret; + + if (work_busy(&mddev->sync_work)) { + mddev_unlock(mddev); + goto retry; + } + + action = md_sync_action_by_name(page); + + /* TODO: mdadm rely on "idle" to start sync_thread. 
*/ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + switch (action) { + case ACTION_FROZEN: + md_frozen_sync_thread(mddev); + ret = len; + goto out; + case ACTION_IDLE: + md_idle_sync_thread(mddev); + break; + case ACTION_RESHAPE: + case ACTION_RECOVER: + case ACTION_CHECK: + case ACTION_REPAIR: + case ACTION_RESYNC: + ret = -EBUSY; + goto out; + default: + ret = -EINVAL; + goto out; } - if (err) - return err; - sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { - if (cmd_match(page, "check")) + switch (action) { + case ACTION_FROZEN: + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = len; + goto out; + case ACTION_RESHAPE: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + ret = mddev_start_reshape(mddev); + if (ret) + goto out; + break; + case ACTION_RECOVER: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + break; + case ACTION_CHECK: set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (!cmd_match(page, "repair")) - return -EINVAL; - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + fallthrough; + case ACTION_REPAIR: + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + fallthrough; + case ACTION_RESYNC: + case ACTION_IDLE: + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + break; + default: + ret = -EINVAL; + goto out; + } } + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ - flush_work(&mddev->sync_work); mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); - return len; + ret = len; + +out: + mddev_unlock(mddev); + return ret; } static struct md_sysfs_entry md_scan_mode = @@ -5065,7 +5097,8 @@ __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static ssize_t last_sync_action_show(struct mddev *mddev, char *page) { - return sprintf(page, "%s\n", mddev->last_sync_action); + return sprintf(page, "%s\n", + md_sync_action_name(mddev->last_sync_action)); } static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); @@ -5755,14 +5788,20 @@ static const struct kobj_type md_ktype = { int mdp_major = 0; /* stack the limit for all rdevs into lim */ -void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim) +int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, + unsigned int flags) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) { queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); + if ((flags & MDDEV_STACK_INTEGRITY) && + !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) + return -EINVAL; } + + return 0; } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); @@ -5777,6 +5816,14 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) lim = queue_limits_start_update(mddev->gendisk->queue); queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); + + if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { + pr_err("%s: incompatible integrity profile for %pg\n", + mdname(mddev), rdev->bdev); + queue_limits_cancel_update(mddev->gendisk->queue); + return -ENXIO; + } + return queue_limits_commit_update(mddev->gendisk->queue, &lim); } EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); @@ -5806,6 +5853,14 @@ static void 
mddev_delayed_delete(struct work_struct *ws) kobject_put(&mddev->kobj); } +void md_init_stacking_limits(struct queue_limits *lim) +{ + blk_set_stacking_limits(lim); + lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | + BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; +} +EXPORT_SYMBOL_GPL(md_init_stacking_limits); + struct mddev *md_alloc(dev_t dev, char *name) { /* @@ -5823,7 +5878,7 @@ struct mddev *md_alloc(dev_t dev, char *name) int partitioned; int shift; int unit; - int error ; + int error; /* * Wait for any previous instance of this device to be completely @@ -5881,7 +5936,6 @@ struct mddev *md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; - blk_queue_write_cache(disk->queue, true, true); disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); @@ -6185,28 +6239,6 @@ int md_run(struct mddev *mddev) } } - if (!mddev_is_dm(mddev)) { - struct request_queue *q = mddev->gendisk->queue; - bool nonrot = true; - - rdev_for_each(rdev, mddev) { - if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { - nonrot = false; - break; - } - } - if (mddev->degraded) - nonrot = false; - if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); - - /* Set the NOWAIT flags if all underlying devices support it */ - if (nowait) - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); - } if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) @@ -6437,7 +6469,7 @@ void md_stop_writes(struct mddev *mddev) { mddev_lock_nointr(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); __md_stop_writes(mddev); mddev_unlock(mddev); } @@ -6505,7 +6537,7 @@ static int md_set_readonly(struct mddev *mddev) set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, false, false); + stop_sync_thread(mddev, false); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); @@ -6551,7 +6583,7 @@ static int do_md_stop(struct mddev *mddev, int mode) set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - stop_sync_thread(mddev, true, false); + stop_sync_thread(mddev, true); if (mddev->sysfs_active || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { @@ -7166,15 +7198,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) if (!mddev->thread) md_update_sb(mddev, 1); /* - * If the new disk does not support REQ_NOWAIT, - * disable on the whole MD. - */ - if (!bdev_nowait(rdev->bdev)) { - pr_info("%s: Disabling nowait because %pg does not support nowait\n", - mdname(mddev), rdev->bdev); - blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue); - } - /* * Kick recovery, maybe this spare has to be added to the * array immediately. 
*/ @@ -7742,12 +7765,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, return get_bitmap_file(mddev, argp); } - if (cmd == HOT_REMOVE_DISK) - /* need to ensure recovery thread has run */ - wait_event_interruptible_timeout(mddev->sb_wait, - !test_bit(MD_RECOVERY_NEEDED, - &mddev->recovery), - msecs_to_jiffies(5000)); if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { /* Need to flush page cache, and ensure no-one else opens * and writes @@ -8520,7 +8537,7 @@ int unregister_md_personality(struct md_personality *p) } EXPORT_SYMBOL(unregister_md_personality); -int register_md_cluster_operations(struct md_cluster_operations *ops, +int register_md_cluster_operations(const struct md_cluster_operations *ops, struct module *module) { int ret = 0; @@ -8641,12 +8658,12 @@ EXPORT_SYMBOL(md_done_sync); * A return value of 'false' means that the write wasn't recorded * and cannot proceed as the array is being suspend. */ -bool md_write_start(struct mddev *mddev, struct bio *bi) +void md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) - return true; + return; BUG_ON(mddev->ro == MD_RDONLY); if (mddev->ro == MD_AUTO_READ) { @@ -8679,15 +8696,9 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); if (!mddev->has_superblocks) - return true; + return; wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || - is_md_suspended(mddev)); - if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - percpu_ref_put(&mddev->writes_pending); - return false; - } - return true; + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } EXPORT_SYMBOL(md_write_start); @@ -8835,6 +8846,77 @@ void md_allow_write(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_allow_write); +static sector_t md_sync_max_sectors(struct mddev *mddev, + enum sync_action action) +{ + switch (action) { + case ACTION_RESYNC: + case ACTION_CHECK: + case ACTION_REPAIR: + atomic64_set(&mddev->resync_mismatches, 0); + fallthrough; + case ACTION_RESHAPE: + return mddev->resync_max_sectors; + case ACTION_RECOVER: + return mddev->dev_sectors; + default: + return 0; + } +} + +static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) +{ + sector_t start = 0; + struct md_rdev *rdev; + + switch (action) { + case ACTION_CHECK: + case ACTION_REPAIR: + return mddev->resync_min; + case ACTION_RESYNC: + if (!mddev->bitmap) + return mddev->recovery_cp; + return 0; + case ACTION_RESHAPE: + /* + * If the original node aborts reshaping then we continue the + * reshaping, so set again to avoid restart reshape from the + * first beginning + */ + if (mddev_is_clustered(mddev) && + mddev->reshape_position != MaxSector) + return mddev->reshape_position; + return 0; + case ACTION_RECOVER: + start = MaxSector; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < start) + start = rdev->recovery_offset; + rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. 
+ */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return start; + default: + return MaxSector; + } +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) @@ -8851,7 +8933,8 @@ void md_do_sync(struct md_thread *thread) sector_t last_check; int skipped = 0; struct md_rdev *rdev; - char *desc, *action = NULL; + enum sync_action action; + const char *desc; struct blk_plug plug; int ret; @@ -8882,21 +8965,9 @@ void md_do_sync(struct md_thread *thread) goto skip; } - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { - desc = "data-check"; - action = "check"; - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - desc = "requested-resync"; - action = "repair"; - } else - desc = "resync"; - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - desc = "reshape"; - else - desc = "recovery"; - - mddev->last_sync_action = action ?: desc; + action = md_sync_action(mddev); + desc = md_sync_action_name(action); + mddev->last_sync_action = action; /* * Before starting a resync we must have set curr_resync to @@ -8964,56 +9035,8 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&all_mddevs_lock); } while (mddev->curr_resync < MD_RESYNC_DELAYED); - j = 0; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - /* resync follows the size requested by the personality, - * which defaults to physical size, but can be virtual size - */ - max_sectors = mddev->resync_max_sectors; - atomic64_set(&mddev->resync_mismatches, 0); - /* we don't use the checkpoint if there's a bitmap */ - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - j = mddev->resync_min; - else if (!mddev->bitmap) - j = mddev->recovery_cp; - - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { - max_sectors = mddev->resync_max_sectors; - /* - * If the original node aborts reshaping then we continue the - * reshaping, so set j again to avoid restart reshape from the - * first beginning - */ - if (mddev_is_clustered(mddev) && - mddev->reshape_position != MaxSector) - j = mddev->reshape_position; - } else { - /* recovery follows the physical size of devices */ - max_sectors = mddev->dev_sectors; - j = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < j) - j = rdev->recovery_offset; - rcu_read_unlock(); - - /* If there is a bitmap, we need to make sure all - * writes that started before we added a spare - * complete before we start doing a recovery. - * Otherwise the write might complete and (via - * bitmap_endwrite) set a bit in the bitmap after the - * recovery has checked that bit and skipped that - * region. 
- */ - if (mddev->bitmap) { - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } - } + max_sectors = md_sync_max_sectors(mddev, action); + j = md_sync_position(mddev, action); pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); @@ -9095,7 +9118,8 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; - sectors = mddev->pers->sync_request(mddev, j, &skipped); + sectors = mddev->pers->sync_request(mddev, j, max_sectors, + &skipped); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); break; @@ -9185,7 +9209,7 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } - mddev->pers->sync_request(mddev, max_sectors, &skipped); + mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > MD_RESYNC_ACTIVE) { diff --git a/drivers/md/md.h b/drivers/md/md.h index ca085ecad504..a0d6827dced9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -34,6 +34,61 @@ */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) +/* Status of sync thread. */ +enum sync_action { + /* + * Represent by MD_RECOVERY_SYNC, start when: + * 1) after assemble, sync data from first rdev to other copies, this + * must be done first before other sync actions and will only execute + * once; + * 2) resize the array(notice that this is not reshape), sync data for + * the new range; + */ + ACTION_RESYNC, + /* + * Represent by MD_RECOVERY_RECOVER, start when: + * 1) for new replacement, sync data based on the replace rdev or + * available copies from other rdev; + * 2) for new member disk while the array is degraded, sync data from + * other rdev; + * 3) reassemble after power failure or re-add a hot removed rdev, sync + * data from first rdev to other copies based on bitmap; + */ + ACTION_RECOVER, + /* + * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED | + * MD_RECOVERY_CHECK, start when user echo "check" to sysfs api + * sync_action, used to check if data copies from differenct rdev are + * the same. The number of mismatch sectors will be exported to user + * by sysfs api mismatch_cnt; + */ + ACTION_CHECK, + /* + * Represent by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, start when + * user echo "repair" to sysfs api sync_action, usually paired with + * ACTION_CHECK, used to force syncing data once user found that there + * are inconsistent data, + */ + ACTION_REPAIR, + /* + * Represent by MD_RECOVERY_RESHAPE, start when new member disk is added + * to the conf, notice that this is different from spares or + * replacement; + */ + ACTION_RESHAPE, + /* + * Represent by MD_RECOVERY_FROZEN, can be set by sysfs api sync_action + * or internal usage like setting the array read-only, will forbid above + * actions. + */ + ACTION_FROZEN, + /* + * All above actions don't match. + */ + ACTION_IDLE, + NR_SYNC_ACTIONS, +}; + /* * The struct embedded in rdev is used to serialize IO. */ @@ -371,13 +426,12 @@ struct mddev { struct md_thread __rcu *thread; /* management thread */ struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ - /* 'last_sync_action' is initialized to "none". It is set when a - * sync operation (i.e "data-check", "requested-resync", "resync", - * "recovery", or "reshape") is started. It holds this value even + /* + * Set when a sync operation is started. 
It holds this value even * when the sync thread is "frozen" (interrupted) or "idle" (stopped - * or finished). It is overwritten when a new sync operation is begun. + * or finished). It is overwritten when a new sync operation is begun. */ - char *last_sync_action; + enum sync_action last_sync_action; sector_t curr_resync; /* last block scheduled */ /* As resync requests can complete out of order, we cannot easily track * how much resync has been completed. So we occasionally pause until @@ -540,8 +594,6 @@ struct mddev { */ struct list_head deleting; - /* Used to synchronize idle and frozen for action_store() */ - struct mutex sync_mutex; /* The sequence number for sync thread */ atomic_t sync_seq; @@ -551,22 +603,46 @@ struct mddev { }; enum recovery_flags { + /* flags for sync thread running status */ + + /* + * set when one of sync action is set and new sync thread need to be + * registered, or just add/remove spares from conf. + */ + MD_RECOVERY_NEEDED, + /* sync thread is running, or about to be started */ + MD_RECOVERY_RUNNING, + /* sync thread needs to be aborted for some reason */ + MD_RECOVERY_INTR, + /* sync thread is done and is waiting to be unregistered */ + MD_RECOVERY_DONE, + /* running sync thread must abort immediately, and not restart */ + MD_RECOVERY_FROZEN, + /* waiting for pers->start() to finish */ + MD_RECOVERY_WAIT, + /* interrupted because io-error */ + MD_RECOVERY_ERROR, + + /* flags determines sync action, see details in enum sync_action */ + + /* if just this flag is set, action is resync. */ + MD_RECOVERY_SYNC, + /* + * paired with MD_RECOVERY_SYNC, if MD_RECOVERY_CHECK is not set, + * action is repair, means user requested resync. + */ + MD_RECOVERY_REQUESTED, /* - * If neither SYNC or RESHAPE are set, then it is a recovery. + * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is + * check. */ - MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */ - MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */ - MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. 
*/ - MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */ - MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */ - MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */ - MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */ - MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */ - MD_RECOVERY_RESHAPE, /* A reshape is happening */ - MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ - MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ - MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */ - MD_RESYNCING_REMOTE, /* remote node is running resync thread */ + MD_RECOVERY_CHECK, + /* recovery, or need to try it */ + MD_RECOVERY_RECOVER, + /* reshape */ + MD_RECOVERY_RESHAPE, + /* remote node is running resync thread */ + MD_RESYNCING_REMOTE, }; enum md_ro_state { @@ -653,7 +729,8 @@ struct md_personality int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*spare_active) (struct mddev *mddev); - sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped); + sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, + sector_t max_sector, int *skipped); int (*resize) (struct mddev *mddev, sector_t sectors); sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); int (*check_reshape) (struct mddev *mddev); @@ -772,7 +849,7 @@ static inline void safe_put_page(struct page *p) extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); -extern int register_md_cluster_operations(struct md_cluster_operations *ops, +extern int register_md_cluster_operations(const struct md_cluster_operations *ops, struct module *module); extern int unregister_md_cluster_operations(void); extern int md_setup_cluster(struct mddev *mddev, int nodes); @@ -785,7 +862,10 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); -extern bool md_write_start(struct mddev *mddev, struct bio *bi); +extern enum sync_action md_sync_action(struct mddev *mddev); +extern enum sync_action md_sync_action_by_name(const char *page); +extern const char *md_sync_action_name(enum sync_action action); +extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); @@ -809,11 +889,11 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); -extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern int mddev_init(struct mddev *mddev); extern void mddev_destroy(struct mddev *mddev); +void md_init_stacking_limits(struct queue_limits *lim); struct mddev *md_alloc(dev_t dev, char *name); void mddev_put(struct mddev *mddev); extern int md_run(struct mddev *mddev); @@ -852,7 +932,7 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct 
mddev *mddev) } } -extern struct md_cluster_operations *md_cluster_ops; +extern const struct md_cluster_operations *md_cluster_ops; static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; @@ -908,7 +988,9 @@ void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); int do_md_run(struct mddev *mddev); -void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim); +#define MDDEV_STACK_INTEGRITY (1u << 0) +int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, + unsigned int flags); int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c5d4aeb68404..32d587524778 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -365,30 +365,30 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } -static void free_conf(struct mddev *mddev, struct r0conf *conf) -{ - kfree(conf->strip_zone); - kfree(conf->devlist); - kfree(conf); -} - static void raid0_free(struct mddev *mddev, void *priv) { struct r0conf *conf = priv; - free_conf(mddev, conf); + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); } static int raid0_set_limits(struct mddev *mddev) { struct queue_limits lim; + int err; - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } @@ -415,7 +415,7 @@ static int raid0_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid0_set_limits(mddev); if (ret) - goto out_free_conf; + return ret; } /* calculate array device size */ @@ -427,13 +427,7 @@ static int raid0_run(struct mddev *mddev) dump_zones(mddev); - ret = md_integrity_register(mddev); - if (ret) - goto out_free_conf; - return 0; -out_free_conf: - free_conf(mddev, conf); - return ret; + return md_integrity_register(mddev); } /* diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7b8a71ca66dd..04a0c2ca1732 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1687,8 +1687,7 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio) if (bio_data_dir(bio) == READ) raid1_read_request(mddev, bio, sectors, NULL); else { - if (!md_write_start(mddev,bio)) - return false; + md_write_start(mddev,bio); raid1_write_request(mddev, bio, sectors); } return true; @@ -1907,9 +1906,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; - if (md_integrity_add_rdev(rdev, mddev)) - return -ENXIO; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -2757,12 +2753,12 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) */ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped) + sector_t max_sector, int *skipped) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; struct bio *bio; - sector_t max_sector, 
nr_sectors; + sector_t nr_sectors; int disk = -1; int i; int wonly = -1; @@ -2778,7 +2774,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (init_resync(conf)) return 0; - max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { /* If we aborted, we need to abort the * sync on the 'current' bitmap chunk (there will @@ -3197,14 +3192,18 @@ static struct r1conf *setup_conf(struct mddev *mddev) static int raid1_set_limits(struct mddev *mddev) { struct queue_limits lim; + int err; - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } -static void raid1_free(struct mddev *mddev, void *priv); static int raid1_run(struct mddev *mddev) { struct r1conf *conf; @@ -3238,7 +3237,7 @@ static int raid1_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid1_set_limits(mddev); if (ret) - goto abort; + return ret; } mddev->degraded = 0; @@ -3252,8 +3251,7 @@ static int raid1_run(struct mddev *mddev) */ if (conf->raid_disks - mddev->degraded < 1) { md_unregister_thread(mddev, &conf->thread); - ret = -EINVAL; - goto abort; + return -EINVAL; } if (conf->raid_disks - mddev->degraded == 1) @@ -3277,14 +3275,8 @@ static int raid1_run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); ret = md_integrity_register(mddev); - if (ret) { + if (ret) md_unregister_thread(mddev, &mddev->thread); - goto abort; - } - return 0; - -abort: - raid1_free(mddev, conf); return ret; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a4556d2e46bf..2a9c4ee982e0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1836,8 +1836,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) && md_flush_request(mddev, bio)) return true; - if (!md_write_start(mddev, bio)) - return false; + md_write_start(mddev, bio); if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) if (!raid10_handle_discard(mddev, bio)) @@ -2083,9 +2082,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; - if (md_integrity_add_rdev(rdev, mddev)) - return -ENXIO; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -3140,12 +3136,12 @@ static void raid10_set_cluster_sync_high(struct r10conf *conf) */ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped) + sector_t max_sector, int *skipped) { struct r10conf *conf = mddev->private; struct r10bio *r10_bio; struct bio *biolist = NULL, *bio; - sector_t max_sector, nr_sectors; + sector_t nr_sectors; int i; int max_sync; sector_t sync_blocks; @@ -3175,10 +3171,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, return 0; skipped: - max_sector = mddev->dev_sectors; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || - test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { conf->cluster_sync_low = 0; conf->cluster_sync_high = 0; @@ -3980,12 +3972,17 @@ static int raid10_set_queue_limits(struct mddev *mddev) { struct r10conf *conf = mddev->private; struct queue_limits lim; + int err; - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; lim.io_min = 
mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2bd1ce9b3922..c14cf2410365 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -155,7 +155,7 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, return slot; } -static void print_raid5_conf (struct r5conf *conf); +static void print_raid5_conf(struct r5conf *conf); static int stripe_operations_active(struct stripe_head *sh) { @@ -5899,6 +5899,39 @@ out: return ret; } +enum reshape_loc { + LOC_NO_RESHAPE, + LOC_AHEAD_OF_RESHAPE, + LOC_INSIDE_RESHAPE, + LOC_BEHIND_RESHAPE, +}; + +static enum reshape_loc get_reshape_loc(struct mddev *mddev, + struct r5conf *conf, sector_t logical_sector) +{ + sector_t reshape_progress, reshape_safe; + /* + * Spinlock is needed as reshape_progress may be + * 64bit on a 32bit platform, and so it might be + * possible to see a half-updated value + * Of course reshape_progress could change after + * the lock is dropped, so once we get a reference + * to the stripe that we think it is, we will have + * to check again. + */ + spin_lock_irq(&conf->device_lock); + reshape_progress = conf->reshape_progress; + reshape_safe = conf->reshape_safe; + spin_unlock_irq(&conf->device_lock); + if (reshape_progress == MaxSector) + return LOC_NO_RESHAPE; + if (ahead_of_reshape(mddev, logical_sector, reshape_progress)) + return LOC_AHEAD_OF_RESHAPE; + if (ahead_of_reshape(mddev, logical_sector, reshape_safe)) + return LOC_INSIDE_RESHAPE; + return LOC_BEHIND_RESHAPE; +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5913,28 +5946,14 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, seq = read_seqcount_begin(&conf->gen_lock); if (unlikely(conf->reshape_progress != MaxSector)) { - /* - * Spinlock is needed as reshape_progress may be - * 64bit on a 32bit platform, and so it might be - * possible to see a half-updated value - * Of course reshape_progress could change after - * the lock is dropped, so once we get a reference - * to the stripe that we think it is, we will have - * to check again. 
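The raid0/raid1/raid10 *_set_limits() hunks above share one sequence: seed the limits with md's stacking defaults, fill in the personality-specific values, stack in the member rdevs (here with integrity stacking requested), and on failure call queue_limits_cancel_update() on the disk queue before returning. A condensed sketch of that flow, using a made-up raidx_set_limits() purely for illustration:

/* Illustrative only: the common limits-setup flow used by the conversions above. */
static int raidx_set_limits(struct mddev *mddev)
{
	struct queue_limits lim;
	int err;

	md_init_stacking_limits(&lim);			/* md-wide stacking defaults */
	lim.io_min = mddev->chunk_sectors << 9;		/* personality-specific tuning */

	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
	if (err) {
		/* same error handling as the hunks above */
		queue_limits_cancel_update(mddev->gendisk->queue);
		return err;
	}
	return queue_limits_set(mddev->gendisk->queue, &lim);
}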
- */ - spin_lock_irq(&conf->device_lock); - if (ahead_of_reshape(mddev, logical_sector, - conf->reshape_progress)) { - previous = 1; - } else { - if (ahead_of_reshape(mddev, logical_sector, - conf->reshape_safe)) { - spin_unlock_irq(&conf->device_lock); - ret = STRIPE_SCHEDULE_AND_RETRY; - goto out; - } + enum reshape_loc loc = get_reshape_loc(mddev, conf, + logical_sector); + if (loc == LOC_INSIDE_RESHAPE) { + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } - spin_unlock_irq(&conf->device_lock); + if (loc == LOC_AHEAD_OF_RESHAPE) + previous = 1; } new_sector = raid5_compute_sector(conf, logical_sector, previous, @@ -6078,8 +6097,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; } - if (!md_write_start(mddev, bi)) - return false; + md_write_start(mddev, bi); /* * If array is degraded, better not do chunk aligned read because * later we might have to read it again in order to reconstruct @@ -6113,8 +6131,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && (conf->reshape_progress != MaxSector) && - !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) && - ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { + get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) { bio_wouldblock_error(bi); if (rw == WRITE) md_write_end(mddev); @@ -6255,7 +6272,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->reshape_backwards) { - BUG_ON(writepos < reshape_sectors); + if (WARN_ON(writepos < reshape_sectors)) + return MaxSector; + writepos -= reshape_sectors; readpos += reshape_sectors; safepos += reshape_sectors; @@ -6273,14 +6292,18 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * to set 'stripe_addr' which is where we will write to. 
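get_reshape_loc() folds the previous pair of ahead_of_reshape() tests under device_lock into a single classification, so callers can branch on where a logical sector sits relative to the reshape window. The sketch below only restates the two call sites converted above; the helper name is made up.

/*
 * Illustrative only: branching on the new classification, condensed from
 * make_stripe_request() and raid5_make_request() above.
 */
static enum stripe_result demo_handle_reshape(struct mddev *mddev,
					      struct r5conf *conf,
					      sector_t logical_sector,
					      int *previous)
{
	switch (get_reshape_loc(mddev, conf, logical_sector)) {
	case LOC_INSIDE_RESHAPE:
		/* the target stripes are being relocated right now: retry later */
		return STRIPE_SCHEDULE_AND_RETRY;
	case LOC_AHEAD_OF_RESHAPE:
		/* not reshaped yet: compute the sector with the old geometry */
		*previous = 1;
		return STRIPE_SUCCESS;
	default:
		/* LOC_BEHIND_RESHAPE or LOC_NO_RESHAPE: the new geometry applies */
		return STRIPE_SUCCESS;
	}
}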
*/ if (mddev->reshape_backwards) { - BUG_ON(conf->reshape_progress == 0); + if (WARN_ON(conf->reshape_progress == 0)) + return MaxSector; + stripe_addr = writepos; - BUG_ON((mddev->dev_sectors & - ~((sector_t)reshape_sectors - 1)) - - reshape_sectors - stripe_addr - != sector_nr); + if (WARN_ON((mddev->dev_sectors & + ~((sector_t)reshape_sectors - 1)) - + reshape_sectors - stripe_addr != sector_nr)) + return MaxSector; } else { - BUG_ON(writepos != sector_nr + reshape_sectors); + if (WARN_ON(writepos != sector_nr + reshape_sectors)) + return MaxSector; + stripe_addr = sector_nr; } @@ -6458,11 +6481,10 @@ ret: } static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped) + sector_t max_sector, int *skipped) { struct r5conf *conf = mddev->private; struct stripe_head *sh; - sector_t max_sector = mddev->dev_sectors; sector_t sync_blocks; int still_degraded = 0; int i; @@ -7082,12 +7104,14 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) err = -ENODEV; else if (new != conf->skip_copy) { struct request_queue *q = mddev->gendisk->queue; + struct queue_limits lim = queue_limits_start_update(q); conf->skip_copy = new; if (new) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); + lim.features |= BLK_FEAT_STABLE_WRITES; else - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); + lim.features &= ~BLK_FEAT_STABLE_WRITES; + err = queue_limits_commit_update(q, &lim); } mddev_unlock_and_resume(mddev); return err ?: len; @@ -7562,11 +7586,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (test_bit(Replacement, &rdev->flags)) { if (disk->replacement) goto abort; - RCU_INIT_POINTER(disk->replacement, rdev); + disk->replacement = rdev; } else { if (disk->rdev) goto abort; - RCU_INIT_POINTER(disk->rdev, rdev); + disk->rdev = rdev; } if (test_bit(In_sync, &rdev->flags)) { @@ -7702,13 +7726,13 @@ static int raid5_set_limits(struct mddev *mddev) */ stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); - blk_set_stacking_limits(&lim); + md_init_stacking_limits(&lim); lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); - lim.raid_partial_stripes_expensive = 1; + lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; - mddev_stack_rdev_limits(mddev, &lim); + mddev_stack_rdev_limits(mddev, &lim, 0); rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, mddev->gendisk->disk_name); @@ -8048,7 +8072,7 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) seq_printf (seq, "]"); } -static void print_raid5_conf (struct r5conf *conf) +static void print_raid5_conf(struct r5conf *conf) { struct md_rdev *rdev; int i; @@ -8062,15 +8086,13 @@ static void print_raid5_conf (struct r5conf *conf) conf->raid_disks, conf->raid_disks - conf->mddev->degraded); - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - rdev = rcu_dereference(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; if (rdev) pr_debug(" disk %d, o:%d, dev:%pg\n", i, !test_bit(Faulty, &rdev->flags), rdev->bdev); } - rcu_read_unlock(); } static int raid5_spare_active(struct mddev *mddev) diff --git a/drivers/media/pci/intel/ipu6/ipu6-isys-video.c b/drivers/media/pci/intel/ipu6/ipu6-isys-video.c index c8a33e1e910c..06090cc0a476 100644 --- a/drivers/media/pci/intel/ipu6/ipu6-isys-video.c +++ b/drivers/media/pci/intel/ipu6/ipu6-isys-video.c @@ -943,7 +943,7 @@ 
ipu6_isys_query_stream_by_source(struct ipu6_isys *isys, int source, u8 vc) return NULL; if (source < 0) { - dev_err(&stream->isys->adev->auxdev.dev, + dev_err(&isys->adev->auxdev.dev, "query stream with invalid port number\n"); return NULL; } diff --git a/drivers/media/pci/intel/ipu6/ipu6-isys.c b/drivers/media/pci/intel/ipu6/ipu6-isys.c index 8b9b77719bb1..c4aff2e2009b 100644 --- a/drivers/media/pci/intel/ipu6/ipu6-isys.c +++ b/drivers/media/pci/intel/ipu6/ipu6-isys.c @@ -799,7 +799,7 @@ static int isys_register_devices(struct ipu6_isys *isys) isys->v4l2_dev.mdev = &isys->media_dev; isys->v4l2_dev.ctrl_handler = NULL; - ret = v4l2_device_register(&pdev->dev, &isys->v4l2_dev); + ret = v4l2_device_register(dev, &isys->v4l2_dev); if (ret < 0) goto out_media_device_unregister; diff --git a/drivers/media/pci/intel/ivsc/Kconfig b/drivers/media/pci/intel/ivsc/Kconfig index 407a800c81bc..a7d9607ecdc6 100644 --- a/drivers/media/pci/intel/ivsc/Kconfig +++ b/drivers/media/pci/intel/ivsc/Kconfig @@ -4,6 +4,7 @@ config INTEL_VSC tristate "Intel Visual Sensing Controller" depends on INTEL_MEI && ACPI && VIDEO_DEV + depends on IPU_BRIDGE || !IPU_BRIDGE select MEDIA_CONTROLLER select VIDEO_V4L2_SUBDEV_API select V4L2_FWNODE diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 4c67e2c5a82e..a7a2bcedb37e 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -1238,6 +1238,7 @@ static int fastrpc_init_create_static_process(struct fastrpc_user *fl, struct fastrpc_phy_page pages[1]; char *name; int err; + bool scm_done = false; struct { int pgid; u32 namelen; @@ -1289,6 +1290,7 @@ static int fastrpc_init_create_static_process(struct fastrpc_user *fl, fl->cctx->remote_heap->phys, fl->cctx->remote_heap->size, err); goto err_map; } + scm_done = true; } } @@ -1320,10 +1322,11 @@ static int fastrpc_init_create_static_process(struct fastrpc_user *fl, goto err_invoke; kfree(args); + kfree(name); return 0; err_invoke: - if (fl->cctx->vmcount) { + if (fl->cctx->vmcount && scm_done) { u64 src_perms = 0; struct qcom_scm_vmperm dst_perms; u32 i; @@ -1693,16 +1696,20 @@ static int fastrpc_get_info_from_dsp(struct fastrpc_user *fl, uint32_t *dsp_attr { struct fastrpc_invoke_args args[2] = { 0 }; - /* Capability filled in userspace */ + /* + * Capability filled in userspace. This carries the information + * about the remoteproc support which is fetched from the remoteproc + * sysfs node by userspace. 
+ */ dsp_attr_buf[0] = 0; + dsp_attr_buf_len -= 1; args[0].ptr = (u64)(uintptr_t)&dsp_attr_buf_len; args[0].length = sizeof(dsp_attr_buf_len); args[0].fd = -1; args[1].ptr = (u64)(uintptr_t)&dsp_attr_buf[1]; - args[1].length = dsp_attr_buf_len; + args[1].length = dsp_attr_buf_len * sizeof(u32); args[1].fd = -1; - fl->pd = USER_PD; return fastrpc_internal_invoke(fl, true, FASTRPC_DSP_UTILITIES_HANDLE, FASTRPC_SCALARS(0, 1, 1), args); @@ -1730,7 +1737,7 @@ static int fastrpc_get_info_from_kernel(struct fastrpc_ioctl_capability *cap, if (!dsp_attributes) return -ENOMEM; - err = fastrpc_get_info_from_dsp(fl, dsp_attributes, FASTRPC_MAX_DSP_ATTRIBUTES_LEN); + err = fastrpc_get_info_from_dsp(fl, dsp_attributes, FASTRPC_MAX_DSP_ATTRIBUTES); if (err == DSP_UNSUPPORTED_API) { dev_info(&cctx->rpdev->dev, "Warning: DSP capabilities not supported on domain: %d\n", domain); @@ -1783,7 +1790,7 @@ static int fastrpc_get_dsp_info(struct fastrpc_user *fl, char __user *argp) if (err) return err; - if (copy_to_user(argp, &cap.capability, sizeof(cap.capability))) + if (copy_to_user(argp, &cap, sizeof(cap))) return -EFAULT; return 0; @@ -2080,6 +2087,16 @@ err_invoke: return err; } +static int is_attach_rejected(struct fastrpc_user *fl) +{ + /* Check if the device node is non-secure */ + if (!fl->is_secure_dev) { + dev_dbg(&fl->cctx->rpdev->dev, "untrusted app trying to attach to privileged DSP PD\n"); + return -EACCES; + } + return 0; +} + static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2092,13 +2109,19 @@ static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, err = fastrpc_invoke(fl, argp); break; case FASTRPC_IOCTL_INIT_ATTACH: - err = fastrpc_init_attach(fl, ROOT_PD); + err = is_attach_rejected(fl); + if (!err) + err = fastrpc_init_attach(fl, ROOT_PD); break; case FASTRPC_IOCTL_INIT_ATTACH_SNS: - err = fastrpc_init_attach(fl, SENSORS_PD); + err = is_attach_rejected(fl); + if (!err) + err = fastrpc_init_attach(fl, SENSORS_PD); break; case FASTRPC_IOCTL_INIT_CREATE_STATIC: - err = fastrpc_init_create_static_process(fl, argp); + err = is_attach_rejected(fl); + if (!err) + err = fastrpc_init_create_static_process(fl, argp); break; case FASTRPC_IOCTL_INIT_CREATE: err = fastrpc_init_create_process(fl, argp); diff --git a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c index 16695cb5e69c..7c3d8bedf90b 100644 --- a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c +++ b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c @@ -153,7 +153,6 @@ static int pci1xxxx_eeprom_read(void *priv_t, unsigned int off, buf[byte] = readl(rb + MMAP_EEPROM_OFFSET(EEPROM_DATA_REG)); } - ret = byte; error: release_sys_lock(priv); return ret; @@ -197,7 +196,6 @@ static int pci1xxxx_eeprom_write(void *priv_t, unsigned int off, goto error; } } - ret = byte; error: release_sys_lock(priv); return ret; @@ -258,7 +256,6 @@ static int pci1xxxx_otp_read(void *priv_t, unsigned int off, buf[byte] = readl(rb + MMAP_OTP_OFFSET(OTP_RD_DATA_OFFSET)); } - ret = byte; error: release_sys_lock(priv); return ret; @@ -315,7 +312,6 @@ static int pci1xxxx_otp_write(void *priv_t, unsigned int off, goto error; } } - ret = byte; error: release_sys_lock(priv); return ret; diff --git a/drivers/misc/mei/platform-vsc.c b/drivers/misc/mei/platform-vsc.c index 1ec65d87488a..d02f6e881139 100644 --- a/drivers/misc/mei/platform-vsc.c +++ b/drivers/misc/mei/platform-vsc.c @@ -28,8 +28,8 @@ #define MEI_VSC_MAX_MSG_SIZE 512 -#define MEI_VSC_POLL_DELAY_US (50 * 
USEC_PER_MSEC) -#define MEI_VSC_POLL_TIMEOUT_US (200 * USEC_PER_MSEC) +#define MEI_VSC_POLL_DELAY_US (100 * USEC_PER_MSEC) +#define MEI_VSC_POLL_TIMEOUT_US (400 * USEC_PER_MSEC) #define mei_dev_to_vsc_hw(dev) ((struct mei_vsc_hw *)((dev)->hw)) diff --git a/drivers/misc/mei/vsc-fw-loader.c b/drivers/misc/mei/vsc-fw-loader.c index 596a9d695dfc..084d0205f97d 100644 --- a/drivers/misc/mei/vsc-fw-loader.c +++ b/drivers/misc/mei/vsc-fw-loader.c @@ -204,7 +204,7 @@ struct vsc_img_frag { /** * struct vsc_fw_loader - represent vsc firmware loader - * @dev: device used to request fimware + * @dev: device used to request firmware * @tp: transport layer used with the firmware loader * @csi: CSI image * @ace: ACE image diff --git a/drivers/misc/mei/vsc-tp.c b/drivers/misc/mei/vsc-tp.c index e6a98dba8a73..1618cca9a731 100644 --- a/drivers/misc/mei/vsc-tp.c +++ b/drivers/misc/mei/vsc-tp.c @@ -331,12 +331,12 @@ int vsc_tp_rom_xfer(struct vsc_tp *tp, const void *obuf, void *ibuf, size_t len) return ret; } - ret = vsc_tp_dev_xfer(tp, tp->tx_buf, tp->rx_buf, len); + ret = vsc_tp_dev_xfer(tp, tp->tx_buf, ibuf ? tp->rx_buf : NULL, len); if (ret) return ret; if (ibuf) - cpu_to_be32_array(ibuf, tp->rx_buf, words); + be32_to_cpu_array(ibuf, tp->rx_buf, words); return ret; } @@ -568,6 +568,19 @@ static void vsc_tp_remove(struct spi_device *spi) free_irq(spi->irq, tp); } +static void vsc_tp_shutdown(struct spi_device *spi) +{ + struct vsc_tp *tp = spi_get_drvdata(spi); + + platform_device_unregister(tp->pdev); + + mutex_destroy(&tp->mutex); + + vsc_tp_reset(tp); + + free_irq(spi->irq, tp); +} + static const struct acpi_device_id vsc_tp_acpi_ids[] = { { "INTC1009" }, /* Raptor Lake */ { "INTC1058" }, /* Tiger Lake */ @@ -580,6 +593,7 @@ MODULE_DEVICE_TABLE(acpi, vsc_tp_acpi_ids); static struct spi_driver vsc_tp_driver = { .probe = vsc_tp_probe, .remove = vsc_tp_remove, + .shutdown = vsc_tp_shutdown, .driver = { .name = "vsc-tp", .acpi_match_table = vsc_tp_acpi_ids, diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 367509b5b646..2c9963248fcb 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2466,8 +2466,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, struct mmc_blk_data *md; int devidx, ret; char cap_str[10]; - bool cache_enabled = false; - bool fua_enabled = false; + unsigned int features = 0; devidx = ida_alloc_max(&mmc_blk_ida, max_devices - 1, GFP_KERNEL); if (devidx < 0) { @@ -2499,7 +2498,24 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, */ md->read_only = mmc_blk_readonly(card); - md->disk = mmc_init_queue(&md->queue, card); + if (mmc_host_cmd23(card->host)) { + if ((mmc_card_mmc(card) && + card->csd.mmca_vsn >= CSD_SPEC_VER_3) || + (mmc_card_sd(card) && + card->scr.cmds & SD_SCR_CMD23_SUPPORT)) + md->flags |= MMC_BLK_CMD23; + } + + if (md->flags & MMC_BLK_CMD23 && + ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) || + card->ext_csd.rel_sectors)) { + md->flags |= MMC_BLK_REL_WR; + features |= (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); + } else if (mmc_cache_enabled(card->host)) { + features |= BLK_FEAT_WRITE_CACHE; + } + + md->disk = mmc_init_queue(&md->queue, card, features); if (IS_ERR(md->disk)) { ret = PTR_ERR(md->disk); goto err_kfree; @@ -2539,26 +2555,6 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, set_capacity(md->disk, size); - if (mmc_host_cmd23(card->host)) { - if ((mmc_card_mmc(card) && - card->csd.mmca_vsn >= CSD_SPEC_VER_3) || - (mmc_card_sd(card) && - card->scr.cmds & 
SD_SCR_CMD23_SUPPORT)) - md->flags |= MMC_BLK_CMD23; - } - - if (md->flags & MMC_BLK_CMD23 && - ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) || - card->ext_csd.rel_sectors)) { - md->flags |= MMC_BLK_REL_WR; - fua_enabled = true; - cache_enabled = true; - } - if (mmc_cache_enabled(card->host)) - cache_enabled = true; - - blk_queue_write_cache(md->queue.queue, cache_enabled, fua_enabled); - string_get_size((u64)size, 512, STRING_UNITS_2, cap_str, sizeof(cap_str)); pr_info("%s: %s %s %s%s\n", diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 241cdc2b2a2a..d0b3ca8a11f0 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -344,10 +344,12 @@ static const struct blk_mq_ops mmc_mq_ops = { }; static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq, - struct mmc_card *card) + struct mmc_card *card, unsigned int features) { struct mmc_host *host = card->host; - struct queue_limits lim = { }; + struct queue_limits lim = { + .features = features, + }; struct gendisk *disk; if (mmc_can_erase(card)) @@ -376,18 +378,16 @@ static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq, lim.max_segments = host->max_segs; } + if (mmc_host_is_spi(host) && host->use_spi_crc) + lim.features |= BLK_FEAT_STABLE_WRITES; + disk = blk_mq_alloc_disk(&mq->tag_set, &lim, mq); if (IS_ERR(disk)) return disk; mq->queue = disk->queue; - if (mmc_host_is_spi(host) && host->use_spi_crc) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue); blk_queue_rq_timeout(mq->queue, 60 * HZ); - blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue); - dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue)); INIT_WORK(&mq->recovery_work, mmc_mq_recovery_handler); @@ -413,10 +413,12 @@ static inline bool mmc_merge_capable(struct mmc_host *host) * mmc_init_queue - initialise a queue structure. * @mq: mmc queue * @card: mmc card to attach this queue + * @features: block layer features (BLK_FEAT_*) * * Initialise a MMC card request queue. 
*/ -struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card) +struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, + unsigned int features) { struct mmc_host *host = card->host; struct gendisk *disk; @@ -460,7 +462,7 @@ struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card) return ERR_PTR(ret); - disk = mmc_alloc_disk(mq, card); + disk = mmc_alloc_disk(mq, card, features); if (IS_ERR(disk)) blk_mq_free_tag_set(&mq->tag_set); return disk; diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index 9ade3bcbb714..1498840a4ea0 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -94,7 +94,8 @@ struct mmc_queue { struct work_struct complete_work; }; -struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card); +struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, + unsigned int features); extern void mmc_cleanup_queue(struct mmc_queue *); extern void mmc_queue_suspend(struct mmc_queue *); extern void mmc_queue_resume(struct mmc_queue *); diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c index d7427894e0bc..c302eb380e42 100644 --- a/drivers/mmc/host/davinci_mmc.c +++ b/drivers/mmc/host/davinci_mmc.c @@ -224,6 +224,9 @@ static void davinci_fifo_data_trans(struct mmc_davinci_host *host, } p = sgm->addr; + if (n > sgm->length) + n = sgm->length; + /* NOTE: we never transfer more than rw_threshold bytes * to/from the fifo here; there's no I/O overlap. * This also assumes that access width( i.e. ACCWD) is 4 bytes diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 112584aa0772..fbf7a91bed35 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -4727,6 +4727,21 @@ int sdhci_setup_host(struct sdhci_host *host) if (host->quirks & SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC) { host->max_adma = 65532; /* 32-bit alignment */ mmc->max_seg_size = 65535; + /* + * sdhci_adma_table_pre() expects to define 1 DMA + * descriptor per segment, so the maximum segment size + * is set accordingly. SDHCI allows up to 64KiB per DMA + * descriptor (16-bit field), but some controllers do + * not support "zero means 65536" reducing the maximum + * for them to 65535. That is a problem if PAGE_SIZE is + * 64KiB because the block layer does not support + * max_seg_size < PAGE_SIZE, however + * sdhci_adma_table_pre() has a workaround to handle + * that case, and split the descriptor. Refer also + * comment in sdhci_adma_table_pre(). 
+ */ + if (mmc->max_seg_size < PAGE_SIZE) + mmc->max_seg_size = PAGE_SIZE; } else { mmc->max_seg_size = 65536; } diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 3caa0717d46c..47ead84407cd 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -336,6 +336,8 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) lim.logical_block_size = tr->blksize; if (tr->discard) lim.max_hw_discard_sectors = UINT_MAX; + if (tr->flush) + lim.features |= BLK_FEAT_WRITE_CACHE; /* Create gendisk */ gd = blk_mq_alloc_disk(new->tag_set, &lim, new); @@ -372,13 +374,6 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) /* Create the request queue */ spin_lock_init(&new->queue_lock); INIT_LIST_HEAD(&new->rq_list); - - if (tr->flush) - blk_queue_write_cache(new->rq, true, false); - - blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq); - gd->queue = new->rq; if (new->readonly) diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index d7dbbd469b89..53e16d39af4b 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -1093,28 +1093,32 @@ static int nand_fill_column_cycles(struct nand_chip *chip, u8 *addrs, unsigned int offset_in_page) { struct mtd_info *mtd = nand_to_mtd(chip); + bool ident_stage = !mtd->writesize; - /* Make sure the offset is less than the actual page size. */ - if (offset_in_page > mtd->writesize + mtd->oobsize) - return -EINVAL; + /* Bypass all checks during NAND identification */ + if (likely(!ident_stage)) { + /* Make sure the offset is less than the actual page size. */ + if (offset_in_page > mtd->writesize + mtd->oobsize) + return -EINVAL; - /* - * On small page NANDs, there's a dedicated command to access the OOB - * area, and the column address is relative to the start of the OOB - * area, not the start of the page. Asjust the address accordingly. - */ - if (mtd->writesize <= 512 && offset_in_page >= mtd->writesize) - offset_in_page -= mtd->writesize; + /* + * On small page NANDs, there's a dedicated command to access the OOB + * area, and the column address is relative to the start of the OOB + * area, not the start of the page. Asjust the address accordingly. + */ + if (mtd->writesize <= 512 && offset_in_page >= mtd->writesize) + offset_in_page -= mtd->writesize; - /* - * The offset in page is expressed in bytes, if the NAND bus is 16-bit - * wide, then it must be divided by 2. - */ - if (chip->options & NAND_BUSWIDTH_16) { - if (WARN_ON(offset_in_page % 2)) - return -EINVAL; + /* + * The offset in page is expressed in bytes, if the NAND bus is 16-bit + * wide, then it must be divided by 2. 
+ */ + if (chip->options & NAND_BUSWIDTH_16) { + if (WARN_ON(offset_in_page % 2)) + return -EINVAL; - offset_in_page /= 2; + offset_in_page /= 2; + } } addrs[0] = offset_in_page; @@ -1123,7 +1127,7 @@ static int nand_fill_column_cycles(struct nand_chip *chip, u8 *addrs, * Small page NANDs use 1 cycle for the columns, while large page NANDs * need 2 */ - if (mtd->writesize <= 512) + if (!ident_stage && mtd->writesize <= 512) return 1; addrs[1] = offset_in_page >> 8; @@ -1436,16 +1440,19 @@ int nand_change_read_column_op(struct nand_chip *chip, unsigned int len, bool force_8bit) { struct mtd_info *mtd = nand_to_mtd(chip); + bool ident_stage = !mtd->writesize; if (len && !buf) return -EINVAL; - if (offset_in_page + len > mtd->writesize + mtd->oobsize) - return -EINVAL; + if (!ident_stage) { + if (offset_in_page + len > mtd->writesize + mtd->oobsize) + return -EINVAL; - /* Small page NANDs do not support column change. */ - if (mtd->writesize <= 512) - return -ENOTSUPP; + /* Small page NANDs do not support column change. */ + if (mtd->writesize <= 512) + return -ENOTSUPP; + } if (nand_has_exec_op(chip)) { const struct nand_interface_config *conf = @@ -2173,7 +2180,7 @@ EXPORT_SYMBOL_GPL(nand_reset_op); int nand_read_data_op(struct nand_chip *chip, void *buf, unsigned int len, bool force_8bit, bool check_only) { - if (!len || !buf) + if (!len || (!check_only && !buf)) return -EINVAL; if (nand_has_exec_op(chip)) { @@ -6301,6 +6308,7 @@ static const struct nand_ops rawnand_ops = { static int nand_scan_tail(struct nand_chip *chip) { struct mtd_info *mtd = nand_to_mtd(chip); + struct nand_device *base = &chip->base; struct nand_ecc_ctrl *ecc = &chip->ecc; int ret, i; @@ -6445,9 +6453,13 @@ static int nand_scan_tail(struct nand_chip *chip) if (!ecc->write_oob_raw) ecc->write_oob_raw = ecc->write_oob; - /* propagate ecc info to mtd_info */ + /* Propagate ECC info to the generic NAND and MTD layers */ mtd->ecc_strength = ecc->strength; + if (!base->ecc.ctx.conf.strength) + base->ecc.ctx.conf.strength = ecc->strength; mtd->ecc_step_size = ecc->size; + if (!base->ecc.ctx.conf.step_size) + base->ecc.ctx.conf.step_size = ecc->size; /* * Set the number of read / write steps for one page depending on ECC @@ -6455,6 +6467,8 @@ static int nand_scan_tail(struct nand_chip *chip) */ if (!ecc->steps) ecc->steps = mtd->writesize / ecc->size; + if (!base->ecc.ctx.nsteps) + base->ecc.ctx.nsteps = ecc->steps; if (ecc->steps * ecc->size != mtd->writesize) { WARN(1, "Invalid ECC parameters\n"); ret = -EINVAL; diff --git a/drivers/mtd/nand/raw/rockchip-nand-controller.c b/drivers/mtd/nand/raw/rockchip-nand-controller.c index 7baaef69d70a..55580447633b 100644 --- a/drivers/mtd/nand/raw/rockchip-nand-controller.c +++ b/drivers/mtd/nand/raw/rockchip-nand-controller.c @@ -420,13 +420,13 @@ static int rk_nfc_setup_interface(struct nand_chip *chip, int target, u32 rate, tc2rw, trwpw, trw2c; u32 temp; - if (target < 0) - return 0; - timings = nand_get_sdr_timings(conf); if (IS_ERR(timings)) return -EOPNOTSUPP; + if (target < 0) + return 0; + if (IS_ERR(nfc->nfc_clk)) rate = clk_get_rate(nfc->ahb_clk); else diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 0cacd7027e35..bc80fb6397dc 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1214,9 +1214,9 @@ static int bond_option_arp_ip_targets_set(struct bonding *bond, __be32 target; if (newval->string) { - if (!in4_pton(newval->string+1, -1, (u8 *)&target, -1, NULL)) { - netdev_err(bond->dev, "invalid 
ARP target %pI4 specified\n", - &target); + if (strlen(newval->string) < 1 || + !in4_pton(newval->string + 1, -1, (u8 *)&target, -1, NULL)) { + netdev_err(bond->dev, "invalid ARP target specified\n"); return ret; } if (newval->string[0] == '+') diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index 7292c81fc0cd..024169461cad 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -125,6 +125,7 @@ static const struct kvaser_usb_driver_info kvaser_usb_driver_info_leaf_err_liste static const struct kvaser_usb_driver_info kvaser_usb_driver_info_leafimx = { .quirks = 0, + .family = KVASER_LEAF, .ops = &kvaser_usb_leaf_dev_ops, }; diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index 02f07b870f10..268949939636 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -1047,31 +1047,31 @@ static int lan9303_get_sset_count(struct dsa_switch *ds, int port, int sset) return ARRAY_SIZE(lan9303_mib); } -static int lan9303_phy_read(struct dsa_switch *ds, int phy, int regnum) +static int lan9303_phy_read(struct dsa_switch *ds, int port, int regnum) { struct lan9303 *chip = ds->priv; int phy_base = chip->phy_addr_base; - if (phy == phy_base) + if (port == 0) return lan9303_virt_phy_reg_read(chip, regnum); - if (phy > phy_base + 2) + if (port > 2) return -ENODEV; - return chip->ops->phy_read(chip, phy, regnum); + return chip->ops->phy_read(chip, phy_base + port, regnum); } -static int lan9303_phy_write(struct dsa_switch *ds, int phy, int regnum, +static int lan9303_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val) { struct lan9303 *chip = ds->priv; int phy_base = chip->phy_addr_base; - if (phy == phy_base) + if (port == 0) return lan9303_virt_phy_reg_write(chip, regnum, val); - if (phy > phy_base + 2) + if (port > 2) return -ENODEV; - return chip->ops->phy_write(chip, phy, regnum, val); + return chip->ops->phy_write(chip, phy_base + port, regnum, val); } static int lan9303_port_enable(struct dsa_switch *ds, int port, @@ -1099,7 +1099,7 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port) vlan_vid_del(dsa_port_to_conduit(dp), htons(ETH_P_8021Q), port); lan9303_disable_processing_port(chip, port); - lan9303_phy_write(ds, chip->phy_addr_base + port, MII_BMCR, BMCR_PDOWN); + lan9303_phy_write(ds, port, MII_BMCR, BMCR_PDOWN); } static int lan9303_port_bridge_join(struct dsa_switch *ds, int port, @@ -1374,8 +1374,6 @@ static const struct dsa_switch_ops lan9303_switch_ops = { static int lan9303_register_switch(struct lan9303 *chip) { - int base; - chip->ds = devm_kzalloc(chip->dev, sizeof(*chip->ds), GFP_KERNEL); if (!chip->ds) return -ENOMEM; @@ -1385,8 +1383,7 @@ static int lan9303_register_switch(struct lan9303 *chip) chip->ds->priv = chip; chip->ds->ops = &lan9303_switch_ops; chip->ds->phylink_mac_ops = &lan9303_phylink_mac_ops; - base = chip->phy_addr_base; - chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1 + base, base); + chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1, 0); return dsa_register_switch(chip->ds); } diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.c b/drivers/net/ethernet/broadcom/asp2/bcmasp.c index a806dadc4196..20c6529ec135 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.c @@ -1380,6 +1380,7 @@ static int bcmasp_probe(struct platform_device *pdev) dev_err(dev, "Cannot create eth interface %d\n", i); 
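In the lan9303 hunks above, the dsa_switch_ops phy_read()/phy_write() callbacks now take a DSA port index instead of a raw MDIO address, and the chip->phy_addr_base offset is applied inside the driver. Spelled out in isolation (helper name is made up, behaviour simply restates the hunk):

/*
 * Illustrative only: port-index based PHY access as implied by the
 * lan9303 changes above. Port 0 uses the switch's virtual PHY registers;
 * ports 1 and 2 translate to MDIO address phy_addr_base + port.
 */
static int demo_lan9303_phy_read(struct dsa_switch *ds, int port, int regnum)
{
	struct lan9303 *chip = ds->priv;

	if (port == 0)
		return lan9303_virt_phy_reg_read(chip, regnum);
	if (port > 2)
		return -ENODEV;
	return chip->ops->phy_read(chip, chip->phy_addr_base + port, regnum);
}

Because the callback now works in 0-based port numbers, phys_mii_mask can be GENMASK(LAN9303_NUM_PORTS - 1, 0) rather than a mask shifted by phy_addr_base.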
bcmasp_remove_intfs(priv); of_node_put(intf_node); + ret = -ENOMEM; goto of_put_exit; } list_add_tail(&intf->list, &priv->intfs); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h index e2a4e1088b7f..9580ab83d387 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h @@ -1262,7 +1262,7 @@ enum { struct bnx2x_fw_stats_req { struct stats_query_header hdr; - struct stats_query_entry query[FP_SB_MAX_E1x+ + struct stats_query_entry query[FP_SB_MAX_E2 + BNX2X_FIRST_QUEUE_QUERY_IDX]; }; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a6d69a45fa01..43952689bfb0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6146,6 +6146,24 @@ static u16 bnxt_get_max_rss_ring(struct bnxt *bp) return max_ring; } +u16 bnxt_get_max_rss_ctx_ring(struct bnxt *bp) +{ + u16 i, tbl_size, max_ring = 0; + struct bnxt_rss_ctx *rss_ctx; + + if (!BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) + return 0; + + tbl_size = bnxt_get_rxfh_indir_size(bp->dev); + + list_for_each_entry(rss_ctx, &bp->rss_ctx_list, list) { + for (i = 0; i < tbl_size; i++) + max_ring = max(max_ring, rss_ctx->rss_indir_tbl[i]); + } + + return max_ring; +} + int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings) { if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) { @@ -12669,7 +12687,11 @@ bool bnxt_rfs_capable(struct bnxt *bp, bool new_rss_ctx) if (!BNXT_NEW_RM(bp)) return true; - if (hwr.vnic == bp->hw_resc.resv_vnics && + /* Do not reduce VNIC and RSS ctx reservations. There is a FW + * issue that will mess up the default VNIC if we reduce the + * reservations. + */ + if (hwr.vnic <= bp->hw_resc.resv_vnics && hwr.rss_ctx <= bp->hw_resc.resv_rsscos_ctxs) return true; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 9cf0acfa04e5..6b10a09ee1af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2776,6 +2776,7 @@ int bnxt_hwrm_vnic_set_tpa(struct bnxt *bp, struct bnxt_vnic_info *vnic, void bnxt_fill_ipv6_mask(__be32 mask[4]); int bnxt_alloc_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx); void bnxt_set_dflt_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx); +u16 bnxt_get_max_rss_ctx_ring(struct bnxt *bp); int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings); int bnxt_hwrm_vnic_cfg(struct bnxt *bp, struct bnxt_vnic_info *vnic); int bnxt_hwrm_vnic_alloc(struct bnxt *bp, struct bnxt_vnic_info *vnic, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 8763f8a01457..79c09c1cdf93 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -961,6 +961,12 @@ static int bnxt_set_channels(struct net_device *dev, return rc; } + if (req_rx_rings < bp->rx_nr_rings && + req_rx_rings <= bnxt_get_max_rss_ctx_ring(bp)) { + netdev_warn(dev, "Can't deactivate rings used by RSS contexts\n"); + return -EINVAL; + } + if (bnxt_get_nr_rss_ctxs(bp, req_rx_rings) != bnxt_get_nr_rss_ctxs(bp, bp->rx_nr_rings) && netif_is_rxfh_configured(dev)) { diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 2e98a2a0bead..ce227b56cf72 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1109,6 +1109,46 @@ static s32 
e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link) } /** + * e1000e_force_smbus - Force interfaces to transition to SMBUS mode. + * @hw: pointer to the HW structure + * + * Force the MAC and the PHY to SMBUS mode. Assumes semaphore already + * acquired. + * + * Return: 0 on success, negative errno on failure. + **/ +static s32 e1000e_force_smbus(struct e1000_hw *hw) +{ + u16 smb_ctrl = 0; + u32 ctrl_ext; + s32 ret_val; + + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + + /* Force SMBus mode in the PHY */ + ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &smb_ctrl); + if (ret_val) { + e1000e_enable_phy_retry(hw); + return ret_val; + } + + smb_ctrl |= CV_SMB_CTRL_FORCE_SMBUS; + e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, smb_ctrl); + + e1000e_enable_phy_retry(hw); + + /* Force SMBus mode in the MAC */ + ctrl_ext = er32(CTRL_EXT); + ctrl_ext |= E1000_CTRL_EXT_FORCE_SMBUS; + ew32(CTRL_EXT, ctrl_ext); + + return 0; +} + +/** * e1000_enable_ulp_lpt_lp - configure Ultra Low Power mode for LynxPoint-LP * @hw: pointer to the HW structure * @to_sx: boolean indicating a system power state transition to Sx @@ -1165,6 +1205,14 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) if (ret_val) goto out; + if (hw->mac.type != e1000_pch_mtp) { + ret_val = e1000e_force_smbus(hw); + if (ret_val) { + e_dbg("Failed to force SMBUS: %d\n", ret_val); + goto release; + } + } + /* Si workaround for ULP entry flow on i127/rev6 h/w. Enable * LPLU and disable Gig speed when entering ULP */ @@ -1225,27 +1273,12 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) } release: - /* Switching PHY interface always returns MDI error - * so disable retry mechanism to avoid wasting time - */ - e1000e_disable_phy_retry(hw); - - /* Force SMBus mode in PHY */ - ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); - if (ret_val) { - e1000e_enable_phy_retry(hw); - hw->phy.ops.release(hw); - goto out; + if (hw->mac.type == e1000_pch_mtp) { + ret_val = e1000e_force_smbus(hw); + if (ret_val) + e_dbg("Failed to force SMBUS over MTL system: %d\n", + ret_val); } - phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; - e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); - - e1000e_enable_phy_retry(hw); - - /* Force SMBus mode in MAC */ - mac_reg = er32(CTRL_EXT); - mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; - ew32(CTRL_EXT, mac_reg); hw->phy.ops.release(hw); out: diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index da5c59daf8ba..3cd161c6672b 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6363,49 +6363,49 @@ static void e1000e_s0ix_entry_flow(struct e1000_adapter *adapter) mac_data |= E1000_EXTCNF_CTRL_GATE_PHY_CFG; ew32(EXTCNF_CTRL, mac_data); - /* Enable the Dynamic Power Gating in the MAC */ - mac_data = er32(FEXTNVM7); - mac_data |= BIT(22); - ew32(FEXTNVM7, mac_data); - /* Disable disconnected cable conditioning for Power Gating */ mac_data = er32(DPGFR); mac_data |= BIT(2); ew32(DPGFR, mac_data); - /* Don't wake from dynamic Power Gating with clock request */ - mac_data = er32(FEXTNVM12); - mac_data |= BIT(12); - ew32(FEXTNVM12, mac_data); - - /* Ungate PGCB clock */ - mac_data = er32(FEXTNVM9); - mac_data &= ~BIT(28); - ew32(FEXTNVM9, mac_data); - - /* Enable K1 off to enable mPHY Power Gating */ - mac_data = er32(FEXTNVM6); - mac_data |= BIT(31); - ew32(FEXTNVM6, mac_data); - - /* 
Enable mPHY power gating for any link and speed */ - mac_data = er32(FEXTNVM8); - mac_data |= BIT(9); - ew32(FEXTNVM8, mac_data); - /* Enable the Dynamic Clock Gating in the DMA and MAC */ mac_data = er32(CTRL_EXT); mac_data |= E1000_CTRL_EXT_DMA_DYN_CLK_EN; ew32(CTRL_EXT, mac_data); - - /* No MAC DPG gating SLP_S0 in modern standby - * Switch the logic of the lanphypc to use PMC counter - */ - mac_data = er32(FEXTNVM5); - mac_data |= BIT(7); - ew32(FEXTNVM5, mac_data); } + /* Enable the Dynamic Power Gating in the MAC */ + mac_data = er32(FEXTNVM7); + mac_data |= BIT(22); + ew32(FEXTNVM7, mac_data); + + /* Don't wake from dynamic Power Gating with clock request */ + mac_data = er32(FEXTNVM12); + mac_data |= BIT(12); + ew32(FEXTNVM12, mac_data); + + /* Ungate PGCB clock */ + mac_data = er32(FEXTNVM9); + mac_data &= ~BIT(28); + ew32(FEXTNVM9, mac_data); + + /* Enable K1 off to enable mPHY Power Gating */ + mac_data = er32(FEXTNVM6); + mac_data |= BIT(31); + ew32(FEXTNVM6, mac_data); + + /* Enable mPHY power gating for any link and speed */ + mac_data = er32(FEXTNVM8); + mac_data |= BIT(9); + ew32(FEXTNVM8, mac_data); + + /* No MAC DPG gating SLP_S0 in modern standby + * Switch the logic of the lanphypc to use PMC counter + */ + mac_data = er32(FEXTNVM5); + mac_data |= BIT(7); + ew32(FEXTNVM5, mac_data); + /* Disable the time synchronization clock */ mac_data = er32(FEXTNVM7); mac_data |= BIT(31); @@ -6498,33 +6498,6 @@ static void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter) } else { /* Request driver unconfigure the device from S0ix */ - /* Disable the Dynamic Power Gating in the MAC */ - mac_data = er32(FEXTNVM7); - mac_data &= 0xFFBFFFFF; - ew32(FEXTNVM7, mac_data); - - /* Disable mPHY power gating for any link and speed */ - mac_data = er32(FEXTNVM8); - mac_data &= ~BIT(9); - ew32(FEXTNVM8, mac_data); - - /* Disable K1 off */ - mac_data = er32(FEXTNVM6); - mac_data &= ~BIT(31); - ew32(FEXTNVM6, mac_data); - - /* Disable Ungate PGCB clock */ - mac_data = er32(FEXTNVM9); - mac_data |= BIT(28); - ew32(FEXTNVM9, mac_data); - - /* Cancel not waking from dynamic - * Power Gating with clock request - */ - mac_data = er32(FEXTNVM12); - mac_data &= ~BIT(12); - ew32(FEXTNVM12, mac_data); - /* Cancel disable disconnected cable conditioning * for Power Gating */ @@ -6537,13 +6510,6 @@ static void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter) mac_data &= 0xFFF7FFFF; ew32(CTRL_EXT, mac_data); - /* Revert the lanphypc logic to use the internal Gbe counter - * and not the PMC counter - */ - mac_data = er32(FEXTNVM5); - mac_data &= 0xFFFFFF7F; - ew32(FEXTNVM5, mac_data); - /* Enable the periodic inband message, * Request PCIe clock in K1 page770_17[10:9] =01b */ @@ -6581,6 +6547,40 @@ static void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter) mac_data &= ~BIT(31); mac_data |= BIT(0); ew32(FEXTNVM7, mac_data); + + /* Disable the Dynamic Power Gating in the MAC */ + mac_data = er32(FEXTNVM7); + mac_data &= 0xFFBFFFFF; + ew32(FEXTNVM7, mac_data); + + /* Disable mPHY power gating for any link and speed */ + mac_data = er32(FEXTNVM8); + mac_data &= ~BIT(9); + ew32(FEXTNVM8, mac_data); + + /* Disable K1 off */ + mac_data = er32(FEXTNVM6); + mac_data &= ~BIT(31); + ew32(FEXTNVM6, mac_data); + + /* Disable Ungate PGCB clock */ + mac_data = er32(FEXTNVM9); + mac_data |= BIT(28); + ew32(FEXTNVM9, mac_data); + + /* Cancel not waking from dynamic + * Power Gating with clock request + */ + mac_data = er32(FEXTNVM12); + mac_data &= ~BIT(12); + ew32(FEXTNVM12, mac_data); + + /* Revert the lanphypc 
logic to use the internal Gbe counter + * and not the PMC counter + */ + mac_data = er32(FEXTNVM5); + mac_data &= 0xFFFFFF7F; + ew32(FEXTNVM5, mac_data); } static int e1000e_pm_freeze(struct device *dev) diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c index bbcfd529399b..89d57dd911dc 100644 --- a/drivers/net/ethernet/intel/e1000e/ptp.c +++ b/drivers/net/ethernet/intel/e1000e/ptp.c @@ -124,7 +124,8 @@ static int e1000e_phc_get_syncdevicetime(ktime_t *device, sys_cycles = er32(PLTSTMPH); sys_cycles <<= 32; sys_cycles |= er32(PLTSTMPL); - *system = convert_art_to_tsc(sys_cycles); + system->cycles = sys_cycles; + system->cs_id = CSID_X86_ART; return 0; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.h b/drivers/net/ethernet/intel/i40e/i40e_adminq.h index ee86d2c53079..55b5bb884d73 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.h @@ -109,10 +109,6 @@ static inline int i40e_aq_rc_to_posix(int aq_ret, int aq_rc) -EFBIG, /* I40E_AQ_RC_EFBIG */ }; - /* aq_rc is invalid if AQ timed out */ - if (aq_ret == -EIO) - return -EAGAIN; - if (!((u32)aq_rc < (sizeof(aq_to_posix) / sizeof((aq_to_posix)[0])))) return -ERANGE; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 284c3fad5a6e..310513d9321b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -13293,6 +13293,10 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog, bool need_reset; int i; + /* VSI shall be deleted in a moment, block loading new programs */ + if (prog && test_bit(__I40E_IN_REMOVE, pf->state)) + return -EINVAL; + /* Don't allow frames that span over multiple buffers */ if (vsi->netdev->mtu > frame_size - I40E_PACKET_HDR_PAD) { NL_SET_ERR_MSG_MOD(extack, "MTU too large for linear frames and XDP prog does not support frags"); @@ -13301,14 +13305,9 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog, /* When turning XDP on->off/off->on we reset and rebuild the rings. 
*/ need_reset = (i40e_enabled_xdp_vsi(vsi) != !!prog); - if (need_reset) i40e_prep_for_reset(pf); - /* VSI shall be deleted in a moment, just return EINVAL */ - if (test_bit(__I40E_IN_REMOVE, pf->state)) - return -EINVAL; - old_prog = xchg(&vsi->xdp_prog, prog); if (need_reset) { diff --git a/drivers/net/ethernet/intel/ice/ice_hwmon.c b/drivers/net/ethernet/intel/ice/ice_hwmon.c index e4c2c1bff6c0..b7aa6812510a 100644 --- a/drivers/net/ethernet/intel/ice/ice_hwmon.c +++ b/drivers/net/ethernet/intel/ice/ice_hwmon.c @@ -96,7 +96,7 @@ static bool ice_is_internal_reading_supported(struct ice_pf *pf) unsigned long sensors = pf->hw.dev_caps.supported_sensors; - return _test_bit(ICE_SENSOR_SUPPORT_E810_INT_TEMP_BIT, &sensors); + return test_bit(ICE_SENSOR_SUPPORT_E810_INT_TEMP_BIT, &sensors); }; void ice_hwmon_init(struct ice_pf *pf) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 0f17fc1181d2..927b623cedd5 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1559,6 +1559,10 @@ void ice_ptp_extts_event(struct ice_pf *pf) u8 chan, tmr_idx; u32 hi, lo; + /* Don't process timestamp events if PTP is not ready */ + if (pf->ptp.state != ICE_PTP_READY) + return; + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; /* Event time is captured by one of the two matched registers * GLTSYN_EVNT_L: 32 LSB of sampled time event @@ -1584,27 +1588,33 @@ void ice_ptp_extts_event(struct ice_pf *pf) /** * ice_ptp_cfg_extts - Configure EXTTS pin and channel * @pf: Board private structure - * @ena: true to enable; false to disable * @chan: GPIO channel (0-3) - * @gpio_pin: GPIO pin - * @extts_flags: request flags from the ptp_extts_request.flags + * @config: desired EXTTS configuration. + * @store: If set to true, the values will be stored + * + * Configure an external timestamp event on the requested channel. + * + * Return: 0 on success, -EOPNOTUSPP on unsupported flags */ -static int -ice_ptp_cfg_extts(struct ice_pf *pf, bool ena, unsigned int chan, u32 gpio_pin, - unsigned int extts_flags) +static int ice_ptp_cfg_extts(struct ice_pf *pf, unsigned int chan, + struct ice_extts_channel *config, bool store) { u32 func, aux_reg, gpio_reg, irq_reg; struct ice_hw *hw = &pf->hw; u8 tmr_idx; - if (chan > (unsigned int)pf->ptp.info.n_ext_ts) - return -EINVAL; + /* Reject requests with unsupported flags */ + if (config->flags & ~(PTP_ENABLE_FEATURE | + PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS)) + return -EOPNOTSUPP; tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; irq_reg = rd32(hw, PFINT_OICR_ENA); - if (ena) { + if (config->ena) { /* Enable the interrupt */ irq_reg |= PFINT_OICR_TSYN_EVNT_M; aux_reg = GLTSYN_AUX_IN_0_INT_ENA_M; @@ -1613,9 +1623,9 @@ ice_ptp_cfg_extts(struct ice_pf *pf, bool ena, unsigned int chan, u32 gpio_pin, #define GLTSYN_AUX_IN_0_EVNTLVL_FALLING_EDGE BIT(1) /* set event level to requested edge */ - if (extts_flags & PTP_FALLING_EDGE) + if (config->flags & PTP_FALLING_EDGE) aux_reg |= GLTSYN_AUX_IN_0_EVNTLVL_FALLING_EDGE; - if (extts_flags & PTP_RISING_EDGE) + if (config->flags & PTP_RISING_EDGE) aux_reg |= GLTSYN_AUX_IN_0_EVNTLVL_RISING_EDGE; /* Write GPIO CTL reg. 
@@ -1636,12 +1646,52 @@ ice_ptp_cfg_extts(struct ice_pf *pf, bool ena, unsigned int chan, u32 gpio_pin, wr32(hw, PFINT_OICR_ENA, irq_reg); wr32(hw, GLTSYN_AUX_IN(chan, tmr_idx), aux_reg); - wr32(hw, GLGEN_GPIO_CTL(gpio_pin), gpio_reg); + wr32(hw, GLGEN_GPIO_CTL(config->gpio_pin), gpio_reg); + + if (store) + memcpy(&pf->ptp.extts_channels[chan], config, sizeof(*config)); return 0; } /** + * ice_ptp_disable_all_extts - Disable all EXTTS channels + * @pf: Board private structure + */ +static void ice_ptp_disable_all_extts(struct ice_pf *pf) +{ + struct ice_extts_channel extts_cfg = {}; + int i; + + for (i = 0; i < pf->ptp.info.n_ext_ts; i++) { + if (pf->ptp.extts_channels[i].ena) { + extts_cfg.gpio_pin = pf->ptp.extts_channels[i].gpio_pin; + extts_cfg.ena = false; + ice_ptp_cfg_extts(pf, i, &extts_cfg, false); + } + } + + synchronize_irq(pf->oicr_irq.virq); +} + +/** + * ice_ptp_enable_all_extts - Enable all EXTTS channels + * @pf: Board private structure + * + * Called during reset to restore user configuration. + */ +static void ice_ptp_enable_all_extts(struct ice_pf *pf) +{ + int i; + + for (i = 0; i < pf->ptp.info.n_ext_ts; i++) { + if (pf->ptp.extts_channels[i].ena) + ice_ptp_cfg_extts(pf, i, &pf->ptp.extts_channels[i], + false); + } +} + +/** * ice_ptp_cfg_clkout - Configure clock to generate periodic wave * @pf: Board private structure * @chan: GPIO channel (0-3) @@ -1659,6 +1709,9 @@ static int ice_ptp_cfg_clkout(struct ice_pf *pf, unsigned int chan, u32 func, val, gpio_pin; u8 tmr_idx; + if (config && config->flags & ~PTP_PEROUT_PHASE) + return -EOPNOTSUPP; + tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; /* 0. Reset mode & out_en in AUX_OUT */ @@ -1795,17 +1848,18 @@ ice_ptp_gpio_enable_e810(struct ptp_clock_info *info, struct ptp_clock_request *rq, int on) { struct ice_pf *pf = ptp_info_to_pf(info); - struct ice_perout_channel clk_cfg = {0}; bool sma_pres = false; unsigned int chan; u32 gpio_pin; - int err; if (ice_is_feature_supported(pf, ICE_F_SMA_CTRL)) sma_pres = true; switch (rq->type) { case PTP_CLK_REQ_PEROUT: + { + struct ice_perout_channel clk_cfg = {}; + chan = rq->perout.index; if (sma_pres) { if (chan == ice_pin_desc_e810t[SMA1].chan) @@ -1825,15 +1879,19 @@ ice_ptp_gpio_enable_e810(struct ptp_clock_info *info, clk_cfg.gpio_pin = chan; } + clk_cfg.flags = rq->perout.flags; clk_cfg.period = ((rq->perout.period.sec * NSEC_PER_SEC) + rq->perout.period.nsec); clk_cfg.start_time = ((rq->perout.start.sec * NSEC_PER_SEC) + rq->perout.start.nsec); clk_cfg.ena = !!on; - err = ice_ptp_cfg_clkout(pf, chan, &clk_cfg, true); - break; + return ice_ptp_cfg_clkout(pf, chan, &clk_cfg, true); + } case PTP_CLK_REQ_EXTTS: + { + struct ice_extts_channel extts_cfg = {}; + chan = rq->extts.index; if (sma_pres) { if (chan < ice_pin_desc_e810t[SMA2].chan) @@ -1849,14 +1907,15 @@ ice_ptp_gpio_enable_e810(struct ptp_clock_info *info, gpio_pin = chan; } - err = ice_ptp_cfg_extts(pf, !!on, chan, gpio_pin, - rq->extts.flags); - break; + extts_cfg.flags = rq->extts.flags; + extts_cfg.gpio_pin = gpio_pin; + extts_cfg.ena = !!on; + + return ice_ptp_cfg_extts(pf, chan, &extts_cfg, true); + } default: return -EOPNOTSUPP; } - - return err; } /** @@ -1869,26 +1928,32 @@ static int ice_ptp_gpio_enable_e823(struct ptp_clock_info *info, struct ptp_clock_request *rq, int on) { struct ice_pf *pf = ptp_info_to_pf(info); - struct ice_perout_channel clk_cfg = {0}; - int err; switch (rq->type) { case PTP_CLK_REQ_PPS: + { + struct ice_perout_channel clk_cfg = {}; + + clk_cfg.flags = rq->perout.flags; 
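In the periodic-output paths of this hunk the request is flattened into a nanosecond period and start time before being handed to ice_ptp_cfg_clkout(), which now also rejects every flag except the phase adjustment. A small standalone sketch of that conversion and check, with locally defined constants rather than the kernel's.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* local stand-in for the only periodic-output flag accepted here */
#define OUT_PHASE (1u << 0)

struct ts { uint64_t sec; uint32_t nsec; };

static int cfg_clkout(struct ts period, struct ts start, uint32_t flags)
{
    if (flags & ~OUT_PHASE)         /* anything else is unsupported */
        return -95;                 /* -EOPNOTSUPP on Linux */

    uint64_t period_ns = period.sec * NSEC_PER_SEC + period.nsec;
    uint64_t start_ns  = start.sec  * NSEC_PER_SEC + start.nsec;

    printf("period %llu ns, start %llu ns\n",
           (unsigned long long)period_ns, (unsigned long long)start_ns);
    return 0;
}

int main(void)
{
    struct ts p = { 1, 0 }, s = { 10, 500 };

    cfg_clkout(p, s, 0);                        /* 1 s output starting at 10.0000005 s */
    printf("%d\n", cfg_clkout(p, s, 1u << 3));  /* -95: unknown flag */
    return 0;
}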
clk_cfg.gpio_pin = PPS_PIN_INDEX; clk_cfg.period = NSEC_PER_SEC; clk_cfg.ena = !!on; - err = ice_ptp_cfg_clkout(pf, PPS_CLK_GEN_CHAN, &clk_cfg, true); - break; + return ice_ptp_cfg_clkout(pf, PPS_CLK_GEN_CHAN, &clk_cfg, true); + } case PTP_CLK_REQ_EXTTS: - err = ice_ptp_cfg_extts(pf, !!on, rq->extts.index, - TIME_SYNC_PIN_INDEX, rq->extts.flags); - break; + { + struct ice_extts_channel extts_cfg = {}; + + extts_cfg.flags = rq->extts.flags; + extts_cfg.gpio_pin = TIME_SYNC_PIN_INDEX; + extts_cfg.ena = !!on; + + return ice_ptp_cfg_extts(pf, rq->extts.index, &extts_cfg, true); + } default: return -EOPNOTSUPP; } - - return err; } /** @@ -2091,7 +2156,8 @@ ice_ptp_get_syncdevicetime(ktime_t *device, hh_ts_lo = rd32(hw, GLHH_ART_TIME_L); hh_ts_hi = rd32(hw, GLHH_ART_TIME_H); hh_ts = ((u64)hh_ts_hi << 32) | hh_ts_lo; - *system = convert_art_ns_to_tsc(hh_ts); + system->cycles = hh_ts; + system->cs_id = CSID_X86_ART; /* Read Device source clock time */ hh_ts_lo = rd32(hw, GLTSYN_HHTIME_L(tmr_idx)); hh_ts_hi = rd32(hw, GLTSYN_HHTIME_H(tmr_idx)); @@ -2720,6 +2786,10 @@ static int ice_ptp_rebuild_owner(struct ice_pf *pf) ice_ptp_restart_all_phy(pf); } + /* Re-enable all periodic outputs and external timestamp events */ + ice_ptp_enable_all_clkout(pf); + ice_ptp_enable_all_extts(pf); + return 0; } @@ -3275,6 +3345,8 @@ void ice_ptp_release(struct ice_pf *pf) ice_ptp_release_tx_tracker(pf, &pf->ptp.port.tx); + ice_ptp_disable_all_extts(pf); + kthread_cancel_delayed_work_sync(&pf->ptp.work); ice_ptp_port_phy_stop(&pf->ptp.port); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.h b/drivers/net/ethernet/intel/ice/ice_ptp.h index 3af20025043a..e2af9749061c 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp.h @@ -29,10 +29,17 @@ enum ice_ptp_pin_e810t { struct ice_perout_channel { bool ena; u32 gpio_pin; + u32 flags; u64 period; u64 start_time; }; +struct ice_extts_channel { + bool ena; + u32 gpio_pin; + u32 flags; +}; + /* The ice hardware captures Tx hardware timestamps in the PHY. The timestamp * is stored in a buffer of registers. Depending on the specific hardware, * this buffer might be shared across multiple PHY ports. 
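The ice_extts_channel structure introduced above exists so that a user's external-timestamp pin setup can survive a reset: the configuration is remembered when it is applied and replayed by ice_ptp_enable_all_extts() during rebuild. A compact sketch of that save-and-replay pattern, with invented types standing in for the driver's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define N_EXTTS 4

struct extts_cfg { bool ena; uint32_t gpio_pin; uint32_t flags; };

static struct extts_cfg saved[N_EXTTS];

static void apply(int chan, const struct extts_cfg *cfg, bool store)
{
    printf("chan %d: %s pin %u\n", chan,
           cfg->ena ? "enable" : "disable", (unsigned)cfg->gpio_pin);
    if (store)
        saved[chan] = *cfg;         /* remember what the user asked for */
}

/* called after a device reset: re-apply every channel the user enabled */
static void replay_all(void)
{
    for (int i = 0; i < N_EXTTS; i++)
        if (saved[i].ena)
            apply(i, &saved[i], false);
}

int main(void)
{
    struct extts_cfg c = { .ena = true, .gpio_pin = 1, .flags = 0 };

    apply(0, &c, true);   /* user request: stored */
    replay_all();         /* reset path: restored from the array */
    return 0;
}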
@@ -226,6 +233,7 @@ enum ice_ptp_state { * @ext_ts_irq: the external timestamp IRQ in use * @kworker: kwork thread for handling periodic work * @perout_channels: periodic output data + * @extts_channels: channels for external timestamps * @info: structure defining PTP hardware capabilities * @clock: pointer to registered PTP clock device * @tstamp_config: hardware timestamping configuration @@ -249,6 +257,7 @@ struct ice_ptp { u8 ext_ts_irq; struct kthread_worker *kworker; struct ice_perout_channel perout_channels[GLTSYN_TGT_H_IDX_MAX]; + struct ice_extts_channel extts_channels[GLTSYN_TGT_H_IDX_MAX]; struct ptp_clock_info info; struct ptp_clock *clock; struct hwtstamp_config tstamp_config; diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 1bb026232efc..946edbad4302 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -938,7 +938,11 @@ static bool igc_is_crosststamp_supported(struct igc_adapter *adapter) static struct system_counterval_t igc_device_tstamp_to_system(u64 tstamp) { #if IS_ENABLED(CONFIG_X86_TSC) && !defined(CONFIG_UML) - return convert_art_ns_to_tsc(tstamp); + return (struct system_counterval_t) { + .cs_id = CSID_X86_ART, + .cycles = tstamp, + .use_nsecs = true, + }; #else return (struct system_counterval_t) { }; #endif diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 5352fee62d2b..0b9982804370 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -217,9 +217,9 @@ ltq_etop_free_channel(struct net_device *dev, struct ltq_etop_chan *ch) if (ch->dma.irq) free_irq(ch->dma.irq, priv); if (IS_RX(ch->idx)) { - int desc; + struct ltq_dma_channel *dma = &ch->dma; - for (desc = 0; desc < LTQ_DESC_NUM; desc++) + for (dma->desc = 0; dma->desc < LTQ_DESC_NUM; dma->desc++) dev_kfree_skb_any(ch->skb[ch->dma.desc]); } } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index 4a77f6fe2622..05b84581d5c5 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -1745,7 +1745,7 @@ struct cpt_lf_alloc_req_msg { u16 nix_pf_func; u16 sso_pf_func; u16 eng_grpmsk; - int blkaddr; + u8 blkaddr; u8 ctx_ilen_valid : 1; u8 ctx_ilen : 7; }; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/npc.h index d883157393ea..6c3aca6f278d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/npc.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/npc.h @@ -63,8 +63,13 @@ enum npc_kpu_lb_ltype { NPC_LT_LB_CUSTOM1 = 0xF, }; +/* Don't modify ltypes up to IP6_EXT, otherwise length and checksum of IP + * headers may not be checked correctly. IPv4 ltypes and IPv6 ltypes must + * differ only at bit 0 so mask 0xE can be used to detect extended headers. 
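With the reordering in the next hunk, the base and extended ltypes become adjacent values that differ only in bit 0 (IP/IP_OPT and IP6/IP6_EXT), so a mask of 0xE matches both members of a pair; this is exactly what the NPC_LT_LC_IP_MATCH_MSK and NPC_LT_LC_IP6_MATCH_MSK macros added further down compute. A small standalone program working through that arithmetic; only the relative enum values are taken from the hunk.

#include <stdio.h>
#include <stdint.h>

/* values as implied by the reordered enum in the hunk below */
enum { LT_PTP = 1, LT_IP, LT_IP_OPT, LT_IP6, LT_IP6_EXT };

/* ~(a ^ b) & 0xF keeps only the bits on which a and b agree */
#define MATCH_MSK(a, b) ((~((a) ^ (b))) & 0xF)

static int ltype_matches(uint8_t ltype, uint8_t want, uint8_t mask)
{
    return (ltype & mask) == (want & mask);
}

int main(void)
{
    uint8_t m4 = MATCH_MSK(LT_IP, LT_IP_OPT);    /* 0xE */
    uint8_t m6 = MATCH_MSK(LT_IP6, LT_IP6_EXT);  /* 0xE */

    printf("ipv4 mask 0x%X: ip=%d opt=%d ip6=%d\n", (unsigned)m4,
           ltype_matches(LT_IP, LT_IP, m4),       /* 1 */
           ltype_matches(LT_IP_OPT, LT_IP, m4),   /* 1 */
           ltype_matches(LT_IP6, LT_IP, m4));     /* 0 */
    printf("ipv6 mask 0x%X: ip6=%d ext=%d ip=%d\n", (unsigned)m6,
           ltype_matches(LT_IP6, LT_IP6, m6),     /* 1 */
           ltype_matches(LT_IP6_EXT, LT_IP6, m6), /* 1 */
           ltype_matches(LT_IP, LT_IP6, m6));     /* 0 */
    return 0;
}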
+ */ enum npc_kpu_lc_ltype { - NPC_LT_LC_IP = 1, + NPC_LT_LC_PTP = 1, + NPC_LT_LC_IP, NPC_LT_LC_IP_OPT, NPC_LT_LC_IP6, NPC_LT_LC_IP6_EXT, @@ -72,7 +77,6 @@ enum npc_kpu_lc_ltype { NPC_LT_LC_RARP, NPC_LT_LC_MPLS, NPC_LT_LC_NSH, - NPC_LT_LC_PTP, NPC_LT_LC_FCOE, NPC_LT_LC_NGIO, NPC_LT_LC_CUSTOM0 = 0xE, diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index ff78251f92d4..5f661e67ccbc 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -1643,7 +1643,7 @@ static int rvu_check_rsrc_availability(struct rvu *rvu, if (req->ssow > block->lf.max) { dev_err(&rvu->pdev->dev, "Func 0x%x: Invalid SSOW req, %d > max %d\n", - pcifunc, req->sso, block->lf.max); + pcifunc, req->ssow, block->lf.max); return -EINVAL; } mappedlfs = rvu_get_rsrc_mapcount(pfvf, block->addr); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c index f047185f38e0..3e09d2285814 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c @@ -696,7 +696,8 @@ int rvu_mbox_handler_cpt_rd_wr_register(struct rvu *rvu, struct cpt_rd_wr_reg_msg *req, struct cpt_rd_wr_reg_msg *rsp) { - int blkaddr; + u64 offset = req->reg_offset; + int blkaddr, lf; blkaddr = validate_and_get_cpt_blkaddr(req->blkaddr); if (blkaddr < 0) @@ -707,17 +708,25 @@ int rvu_mbox_handler_cpt_rd_wr_register(struct rvu *rvu, !is_cpt_vf(rvu, req->hdr.pcifunc)) return CPT_AF_ERR_ACCESS_DENIED; - rsp->reg_offset = req->reg_offset; - rsp->ret_val = req->ret_val; - rsp->is_write = req->is_write; - if (!is_valid_offset(rvu, req)) return CPT_AF_ERR_ACCESS_DENIED; + /* Translate local LF used by VFs to global CPT LF */ + lf = rvu_get_lf(rvu, &rvu->hw->block[blkaddr], req->hdr.pcifunc, + (offset & 0xFFF) >> 3); + + /* Translate local LF's offset to global CPT LF's offset */ + offset &= 0xFF000; + offset += lf << 3; + + rsp->reg_offset = offset; + rsp->ret_val = req->ret_val; + rsp->is_write = req->is_write; + if (req->is_write) - rvu_write64(rvu, blkaddr, req->reg_offset, req->val); + rvu_write64(rvu, blkaddr, offset, req->val); else - rsp->val = rvu_read64(rvu, blkaddr, req->reg_offset); + rsp->val = rvu_read64(rvu, blkaddr, offset); return 0; } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 00af8888e329..3dc828cf6c5a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -3864,6 +3864,11 @@ static int get_flowkey_alg_idx(struct nix_hw *nix_hw, u32 flow_cfg) return -ERANGE; } +/* Mask to match ipv6(NPC_LT_LC_IP6) and ipv6 ext(NPC_LT_LC_IP6_EXT) */ +#define NPC_LT_LC_IP6_MATCH_MSK ((~(NPC_LT_LC_IP6 ^ NPC_LT_LC_IP6_EXT)) & 0xf) +/* Mask to match both ipv4(NPC_LT_LC_IP) and ipv4 ext(NPC_LT_LC_IP_OPT) */ +#define NPC_LT_LC_IP_MATCH_MSK ((~(NPC_LT_LC_IP ^ NPC_LT_LC_IP_OPT)) & 0xf) + static int set_flowkey_fields(struct nix_rx_flowkey_alg *alg, u32 flow_cfg) { int idx, nr_field, key_off, field_marker, keyoff_marker; @@ -3933,7 +3938,7 @@ static int set_flowkey_fields(struct nix_rx_flowkey_alg *alg, u32 flow_cfg) field->hdr_offset = 9; /* offset */ field->bytesm1 = 0; /* 1 byte */ field->ltype_match = NPC_LT_LC_IP; - field->ltype_mask = 0xF; + field->ltype_mask = NPC_LT_LC_IP_MATCH_MSK; break; case NIX_FLOW_KEY_TYPE_IPV4: case NIX_FLOW_KEY_TYPE_INNR_IPV4: @@ -3960,8 +3965,7 @@ static int 
set_flowkey_fields(struct nix_rx_flowkey_alg *alg, u32 flow_cfg) field->bytesm1 = 3; /* DIP, 4 bytes */ } } - - field->ltype_mask = 0xF; /* Match only IPv4 */ + field->ltype_mask = NPC_LT_LC_IP_MATCH_MSK; keyoff_marker = false; break; case NIX_FLOW_KEY_TYPE_IPV6: @@ -3990,7 +3994,7 @@ static int set_flowkey_fields(struct nix_rx_flowkey_alg *alg, u32 flow_cfg) field->bytesm1 = 15; /* DIP,16 bytes */ } } - field->ltype_mask = 0xF; /* Match only IPv6 */ + field->ltype_mask = NPC_LT_LC_IP6_MATCH_MSK; break; case NIX_FLOW_KEY_TYPE_TCP: case NIX_FLOW_KEY_TYPE_UDP: diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index 31aebeb2e285..25989c79c92e 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -1524,6 +1524,7 @@ static int mtk_star_probe(struct platform_device *pdev) { struct device_node *of_node; struct mtk_star_priv *priv; + struct phy_device *phydev; struct net_device *ndev; struct device *dev; void __iomem *base; @@ -1649,6 +1650,12 @@ static int mtk_star_probe(struct platform_device *pdev) netif_napi_add(ndev, &priv->rx_napi, mtk_star_rx_poll); netif_napi_add_tx(ndev, &priv->tx_napi, mtk_star_tx_poll); + phydev = of_phy_find_device(priv->phy_node); + if (phydev) { + phydev->mac_managed_pm = true; + put_device(&phydev->mdio.dev); + } + return devm_register_netdev(dev, ndev); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index c54fd01ea635..3d274599015b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -989,7 +989,12 @@ static void mlx5e_xfrm_update_stats(struct xfrm_state *x) struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x); struct mlx5e_ipsec_rule *ipsec_rule = &sa_entry->ipsec_rule; struct net *net = dev_net(x->xso.dev); + u64 trailer_packets = 0, trailer_bytes = 0; + u64 replay_packets = 0, replay_bytes = 0; + u64 auth_packets = 0, auth_bytes = 0; + u64 success_packets, success_bytes; u64 packets, bytes, lastuse; + size_t headers; lockdep_assert(lockdep_is_held(&x->lock) || lockdep_is_held(&dev_net(x->xso.real_dev)->xfrm.xfrm_cfg_mutex) || @@ -999,26 +1004,43 @@ static void mlx5e_xfrm_update_stats(struct xfrm_state *x) return; if (sa_entry->attrs.dir == XFRM_DEV_OFFLOAD_IN) { - mlx5_fc_query_cached(ipsec_rule->auth.fc, &bytes, &packets, &lastuse); - x->stats.integrity_failed += packets; - XFRM_ADD_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR, packets); - - mlx5_fc_query_cached(ipsec_rule->trailer.fc, &bytes, &packets, &lastuse); - XFRM_ADD_STATS(net, LINUX_MIB_XFRMINHDRERROR, packets); + mlx5_fc_query_cached(ipsec_rule->auth.fc, &auth_bytes, + &auth_packets, &lastuse); + x->stats.integrity_failed += auth_packets; + XFRM_ADD_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR, auth_packets); + + mlx5_fc_query_cached(ipsec_rule->trailer.fc, &trailer_bytes, + &trailer_packets, &lastuse); + XFRM_ADD_STATS(net, LINUX_MIB_XFRMINHDRERROR, trailer_packets); } if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) return; - mlx5_fc_query_cached(ipsec_rule->fc, &bytes, &packets, &lastuse); - x->curlft.packets += packets; - x->curlft.bytes += bytes; - if (sa_entry->attrs.dir == XFRM_DEV_OFFLOAD_IN) { - mlx5_fc_query_cached(ipsec_rule->replay.fc, &bytes, &packets, &lastuse); - x->stats.replay += packets; - XFRM_ADD_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR, packets); + mlx5_fc_query_cached(ipsec_rule->replay.fc, 
&replay_bytes, + &replay_packets, &lastuse); + x->stats.replay += replay_packets; + XFRM_ADD_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR, replay_packets); } + + mlx5_fc_query_cached(ipsec_rule->fc, &bytes, &packets, &lastuse); + success_packets = packets - auth_packets - trailer_packets - replay_packets; + x->curlft.packets += success_packets; + /* NIC counts all bytes passed through flow steering and doesn't have + * an ability to count payload data size which is needed for SA. + * + * To overcome HW limitestion, let's approximate the payload size + * by removing always available headers. + */ + headers = sizeof(struct ethhdr); + if (sa_entry->attrs.family == AF_INET) + headers += sizeof(struct iphdr); + else + headers += sizeof(struct ipv6hdr); + + success_bytes = bytes - auth_bytes - trailer_bytes - replay_bytes; + x->curlft.bytes += success_bytes - headers * success_packets; } static int mlx5e_xfrm_validate_policy(struct mlx5_core_dev *mdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a605eae56685..eedbcba22689 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5868,6 +5868,11 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv) kfree(priv->htb_qos_sq_stats[i]); kvfree(priv->htb_qos_sq_stats); + if (priv->mqprio_rl) { + mlx5e_mqprio_rl_cleanup(priv->mqprio_rl); + mlx5e_mqprio_rl_free(priv->mqprio_rl); + } + memset(priv, 0, sizeof(*priv)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 5693986ae656..ac1565c0c8af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -1197,9 +1197,7 @@ static int get_num_eqs(struct mlx5_core_dev *dev) if (!mlx5_core_is_eth_enabled(dev) && mlx5_eth_supported(dev)) return 1; - max_dev_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ? - MLX5_CAP_GEN(dev, max_num_eqs) : - 1 << MLX5_CAP_GEN(dev, log_max_eq); + max_dev_eqs = mlx5_max_eq_cap_get(dev); num_eqs = min_t(int, mlx5_irq_table_get_num_comp(eq_table->irq_table), max_dev_eqs - MLX5_MAX_ASYNC_EQS); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c index 50d2ea323979..a436ce895e45 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c @@ -6,6 +6,9 @@ #include "helper.h" #include "ofld.h" +static int +acl_ingress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); + static bool esw_acl_ingress_prio_tag_enabled(struct mlx5_eswitch *esw, const struct mlx5_vport *vport) @@ -123,18 +126,31 @@ static int esw_acl_ingress_src_port_drop_create(struct mlx5_eswitch *esw, { struct mlx5_flow_act flow_act = {}; struct mlx5_flow_handle *flow_rule; + bool created = false; int err = 0; + if (!vport->ingress.acl) { + err = acl_ingress_ofld_setup(esw, vport); + if (err) + return err; + created = true; + } + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; flow_act.fg = vport->ingress.offloads.drop_grp; flow_rule = mlx5_add_flow_rules(vport->ingress.acl, NULL, &flow_act, NULL, 0); if (IS_ERR(flow_rule)) { err = PTR_ERR(flow_rule); - goto out; + goto err_out; } vport->ingress.offloads.drop_rule = flow_rule; -out: + + return 0; +err_out: + /* Only destroy ingress acl created in this function. 
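The created flag above follows the usual ownership rule for lazily built resources: the error path must tear down only what this call itself created, so an ACL that existed before the call is left untouched. A generic sketch of that pattern with placeholder resource names.

#include <stdbool.h>
#include <stdio.h>

struct vport { bool acl; bool drop_rule; };

static int acl_setup(struct vport *v)    { v->acl = true;  return 0; }
static void acl_cleanup(struct vport *v) { v->acl = false; }

static int add_drop_rule(struct vport *v, bool fail)
{
    bool created = false;
    int err;

    if (!v->acl) {              /* lazily create the ACL table */
        err = acl_setup(v);
        if (err)
            return err;
        created = true;
    }

    if (fail) {                 /* simulated flow-rule allocation failure */
        if (created)            /* undo only what we created here */
            acl_cleanup(v);
        return -1;
    }

    v->drop_rule = true;
    return 0;
}

int main(void)
{
    struct vport v = { 0 };

    printf("%d acl=%d\n", add_drop_rule(&v, true), v.acl);   /* -1 acl=0 */
    printf("%d acl=%d\n", add_drop_rule(&v, false), v.acl);  /* 0 acl=1 */
    return 0;
}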
*/ + if (created) + esw_acl_ingress_ofld_cleanup(esw, vport); return err; } @@ -299,16 +315,12 @@ static void esw_acl_ingress_ofld_groups_destroy(struct mlx5_vport *vport) } } -int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, - struct mlx5_vport *vport) +static int +acl_ingress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { int num_ftes = 0; int err; - if (!mlx5_eswitch_vport_match_metadata_enabled(esw) && - !esw_acl_ingress_prio_tag_enabled(esw, vport)) - return 0; - esw_acl_ingress_allow_rule_destroy(vport); if (mlx5_eswitch_vport_match_metadata_enabled(esw)) @@ -347,6 +359,15 @@ group_err: return err; } +int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + if (!mlx5_eswitch_vport_match_metadata_enabled(esw) && + !esw_acl_ingress_prio_tag_enabled(esw, vport)) + return 0; + + return acl_ingress_ofld_setup(esw, vport); +} + void esw_acl_ingress_ofld_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 592143d5e1da..72949cb85244 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -4600,20 +4600,26 @@ mlx5_devlink_port_fn_max_io_eqs_get(struct devlink_port *port, u32 *max_io_eqs, return -EOPNOTSUPP; } + if (!MLX5_CAP_GEN_2(esw->dev, max_num_eqs_24b)) { + NL_SET_ERR_MSG_MOD(extack, + "Device doesn't support getting the max number of EQs"); + return -EOPNOTSUPP; + } + query_ctx = kzalloc(query_out_sz, GFP_KERNEL); if (!query_ctx) return -ENOMEM; mutex_lock(&esw->state_lock); err = mlx5_vport_get_other_func_cap(esw->dev, vport_num, query_ctx, - MLX5_CAP_GENERAL); + MLX5_CAP_GENERAL_2); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed getting HCA caps"); goto out; } hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability); - max_eqs = MLX5_GET(cmd_hca_cap, hca_caps, max_num_eqs); + max_eqs = MLX5_GET(cmd_hca_cap_2, hca_caps, max_num_eqs_24b); if (max_eqs < MLX5_ESW_MAX_CTRL_EQS) *max_io_eqs = 0; else @@ -4644,6 +4650,12 @@ mlx5_devlink_port_fn_max_io_eqs_set(struct devlink_port *port, u32 max_io_eqs, return -EOPNOTSUPP; } + if (!MLX5_CAP_GEN_2(esw->dev, max_num_eqs_24b)) { + NL_SET_ERR_MSG_MOD(extack, + "Device doesn't support changing the max number of EQs"); + return -EOPNOTSUPP; + } + if (check_add_overflow(max_io_eqs, MLX5_ESW_MAX_CTRL_EQS, &max_eqs)) { NL_SET_ERR_MSG_MOD(extack, "Supplied value out of range"); return -EINVAL; @@ -4655,17 +4667,17 @@ mlx5_devlink_port_fn_max_io_eqs_set(struct devlink_port *port, u32 max_io_eqs, mutex_lock(&esw->state_lock); err = mlx5_vport_get_other_func_cap(esw->dev, vport_num, query_ctx, - MLX5_CAP_GENERAL); + MLX5_CAP_GENERAL_2); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed getting HCA caps"); goto out; } hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability); - MLX5_SET(cmd_hca_cap, hca_caps, max_num_eqs, max_eqs); + MLX5_SET(cmd_hca_cap_2, hca_caps, max_num_eqs_24b, max_eqs); err = mlx5_vport_set_other_func_cap(esw->dev, hca_caps, vport_num, - MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2); if (err) NL_SET_ERR_MSG_MOD(extack, "Failed setting HCA caps"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index c38342b9f320..a7fd18888b6e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -383,4 +383,14 @@ static inline int mlx5_vport_to_func_id(const struct mlx5_core_dev *dev, u16 vpo : vport; } +static inline int mlx5_max_eq_cap_get(const struct mlx5_core_dev *dev) +{ + if (MLX5_CAP_GEN_2(dev, max_num_eqs_24b)) + return MLX5_CAP_GEN_2(dev, max_num_eqs_24b); + + if (MLX5_CAP_GEN(dev, max_num_eqs)) + return MLX5_CAP_GEN(dev, max_num_eqs); + + return 1 << MLX5_CAP_GEN(dev, log_max_eq); +} #endif /* __MLX5_CORE_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index fb8787e30d3f..401d39069680 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -711,9 +711,7 @@ int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table) int mlx5_irq_table_create(struct mlx5_core_dev *dev) { - int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ? - MLX5_CAP_GEN(dev, max_num_eqs) : - 1 << MLX5_CAP_GEN(dev, log_max_eq); + int num_eqs = mlx5_max_eq_cap_get(dev); int total_vec; int pcif_vec; int req_vec; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c index 025e0db983fe..b032d5a4b3b8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c @@ -1484,6 +1484,7 @@ err_type_file_file_validate: vfree(types_info->data); err_data_alloc: kfree(types_info); + linecards->types_info = NULL; return err; } diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 6453c92f0fa7..7fa1820db9cc 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -352,11 +352,11 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) netif_dbg(ks, intr, ks->netdev, "%s: txspace %d\n", __func__, tx_space); - spin_lock(&ks->statelock); + spin_lock_bh(&ks->statelock); ks->tx_space = tx_space; if (netif_queue_stopped(ks->netdev)) netif_wake_queue(ks->netdev); - spin_unlock(&ks->statelock); + spin_unlock_bh(&ks->statelock); } if (status & IRQ_SPIBEI) { @@ -482,6 +482,7 @@ static int ks8851_net_open(struct net_device *dev) ks8851_wrreg16(ks, KS_IER, ks->rc_ier); ks->queued_len = 0; + ks->tx_space = ks8851_rdreg16(ks, KS_TXMIR); netif_start_queue(ks->netdev); netif_dbg(ks, ifup, ks->netdev, "network device up\n"); @@ -635,14 +636,14 @@ static void ks8851_set_rx_mode(struct net_device *dev) /* schedule work to do the actual set of the data if needed */ - spin_lock(&ks->statelock); + spin_lock_bh(&ks->statelock); if (memcmp(&rxctrl, &ks->rxctrl, sizeof(rxctrl)) != 0) { memcpy(&ks->rxctrl, &rxctrl, sizeof(ks->rxctrl)); schedule_work(&ks->rxctrl_work); } - spin_unlock(&ks->statelock); + spin_unlock_bh(&ks->statelock); } static int ks8851_set_mac_address(struct net_device *dev, void *addr) @@ -1101,7 +1102,6 @@ int ks8851_probe_common(struct net_device *netdev, struct device *dev, int ret; ks->netdev = netdev; - ks->tx_space = 6144; ks->gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH); ret = PTR_ERR_OR_ZERO(ks->gpio); diff --git a/drivers/net/ethernet/micrel/ks8851_spi.c b/drivers/net/ethernet/micrel/ks8851_spi.c index 670c1de966db..3062cc0f9199 100644 --- a/drivers/net/ethernet/micrel/ks8851_spi.c +++ b/drivers/net/ethernet/micrel/ks8851_spi.c @@ -340,10 +340,10 @@ static void ks8851_tx_work(struct work_struct *work) tx_space = ks8851_rdreg16_spi(ks, KS_TXMIR); - spin_lock(&ks->statelock); + 
spin_lock_bh(&ks->statelock); ks->queued_len -= dequeued_len; ks->tx_space = tx_space; - spin_unlock(&ks->statelock); + spin_unlock_bh(&ks->statelock); ks8851_unlock_spi(ks, &flags); } diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index dcab638c57fe..24c90d8f5a44 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -871,13 +871,13 @@ static void rswitch_tx_free(struct net_device *ndev) dma_rmb(); skb = gq->skbs[gq->dirty]; if (skb) { + rdev->ndev->stats.tx_packets++; + rdev->ndev->stats.tx_bytes += skb->len; dma_unmap_single(ndev->dev.parent, gq->unmap_addrs[gq->dirty], skb->len, DMA_TO_DEVICE); dev_kfree_skb_any(gq->skbs[gq->dirty]); gq->skbs[gq->dirty] = NULL; - rdev->ndev->stats.tx_packets++; - rdev->ndev->stats.tx_bytes += skb->len; } desc->desc.die_dt = DT_EEMPTY; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 60283543ffc8..e73fa34237d3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -390,10 +390,11 @@ static int intel_crosststamp(ktime_t *device, *device = ns_to_ktime(ptp_time); read_unlock_irqrestore(&priv->ptp_lock, flags); get_arttime(priv->mii, intel_priv->mdio_adhoc_addr, &art_time); - *system = convert_art_to_tsc(art_time); + system->cycles = art_time; } system->cycles *= intel_priv->crossts_adj; + system->cs_id = CSID_X86_ART; priv->plat->flags &= ~STMMAC_FLAG_INT_SNAPSHOT_EN; return 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 65d7370b47d5..466c4002f00d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -272,7 +272,7 @@ static const struct ethqos_emac_por emac_v4_0_0_por[] = { static const struct ethqos_emac_driver_data emac_v4_0_0_data = { .por = emac_v4_0_0_por, - .num_por = ARRAY_SIZE(emac_v3_0_0_por), + .num_por = ARRAY_SIZE(emac_v4_0_0_por), .rgmii_config_loopback_en = false, .has_emac_ge_3 = true, .link_clk_name = "phyaux", diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b3afc7cb7d72..c58782c41417 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7662,9 +7662,10 @@ int stmmac_dvr_probe(struct device *device, #ifdef STMMAC_VLAN_TAG_USED /* Both mac100 and gmac support receive VLAN tag detection */ ndev->features |= NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX; - ndev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; - priv->hw->hw_vlan_en = true; - + if (priv->plat->has_gmac4) { + ndev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; + priv->hw->hw_vlan_en = true; + } if (priv->dma_cap.vlhash) { ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; ndev->features |= NETIF_F_HW_VLAN_STAG_FILTER; diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index 7c4b6881a93f..d1b682ce9c6d 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -1959,6 +1959,7 @@ int wx_sw_init(struct wx *wx) } bitmap_zero(wx->state, WX_STATE_NBITS); + wx->misc_irq_domain = false; return 0; } diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 68bde91b67a0..81bedc8ee8d4 100644 --- 
a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -1686,6 +1686,7 @@ static int wx_set_interrupt_capability(struct wx *wx) } pdev->irq = pci_irq_vector(pdev, 0); + wx->num_q_vectors = 1; return 0; } @@ -1996,7 +1997,8 @@ void wx_free_irq(struct wx *wx) int vector; if (!(pdev->msix_enabled)) { - free_irq(pdev->irq, wx); + if (!wx->misc_irq_domain) + free_irq(pdev->irq, wx); return; } @@ -2011,7 +2013,7 @@ void wx_free_irq(struct wx *wx) free_irq(entry->vector, q_vector); } - if (wx->mac.type == wx_mac_em) + if (!wx->misc_irq_domain) free_irq(wx->msix_entry->vector, wx); } EXPORT_SYMBOL(wx_free_irq); @@ -2026,6 +2028,9 @@ int wx_setup_isb_resources(struct wx *wx) { struct pci_dev *pdev = wx->pdev; + if (wx->isb_mem) + return 0; + wx->isb_mem = dma_alloc_coherent(&pdev->dev, sizeof(u32) * 4, &wx->isb_dma, @@ -2385,7 +2390,6 @@ static void wx_free_all_tx_resources(struct wx *wx) void wx_free_resources(struct wx *wx) { - wx_free_isb_resources(wx); wx_free_all_rx_resources(wx); wx_free_all_tx_resources(wx); } diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index 5aaf7b1fa2db..0df7f5712b6f 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -1058,6 +1058,7 @@ struct wx { dma_addr_t isb_dma; u32 *isb_mem; u32 isb_tag[WX_ISB_MAX]; + bool misc_irq_domain; #define WX_MAX_RETA_ENTRIES 128 #define WX_RSS_INDIR_TBL_MAX 64 diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c index e894e01d030d..af30ca0312b8 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c @@ -387,6 +387,7 @@ err_dis_phy: err_free_irq: wx_free_irq(wx); err_free_resources: + wx_free_isb_resources(wx); wx_free_resources(wx); return err; } @@ -408,6 +409,7 @@ static int ngbe_close(struct net_device *netdev) ngbe_down(wx); wx_free_irq(wx); + wx_free_isb_resources(wx); wx_free_resources(wx); phylink_disconnect_phy(wx->phylink); wx_control_hw(wx, false); diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c index b3e3605d1edb..a4cf682dca65 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c @@ -27,57 +27,19 @@ void txgbe_irq_enable(struct wx *wx, bool queues) } /** - * txgbe_intr - msi/legacy mode Interrupt Handler - * @irq: interrupt number - * @data: pointer to a network interface device structure - **/ -static irqreturn_t txgbe_intr(int __always_unused irq, void *data) -{ - struct wx_q_vector *q_vector; - struct wx *wx = data; - struct pci_dev *pdev; - u32 eicr; - - q_vector = wx->q_vector[0]; - pdev = wx->pdev; - - eicr = wx_misc_isb(wx, WX_ISB_VEC0); - if (!eicr) { - /* shared interrupt alert! - * the interrupt that we masked before the ICR read. - */ - if (netif_running(wx->netdev)) - txgbe_irq_enable(wx, true); - return IRQ_NONE; /* Not our interrupt */ - } - wx->isb_mem[WX_ISB_VEC0] = 0; - if (!(pdev->msi_enabled)) - wr32(wx, WX_PX_INTA, 1); - - wx->isb_mem[WX_ISB_MISC] = 0; - /* would disable interrupts here but it is auto disabled */ - napi_schedule_irqoff(&q_vector->napi); - - /* re-enable link(maybe) and non-queue interrupts, no flush. 
- * txgbe_poll will re-enable the queue interrupts - */ - if (netif_running(wx->netdev)) - txgbe_irq_enable(wx, false); - - return IRQ_HANDLED; -} - -/** - * txgbe_request_msix_irqs - Initialize MSI-X interrupts + * txgbe_request_queue_irqs - Initialize MSI-X queue interrupts * @wx: board private structure * - * Allocate MSI-X vectors and request interrupts from the kernel. + * Allocate MSI-X queue vectors and request interrupts from the kernel. **/ -static int txgbe_request_msix_irqs(struct wx *wx) +int txgbe_request_queue_irqs(struct wx *wx) { struct net_device *netdev = wx->netdev; int vector, err; + if (!wx->pdev->msix_enabled) + return 0; + for (vector = 0; vector < wx->num_q_vectors; vector++) { struct wx_q_vector *q_vector = wx->q_vector[vector]; struct msix_entry *entry = &wx->msix_q_entries[vector]; @@ -110,34 +72,6 @@ free_queue_irqs: return err; } -/** - * txgbe_request_irq - initialize interrupts - * @wx: board private structure - * - * Attempt to configure interrupts using the best available - * capabilities of the hardware and kernel. - **/ -int txgbe_request_irq(struct wx *wx) -{ - struct net_device *netdev = wx->netdev; - struct pci_dev *pdev = wx->pdev; - int err; - - if (pdev->msix_enabled) - err = txgbe_request_msix_irqs(wx); - else if (pdev->msi_enabled) - err = request_irq(wx->pdev->irq, &txgbe_intr, 0, - netdev->name, wx); - else - err = request_irq(wx->pdev->irq, &txgbe_intr, IRQF_SHARED, - netdev->name, wx); - - if (err) - wx_err(wx, "request_irq failed, Error %d\n", err); - - return err; -} - static int txgbe_request_gpio_irq(struct txgbe *txgbe) { txgbe->gpio_irq = irq_find_mapping(txgbe->misc.domain, TXGBE_IRQ_GPIO); @@ -178,6 +112,36 @@ static const struct irq_domain_ops txgbe_misc_irq_domain_ops = { static irqreturn_t txgbe_misc_irq_handle(int irq, void *data) { + struct wx_q_vector *q_vector; + struct txgbe *txgbe = data; + struct wx *wx = txgbe->wx; + u32 eicr; + + if (wx->pdev->msix_enabled) + return IRQ_WAKE_THREAD; + + eicr = wx_misc_isb(wx, WX_ISB_VEC0); + if (!eicr) { + /* shared interrupt alert! + * the interrupt that we masked before the ICR read. 
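On a shared INTx line the primary handler must return IRQ_NONE when the cause register reads zero, so the kernel can pass the interrupt on to the other devices sharing the line; when there is work it kicks NAPI and returns IRQ_WAKE_THREAD so the threaded handler deals with the non-queue sources. A toy model of that return-value contract; the cause value is passed in rather than read from hardware, and the enum is defined locally.

#include <stdio.h>
#include <stdint.h>

enum irqreturn { IRQ_NONE, IRQ_HANDLED, IRQ_WAKE_THREAD };

static enum irqreturn primary_handler(uint32_t cause, int msix)
{
    if (msix)                   /* MSI-X: the misc vector is always ours */
        return IRQ_WAKE_THREAD;

    if (!cause)                 /* shared line, someone else's interrupt */
        return IRQ_NONE;

    printf("schedule napi for queue vector 0\n");
    return IRQ_WAKE_THREAD;     /* thread handles GPIO/link sources */
}

int main(void)
{
    printf("%d\n", primary_handler(0, 0));    /* 0: IRQ_NONE */
    printf("%d\n", primary_handler(0x1, 0));  /* 2: IRQ_WAKE_THREAD */
    printf("%d\n", primary_handler(0, 1));    /* 2: MSI-X path */
    return 0;
}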
+ */ + if (netif_running(wx->netdev)) + txgbe_irq_enable(wx, true); + return IRQ_NONE; /* Not our interrupt */ + } + wx->isb_mem[WX_ISB_VEC0] = 0; + if (!(wx->pdev->msi_enabled)) + wr32(wx, WX_PX_INTA, 1); + + /* would disable interrupts here but it is auto disabled */ + q_vector = wx->q_vector[0]; + napi_schedule_irqoff(&q_vector->napi); + + return IRQ_WAKE_THREAD; +} + +static irqreturn_t txgbe_misc_irq_thread_fn(int irq, void *data) +{ struct txgbe *txgbe = data; struct wx *wx = txgbe->wx; unsigned int nhandled = 0; @@ -223,6 +187,7 @@ void txgbe_free_misc_irq(struct txgbe *txgbe) int txgbe_setup_misc_irq(struct txgbe *txgbe) { + unsigned long flags = IRQF_ONESHOT; struct wx *wx = txgbe->wx; int hwirq, err; @@ -236,14 +201,17 @@ int txgbe_setup_misc_irq(struct txgbe *txgbe) irq_create_mapping(txgbe->misc.domain, hwirq); txgbe->misc.chip = txgbe_irq_chip; - if (wx->pdev->msix_enabled) + if (wx->pdev->msix_enabled) { txgbe->misc.irq = wx->msix_entry->vector; - else + } else { txgbe->misc.irq = wx->pdev->irq; + if (!wx->pdev->msi_enabled) + flags |= IRQF_SHARED; + } - err = request_threaded_irq(txgbe->misc.irq, NULL, - txgbe_misc_irq_handle, - IRQF_ONESHOT, + err = request_threaded_irq(txgbe->misc.irq, txgbe_misc_irq_handle, + txgbe_misc_irq_thread_fn, + flags, wx->netdev->name, txgbe); if (err) goto del_misc_irq; @@ -256,6 +224,8 @@ int txgbe_setup_misc_irq(struct txgbe *txgbe) if (err) goto free_gpio_irq; + wx->misc_irq_domain = true; + return 0; free_gpio_irq: diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.h b/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.h index b77945e7a0f2..e6285b94625e 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.h +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.h @@ -2,6 +2,6 @@ /* Copyright (c) 2015 - 2024 Beijing WangXun Technology Co., Ltd. */ void txgbe_irq_enable(struct wx *wx, bool queues); -int txgbe_request_irq(struct wx *wx); +int txgbe_request_queue_irqs(struct wx *wx); void txgbe_free_misc_irq(struct txgbe *txgbe); int txgbe_setup_misc_irq(struct txgbe *txgbe); diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index 8c7a74981b90..ca74d9422065 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -294,9 +294,9 @@ static int txgbe_open(struct net_device *netdev) wx_configure(wx); - err = txgbe_request_irq(wx); + err = txgbe_request_queue_irqs(wx); if (err) - goto err_free_isb; + goto err_free_resources; /* Notify the stack of the actual queue counts. 
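The interrupt flags chosen a few hunks up reduce to a small decision: IRQF_ONESHOT is always used because the handler is threaded, and IRQF_SHARED is added only when the device fell back to legacy INTx, since MSI and MSI-X vectors are never shared. A sketch of that selection with local stand-ins for the kernel's IRQF_* bits.

#include <stdio.h>
#include <stdbool.h>

/* local stand-ins for the kernel's IRQF_* bits */
#define F_ONESHOT  0x1
#define F_SHARED   0x2

static unsigned misc_irq_flags(bool msix, bool msi)
{
    unsigned flags = F_ONESHOT;   /* threaded handler, keep the line masked */

    if (!msix && !msi)            /* legacy INTx may be shared */
        flags |= F_SHARED;
    return flags;
}

int main(void)
{
    printf("msix: 0x%x\n", misc_irq_flags(true, false));   /* 0x1 */
    printf("msi:  0x%x\n", misc_irq_flags(false, true));   /* 0x1 */
    printf("intx: 0x%x\n", misc_irq_flags(false, false));  /* 0x3 */
    return 0;
}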
*/ err = netif_set_real_num_tx_queues(netdev, wx->num_tx_queues); @@ -313,8 +313,8 @@ static int txgbe_open(struct net_device *netdev) err_free_irq: wx_free_irq(wx); -err_free_isb: - wx_free_isb_resources(wx); +err_free_resources: + wx_free_resources(wx); err_reset: txgbe_reset(wx); @@ -729,6 +729,7 @@ static void txgbe_remove(struct pci_dev *pdev) txgbe_remove_phy(txgbe); txgbe_free_misc_irq(txgbe); + wx_free_isb_resources(wx); pci_release_selected_regions(pdev, pci_select_bars(pdev, IORESOURCE_MEM)); diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c index ffe7f463e16e..ef6df0e37bea 100644 --- a/drivers/net/ntb_netdev.c +++ b/drivers/net/ntb_netdev.c @@ -119,7 +119,7 @@ static void ntb_netdev_rx_handler(struct ntb_transport_qp *qp, void *qp_data, skb->protocol = eth_type_trans(skb, ndev); skb->ip_summed = CHECKSUM_NONE; - if (__netif_rx(skb) == NET_RX_DROP) { + if (netif_rx(skb) == NET_RX_DROP) { ndev->stats.rx_errors++; ndev->stats.rx_dropped++; } else { diff --git a/drivers/net/phy/aquantia/aquantia.h b/drivers/net/phy/aquantia/aquantia.h index 1c19ae74ad2b..4830b25e6c7d 100644 --- a/drivers/net/phy/aquantia/aquantia.h +++ b/drivers/net/phy/aquantia/aquantia.h @@ -6,6 +6,9 @@ * Author: Heiner Kallweit <[email protected]> */ +#ifndef AQUANTIA_H +#define AQUANTIA_H + #include <linux/device.h> #include <linux/phy.h> @@ -120,3 +123,5 @@ static inline int aqr_hwmon_probe(struct phy_device *phydev) { return 0; } #endif int aqr_firmware_load(struct phy_device *phydev); + +#endif /* AQUANTIA_H */ diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c index a838b61cd844..a35528497a57 100644 --- a/drivers/net/phy/microchip_t1.c +++ b/drivers/net/phy/microchip_t1.c @@ -748,7 +748,7 @@ static int lan87xx_cable_test_report(struct phy_device *phydev) ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A, lan87xx_cable_test_report_trans(detect)); - return 0; + return phy_init_hw(phydev); } static int lan87xx_cable_test_get_status(struct phy_device *phydev, diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 0a65b6d690fe..eb9acfcaeb09 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -70,6 +70,7 @@ #define MPHDRLEN_SSN 4 /* ditto with short sequence numbers */ #define PPP_PROTO_LEN 2 +#define PPP_LCP_HDRLEN 4 /* * An instance of /dev/ppp can be associated with either a ppp @@ -493,6 +494,15 @@ static ssize_t ppp_read(struct file *file, char __user *buf, return ret; } +static bool ppp_check_packet(struct sk_buff *skb, size_t count) +{ + /* LCP packets must include LCP header which 4 bytes long: + * 1-byte code, 1-byte identifier, and 2-byte length. 
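The helper being added here refuses LCP frames that are too short to carry the 4-byte LCP header after the 2-byte protocol field, so malformed writes are rejected with EINVAL before they reach the channel. A standalone version of the same length rule; the constants are defined locally for illustration (0xc021 is the LCP protocol number).

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

#define PROTO_LEN   2        /* 2-byte PPP protocol field */
#define LCP_HDRLEN  4        /* code + identifier + 2-byte length */
#define PROTO_LCP   0xc021   /* LCP protocol number */

/* buf starts with the 2-byte protocol field, count is the full length */
static int packet_ok(const uint8_t *buf, size_t count)
{
    uint16_t proto = (uint16_t)(buf[0] << 8 | buf[1]);

    /* only LCP frames are required to carry a full LCP header */
    return proto != PROTO_LCP || count >= PROTO_LEN + LCP_HDRLEN;
}

int main(void)
{
    uint8_t short_lcp[] = { 0xc0, 0x21, 0x01 };                  /* truncated */
    uint8_t good_lcp[]  = { 0xc0, 0x21, 0x01, 0x01, 0x00, 0x04 };

    printf("%d %d\n", packet_ok(short_lcp, sizeof(short_lcp)),
           packet_ok(good_lcp, sizeof(good_lcp)));   /* 0 1 */
    return 0;
}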
+ */ + return get_unaligned_be16(skb->data) != PPP_LCP || + count >= PPP_PROTO_LEN + PPP_LCP_HDRLEN; +} + static ssize_t ppp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -515,6 +525,11 @@ static ssize_t ppp_write(struct file *file, const char __user *buf, kfree_skb(skb); goto out; } + ret = -EINVAL; + if (unlikely(!ppp_check_packet(skb, count))) { + kfree_skb(skb); + goto out; + } switch (pf->kind) { case INTERFACE: diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c index 0ba714ca5185..4b8528206cc8 100644 --- a/drivers/net/wireguard/allowedips.c +++ b/drivers/net/wireguard/allowedips.c @@ -15,8 +15,8 @@ static void swap_endian(u8 *dst, const u8 *src, u8 bits) if (bits == 32) { *(u32 *)dst = be32_to_cpu(*(const __be32 *)src); } else if (bits == 128) { - ((u64 *)dst)[0] = be64_to_cpu(((const __be64 *)src)[0]); - ((u64 *)dst)[1] = be64_to_cpu(((const __be64 *)src)[1]); + ((u64 *)dst)[0] = get_unaligned_be64(src); + ((u64 *)dst)[1] = get_unaligned_be64(src + 8); } } diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index 1ea4f874e367..7eb76724b3ed 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -124,10 +124,10 @@ static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id) */ static inline int wg_cpumask_next_online(int *last_cpu) { - int cpu = cpumask_next(*last_cpu, cpu_online_mask); + int cpu = cpumask_next(READ_ONCE(*last_cpu), cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_online_mask); - *last_cpu = cpu; + WRITE_ONCE(*last_cpu, cpu); return cpu; } diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index 0d48e0f4a1ba..26e09c30d596 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -222,7 +222,7 @@ void wg_packet_send_keepalive(struct wg_peer *peer) { struct sk_buff *skb; - if (skb_queue_empty(&peer->staged_packet_queue)) { + if (skb_queue_empty_lockless(&peer->staged_packet_queue)) { skb = alloc_skb(DATA_PACKET_HEAD_ROOM + MESSAGE_MINIMUM_LENGTH, GFP_ATOMIC); if (unlikely(!skb)) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 18ce060df9b5..dac6155ae1bd 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -654,7 +654,7 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm) hw->wiphy->features |= NL80211_FEATURE_WFA_TPC_IE_IN_PROBES; if (iwl_fw_lookup_cmd_ver(mvm->fw, WOWLAN_KEK_KCK_MATERIAL, - IWL_FW_CMD_VER_UNKNOWN) == 3) + IWL_FW_CMD_VER_UNKNOWN) >= 3) hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK; if (fw_has_api(&mvm->fw->ucode_capa, @@ -1656,7 +1656,8 @@ static void iwl_mvm_prevent_esr_done_wk(struct wiphy *wiphy, struct iwl_mvm_vif *mvmvif = container_of(wk, struct iwl_mvm_vif, prevent_esr_done_wk.work); struct iwl_mvm *mvm = mvmvif->mvm; - struct ieee80211_vif *vif = iwl_mvm_get_bss_vif(mvm); + struct ieee80211_vif *vif = + container_of((void *)mvmvif, struct ieee80211_vif, drv_priv); mutex_lock(&mvm->mutex); iwl_mvm_unblock_esr(mvm, vif, IWL_MVM_ESR_BLOCKED_PREVENTION); @@ -1682,7 +1683,8 @@ static void iwl_mvm_unblock_esr_tpt(struct wiphy *wiphy, struct wiphy_work *wk) struct iwl_mvm_vif *mvmvif = container_of(wk, struct iwl_mvm_vif, unblock_esr_tpt_wk); struct iwl_mvm *mvm = mvmvif->mvm; - struct ieee80211_vif *vif = iwl_mvm_get_bss_vif(mvm); + struct ieee80211_vif *vif = + container_of((void *)mvmvif, struct 
ieee80211_vif, drv_priv); mutex_lock(&mvm->mutex); iwl_mvm_unblock_esr(mvm, vif, IWL_MVM_ESR_BLOCKED_TPT); @@ -6410,11 +6412,9 @@ void iwl_mvm_sync_rx_queues_internal(struct iwl_mvm *mvm, if (sync) { lockdep_assert_held(&mvm->mutex); ret = wait_event_timeout(mvm->rx_sync_waitq, - READ_ONCE(mvm->queue_sync_state) == 0 || - iwl_mvm_is_radio_hw_killed(mvm), + READ_ONCE(mvm->queue_sync_state) == 0, SYNC_RX_QUEUE_TIMEOUT); - WARN_ONCE(!ret && !iwl_mvm_is_radio_hw_killed(mvm), - "queue sync: failed to sync, state is 0x%lx, cookie %d\n", + WARN_ONCE(!ret, "queue sync: failed to sync, state is 0x%lx, cookie %d\n", mvm->queue_sync_state, mvm->queue_sync_cookie); } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index 53283d052e18..d343432474db 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -153,7 +153,7 @@ static void iwl_mvm_rx_esr_mode_notif(struct iwl_mvm *mvm, struct ieee80211_vif *vif = iwl_mvm_get_bss_vif(mvm); /* FW recommendations is only for entering EMLSR */ - if (!vif || iwl_mvm_vif_from_mac80211(vif)->esr_active) + if (IS_ERR_OR_NULL(vif) || iwl_mvm_vif_from_mac80211(vif)->esr_active) return; if (le32_to_cpu(notif->action) == ESR_RECOMMEND_ENTER) @@ -1912,12 +1912,10 @@ static bool iwl_mvm_set_hw_rfkill_state(struct iwl_op_mode *op_mode, bool state) bool rfkill_safe_init_done = READ_ONCE(mvm->rfkill_safe_init_done); bool unified = iwl_mvm_has_unified_ucode(mvm); - if (state) { + if (state) set_bit(IWL_MVM_STATUS_HW_RFKILL, &mvm->status); - wake_up(&mvm->rx_sync_waitq); - } else { + else clear_bit(IWL_MVM_STATUS_HW_RFKILL, &mvm->status); - } iwl_mvm_set_rfkill_state(mvm); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c index 4fa8066a89b6..6e933907f985 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c @@ -557,12 +557,10 @@ struct iwl_mvm_stat_data_all_macs { }; static void iwl_mvm_update_link_sig(struct ieee80211_vif *vif, int sig, - struct iwl_mvm_vif_link_info *link_info) + struct iwl_mvm_vif_link_info *link_info, + struct ieee80211_bss_conf *bss_conf) { struct iwl_mvm *mvm = iwl_mvm_vif_from_mac80211(vif)->mvm; - struct ieee80211_bss_conf *bss_conf = - iwl_mvm_rcu_fw_link_id_to_link_conf(mvm, link_info->fw_link_id, - false); int thold = bss_conf->cqm_rssi_thold; int hyst = bss_conf->cqm_rssi_hyst; int last_event; @@ -670,7 +668,7 @@ static void iwl_mvm_stat_iterator(void *_data, u8 *mac, mvmvif->deflink.beacon_stats.num_beacons; /* This is used in pre-MLO API so use deflink */ - iwl_mvm_update_link_sig(vif, sig, &mvmvif->deflink); + iwl_mvm_update_link_sig(vif, sig, &mvmvif->deflink, &vif->bss_conf); } static void iwl_mvm_stat_iterator_all_macs(void *_data, u8 *mac, @@ -705,7 +703,7 @@ static void iwl_mvm_stat_iterator_all_macs(void *_data, u8 *mac, sig = -le32_to_cpu(mac_stats->beacon_filter_average_energy); /* This is used in pre-MLO API so use deflink */ - iwl_mvm_update_link_sig(vif, sig, &mvmvif->deflink); + iwl_mvm_update_link_sig(vif, sig, &mvmvif->deflink, &vif->bss_conf); } static inline void @@ -921,7 +919,8 @@ iwl_mvm_stat_iterator_all_links(struct iwl_mvm *mvm, mvmvif->link[link_id]->beacon_stats.num_beacons; sig = -le32_to_cpu(link_stats->beacon_filter_average_energy); - iwl_mvm_update_link_sig(bss_conf->vif, sig, link_info); + iwl_mvm_update_link_sig(bss_conf->vif, sig, link_info, + bss_conf); if (WARN_ONCE(mvmvif->id >= MAC_INDEX_AUX, 
"invalid mvmvif id: %d", mvmvif->id)) @@ -967,7 +966,7 @@ static void iwl_mvm_update_esr_mode_tpt(struct iwl_mvm *mvm) lockdep_assert_held(&mvm->mutex); - if (!bss_vif) + if (IS_ERR_OR_NULL(bss_vif)) return; mvmvif = iwl_mvm_vif_from_mac80211(bss_vif); diff --git a/drivers/net/wireless/microchip/wilc1000/hif.c b/drivers/net/wireless/microchip/wilc1000/hif.c index f1085ccb7eed..7719e4f3e2a2 100644 --- a/drivers/net/wireless/microchip/wilc1000/hif.c +++ b/drivers/net/wireless/microchip/wilc1000/hif.c @@ -382,7 +382,8 @@ wilc_parse_join_bss_param(struct cfg80211_bss *bss, struct ieee80211_p2p_noa_attr noa_attr; const struct cfg80211_bss_ies *ies; struct wilc_join_bss_param *param; - u8 rates_len = 0, ies_len; + u8 rates_len = 0; + int ies_len; int ret; param = kzalloc(sizeof(*param), GFP_KERNEL); diff --git a/drivers/net/wireless/ti/wlcore/cmd.c b/drivers/net/wireless/ti/wlcore/cmd.c index a939fd89a7f5..92fc2d456c2c 100644 --- a/drivers/net/wireless/ti/wlcore/cmd.c +++ b/drivers/net/wireless/ti/wlcore/cmd.c @@ -1566,13 +1566,6 @@ int wl12xx_cmd_add_peer(struct wl1271 *wl, struct wl12xx_vif *wlvif, cpu_to_le32(wl1271_tx_enabled_rates_get(wl, sta_rates, wlvif->band)); - if (!cmd->supported_rates) { - wl1271_debug(DEBUG_CMD, - "peer has no supported rates yet, configuring basic rates: 0x%x", - wlvif->basic_rate_set); - cmd->supported_rates = cpu_to_le32(wlvif->basic_rate_set); - } - wl1271_debug(DEBUG_CMD, "new peer rates=0x%x queues=0x%x", cmd->supported_rates, sta->uapsd_queues); diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index ef12169f8044..492cd7aef44f 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -5139,19 +5139,23 @@ static int wl12xx_update_sta_state(struct wl1271 *wl, /* Add station (AP mode) */ if (is_ap && - old_state == IEEE80211_STA_NOTEXIST && - new_state == IEEE80211_STA_NONE) { + old_state == IEEE80211_STA_AUTH && + new_state == IEEE80211_STA_ASSOC) { ret = wl12xx_sta_add(wl, wlvif, sta); if (ret) return ret; + wl_sta->fw_added = true; + wlcore_update_inconn_sta(wl, wlvif, wl_sta, true); } /* Remove station (AP mode) */ if (is_ap && - old_state == IEEE80211_STA_NONE && - new_state == IEEE80211_STA_NOTEXIST) { + old_state == IEEE80211_STA_ASSOC && + new_state == IEEE80211_STA_AUTH) { + wl_sta->fw_added = false; + /* must not fail */ wl12xx_sta_remove(wl, wlvif, sta); @@ -5165,11 +5169,6 @@ static int wl12xx_update_sta_state(struct wl1271 *wl, if (ret < 0) return ret; - /* reconfigure rates */ - ret = wl12xx_cmd_add_peer(wl, wlvif, sta, wl_sta->hlid); - if (ret < 0) - return ret; - ret = wl1271_acx_set_ht_capabilities(wl, &sta->deflink.ht_cap, true, wl_sta->hlid); diff --git a/drivers/net/wireless/ti/wlcore/tx.c b/drivers/net/wireless/ti/wlcore/tx.c index 7bd3ce2f0804..464587d16ab2 100644 --- a/drivers/net/wireless/ti/wlcore/tx.c +++ b/drivers/net/wireless/ti/wlcore/tx.c @@ -140,11 +140,8 @@ EXPORT_SYMBOL(wl12xx_is_dummy_packet); static u8 wl12xx_tx_get_hlid_ap(struct wl1271 *wl, struct wl12xx_vif *wlvif, struct sk_buff *skb, struct ieee80211_sta *sta) { - if (sta) { - struct wl1271_station *wl_sta; - - wl_sta = (struct wl1271_station *)sta->drv_priv; - return wl_sta->hlid; + if (sta && wl1271_station(sta)->fw_added) { + return wl1271_station(sta)->hlid; } else { struct ieee80211_hdr *hdr; diff --git a/drivers/net/wireless/ti/wlcore/wlcore_i.h b/drivers/net/wireless/ti/wlcore/wlcore_i.h index eefae3f867b9..817a8a61cac6 100644 --- a/drivers/net/wireless/ti/wlcore/wlcore_i.h +++ 
b/drivers/net/wireless/ti/wlcore/wlcore_i.h @@ -324,6 +324,7 @@ struct wl12xx_rx_filter { struct wl1271_station { u8 hlid; + bool fw_added; bool in_connection; /* @@ -335,6 +336,11 @@ struct wl1271_station { u64 total_freed_pkts; }; +static inline struct wl1271_station *wl1271_station(struct ieee80211_sta *sta) +{ + return (struct wl1271_station *)sta->drv_priv; +} + struct wl12xx_vif { struct wl1271 *wl; struct list_head list; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 1e5aedaf8c7b..e79c06d65bb7 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1501,9 +1501,15 @@ static int btt_blk_init(struct btt *btt) .logical_block_size = btt->sector_size, .max_hw_sectors = UINT_MAX, .max_integrity_segments = 1, + .features = BLK_FEAT_SYNCHRONOUS, }; int rc; + if (btt_meta_size(btt) && IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { + lim.integrity.tuple_size = btt_meta_size(btt); + lim.integrity.tag_size = btt_meta_size(btt); + } + btt->btt_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(btt->btt_disk)) return PTR_ERR(btt->btt_disk); @@ -1513,17 +1519,6 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->fops = &btt_fops; btt->btt_disk->private_data = btt; - blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); - blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); - - if (btt_meta_size(btt) && IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { - struct blk_integrity bi = { - .tuple_size = btt_meta_size(btt), - .tag_size = btt_meta_size(btt), - }; - blk_integrity_register(btt->btt_disk, &bi); - } - set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); rc = device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL); if (rc) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 598fe2e89bda..1dd74c969d5a 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -455,6 +455,8 @@ static int pmem_attach_disk(struct device *dev, .logical_block_size = pmem_sector_size(ndns), .physical_block_size = PAGE_SIZE, .max_hw_sectors = UINT_MAX, + .features = BLK_FEAT_WRITE_CACHE | + BLK_FEAT_SYNCHRONOUS, }; int nid = dev_to_node(dev), fua; struct resource *res = &nsio->res; @@ -463,7 +465,6 @@ static int pmem_attach_disk(struct device *dev, struct dax_device *dax_dev; struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; - struct request_queue *q; struct gendisk *disk; void *addr; int rc; @@ -495,6 +496,10 @@ static int pmem_attach_disk(struct device *dev, dev_warn(dev, "unable to guarantee persistence of writes\n"); fua = 0; } + if (fua) + lim.features |= BLK_FEAT_FUA; + if (is_nd_pfn(dev)) + lim.features |= BLK_FEAT_DAX; if (!devm_request_mem_region(dev, res->start, resource_size(res), dev_name(&ndns->dev))) { @@ -505,7 +510,6 @@ static int pmem_attach_disk(struct device *dev, disk = blk_alloc_disk(&lim, nid); if (IS_ERR(disk)) return PTR_ERR(disk); - q = disk->queue; pmem->disk = disk; pmem->pgmap.owner = pmem; @@ -543,12 +547,6 @@ static int pmem_attach_disk(struct device *dev, } pmem->virt_addr = addr; - blk_queue_write_cache(q, true, fua); - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); - if (pmem->pfn_flags & PFN_MAP) - blk_queue_flag_set(QUEUE_FLAG_DAX, q); - disk->fops = &pmem_fops; disk->private_data = pmem; nvdimm_namespace_disk_name(ndns, disk->disk_name); diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index b309c8be720f..a3caef75aa0a 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config 
NVME_CORE tristate - select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY config BLK_DEV_NVME tristate "NVM Express block device" diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 0cfa39361d3b..b1387dc459a3 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1388,7 +1388,7 @@ static void devm_apple_nvme_mempool_destroy(void *data) mempool_destroy(data); } -static int apple_nvme_probe(struct platform_device *pdev) +static struct apple_nvme *apple_nvme_alloc(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct apple_nvme *anv; @@ -1396,7 +1396,7 @@ static int apple_nvme_probe(struct platform_device *pdev) anv = devm_kzalloc(dev, sizeof(*anv), GFP_KERNEL); if (!anv) - return -ENOMEM; + return ERR_PTR(-ENOMEM); anv->dev = get_device(dev); anv->adminq.is_adminq = true; @@ -1516,10 +1516,30 @@ static int apple_nvme_probe(struct platform_device *pdev) goto put_dev; } + return anv; +put_dev: + put_device(anv->dev); + return ERR_PTR(ret); +} + +static int apple_nvme_probe(struct platform_device *pdev) +{ + struct apple_nvme *anv; + int ret; + + anv = apple_nvme_alloc(pdev); + if (IS_ERR(anv)) + return PTR_ERR(anv); + + ret = nvme_add_ctrl(&anv->ctrl); + if (ret) + goto out_put_ctrl; + anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL); if (IS_ERR(anv->ctrl.admin_q)) { ret = -ENOMEM; - goto put_dev; + anv->ctrl.admin_q = NULL; + goto out_uninit_ctrl; } nvme_reset_ctrl(&anv->ctrl); @@ -1527,8 +1547,10 @@ static int apple_nvme_probe(struct platform_device *pdev) return 0; -put_dev: - put_device(anv->dev); +out_uninit_ctrl: + nvme_uninit_ctrl(&anv->ctrl); +out_put_ctrl: + nvme_put_ctrl(&anv->ctrl); return ret; } diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index 6f2ebb5fcdb0..2b9e6cfaf2a8 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -173,7 +173,7 @@ static const char * const nvme_statuses[] = { const char *nvme_get_error_status_str(u16 status) { - status &= 0x7ff; + status &= NVME_SCT_SC_MASK; if (status < ARRAY_SIZE(nvme_statuses) && nvme_statuses[status]) return nvme_statuses[status]; return "Unknown"; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 782090ce0bc1..19917253ba7b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq; EXPORT_SYMBOL_GPL(nvme_delete_wq); static LIST_HEAD(nvme_subsystems); -static DEFINE_MUTEX(nvme_subsystems_lock); +DEFINE_MUTEX(nvme_subsystems_lock); static DEFINE_IDA(nvme_instance_ida); static dev_t nvme_ctrl_base_chr_devt; @@ -261,7 +261,7 @@ void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) static blk_status_t nvme_error_status(u16 status) { - switch (status & 0x7ff) { + switch (status & NVME_SCT_SC_MASK) { case NVME_SC_SUCCESS: return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: @@ -307,7 +307,7 @@ static void nvme_retry_req(struct request *req) u16 crd; /* The mask and shift result must be <= 3 */ - crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11; + crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11; if (crd) delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100; @@ -329,10 +329,10 @@ static void nvme_log_error(struct request *req) nvme_sect_to_lba(ns->head, blk_rq_pos(req)), blk_rq_bytes(req) >> ns->head->lba_shift, nvme_get_error_status_str(nr->status), - nr->status >> 8 & 7, /* Status Code Type */ - nr->status & 0xff, /* Status Code */ - nr->status & NVME_SC_MORE ? "MORE " : "", - nr->status & NVME_SC_DNR ? 
"DNR " : ""); + NVME_SCT(nr->status), /* Status Code Type */ + nr->status & NVME_SC_MASK, /* Status Code */ + nr->status & NVME_STATUS_MORE ? "MORE " : "", + nr->status & NVME_STATUS_DNR ? "DNR " : ""); return; } @@ -341,10 +341,10 @@ static void nvme_log_error(struct request *req) nvme_get_admin_opcode_str(nr->cmd->common.opcode), nr->cmd->common.opcode, nvme_get_error_status_str(nr->status), - nr->status >> 8 & 7, /* Status Code Type */ - nr->status & 0xff, /* Status Code */ - nr->status & NVME_SC_MORE ? "MORE " : "", - nr->status & NVME_SC_DNR ? "DNR " : ""); + NVME_SCT(nr->status), /* Status Code Type */ + nr->status & NVME_SC_MASK, /* Status Code */ + nr->status & NVME_STATUS_MORE ? "MORE " : "", + nr->status & NVME_STATUS_DNR ? "DNR " : ""); } static void nvme_log_err_passthru(struct request *req) @@ -359,10 +359,10 @@ static void nvme_log_err_passthru(struct request *req) nvme_get_admin_opcode_str(nr->cmd->common.opcode), nr->cmd->common.opcode, nvme_get_error_status_str(nr->status), - nr->status >> 8 & 7, /* Status Code Type */ - nr->status & 0xff, /* Status Code */ - nr->status & NVME_SC_MORE ? "MORE " : "", - nr->status & NVME_SC_DNR ? "DNR " : "", + NVME_SCT(nr->status), /* Status Code Type */ + nr->status & NVME_SC_MASK, /* Status Code */ + nr->status & NVME_STATUS_MORE ? "MORE " : "", + nr->status & NVME_STATUS_DNR ? "DNR " : "", nr->cmd->common.cdw10, nr->cmd->common.cdw11, nr->cmd->common.cdw12, @@ -384,11 +384,11 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req) return COMPLETE; if (blk_noretry_request(req) || - (nvme_req(req)->status & NVME_SC_DNR) || + (nvme_req(req)->status & NVME_STATUS_DNR) || nvme_req(req)->retries >= nvme_max_retries) return COMPLETE; - if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED) + if ((nvme_req(req)->status & NVME_SCT_SC_MASK) == NVME_SC_AUTH_REQUIRED) return AUTHENTICATE; if (req->cmd_flags & REQ_NVME_MPATH) { @@ -927,6 +927,36 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, return BLK_STS_OK; } +/* + * NVMe does not support a dedicated command to issue an atomic write. A write + * which does adhere to the device atomic limits will silently be executed + * non-atomically. The request issuer should ensure that the write is within + * the queue atomic writes limits, but just validate this in case it is not. 
+ */ +static bool nvme_valid_atomic_write(struct request *req) +{ + struct request_queue *q = req->q; + u32 boundary_bytes = queue_atomic_write_boundary_bytes(q); + + if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q)) + return false; + + if (boundary_bytes) { + u64 mask = boundary_bytes - 1, imask = ~mask; + u64 start = blk_rq_pos(req) << SECTOR_SHIFT; + u64 end = start + blk_rq_bytes(req) - 1; + + /* If greater then must be crossing a boundary */ + if (blk_rq_bytes(req) > boundary_bytes) + return false; + + if ((start & imask) != (end & imask)) + return false; + } + + return true; +} + static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd, enum nvme_opcode op) @@ -942,6 +972,9 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req)) + return BLK_STS_INVAL; + cmnd->rw.opcode = op; cmnd->rw.flags = 0; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); @@ -1224,7 +1257,7 @@ EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU); /* * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1: - * + * * The host should send Keep Alive commands at half of the Keep Alive Timeout * accounting for transport roundtrip times [..]. */ @@ -1724,11 +1757,12 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) +static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head, + struct queue_limits *lim) { - struct blk_integrity integrity = { }; + struct blk_integrity *bi = &lim->integrity; - blk_integrity_unregister(disk); + memset(bi, 0, sizeof(*bi)); if (!head->ms) return true; @@ -1745,17 +1779,16 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) case NVME_NS_DPS_PI_TYPE3: switch (head->guard_type) { case NVME_NVM_NS_16B_GUARD: - integrity.profile = &t10_pi_type3_crc; - integrity.tag_size = sizeof(u16) + sizeof(u32); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC; + bi->tag_size = sizeof(u16) + sizeof(u32); + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; case NVME_NVM_NS_64B_GUARD: - integrity.profile = &ext_pi_type3_crc64; - integrity.tag_size = sizeof(u16) + 6; - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC64; + bi->tag_size = sizeof(u16) + 6; + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; default: - integrity.profile = NULL; break; } break; @@ -1763,28 +1796,27 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) case NVME_NS_DPS_PI_TYPE2: switch (head->guard_type) { case NVME_NVM_NS_16B_GUARD: - integrity.profile = &t10_pi_type1_crc; - integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC; + bi->tag_size = sizeof(u16); + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE | + BLK_INTEGRITY_REF_TAG; break; case NVME_NVM_NS_64B_GUARD: - integrity.profile = &ext_pi_type1_crc64; - integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC64; + bi->tag_size = sizeof(u16); + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE | + BLK_INTEGRITY_REF_TAG; break; default: - integrity.profile = NULL; break; } break; default: - integrity.profile = NULL; break; } - integrity.tuple_size = 
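/*
 * Illustrative, standalone model of the boundary test in
 * nvme_valid_atomic_write() above: a request only stays inside one
 * atomic-write boundary window if its first and last byte fall in the
 * same aligned window.  The 16 KiB boundary and the offsets below are
 * made-up example values, not taken from the patch.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool crosses_boundary(uint64_t start, uint64_t len, uint64_t boundary)
{
	uint64_t mask = boundary - 1;		/* boundary is a power of two */
	uint64_t end = start + len - 1;

	if (len > boundary)			/* longer than a window: must cross */
		return true;
	return (start & ~mask) != (end & ~mask);
}

int main(void)
{
	/* 4 KiB write starting 1 KiB below a 16 KiB boundary: crosses */
	printf("%d\n", crosses_boundary(15 * 1024, 4096, 16384));
	/* the same write aligned to the window start: does not cross */
	printf("%d\n", crosses_boundary(16384, 4096, 16384));
	return 0;
}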
head->ms; - integrity.pi_offset = head->pi_offset; - blk_integrity_register(disk, &integrity); + bi->tuple_size = head->ms; + bi->pi_offset = head->pi_offset; return true; } @@ -1922,6 +1954,23 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl, } } + +static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns, + struct nvme_id_ns *id, struct queue_limits *lim, + u32 bs, u32 atomic_bs) +{ + unsigned int boundary = 0; + + if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) { + if (le16_to_cpu(id->nabspf)) + boundary = (le16_to_cpu(id->nabspf) + 1) * bs; + } + lim->atomic_write_hw_max = atomic_bs; + lim->atomic_write_hw_boundary = boundary; + lim->atomic_write_hw_unit_min = bs; + lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs); +} + static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) { return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1; @@ -1968,13 +2017,16 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; + + nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs); } if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { /* NPWG = Namespace Preferred Write Granularity */ phys_bs = bs * (1 + le16_to_cpu(id->npwg)); /* NOWS = Namespace Optimal Write Size */ - io_opt = bs * (1 + le16_to_cpu(id->nows)); + if (id->nows) + io_opt = bs * (1 + le16_to_cpu(id->nows)); } /* @@ -2058,7 +2110,6 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns, static int nvme_update_ns_info_block(struct nvme_ns *ns, struct nvme_ns_info *info) { - bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT; struct queue_limits lim; struct nvme_id_ns_nvm *nvm = NULL; struct nvme_zone_info zi = {}; @@ -2107,11 +2158,11 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) nvme_update_zone_info(ns, &lim, &zi); - ret = queue_limits_commit_update(ns->disk->queue, &lim); - if (ret) { - blk_mq_unfreeze_queue(ns->disk->queue); - goto out; - } + + if (ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) + lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; + else + lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); /* * Register a metadata profile for PI, or the plain non-integrity NVMe @@ -2119,9 +2170,15 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, * I/O to namespaces with metadata except when the namespace supports * PI, as it can strip/insert in that case. 
*/ - if (!nvme_init_integrity(ns->disk, ns->head)) + if (!nvme_init_integrity(ns->disk, ns->head, &lim)) capacity = 0; + ret = queue_limits_commit_update(ns->disk->queue, &lim); + if (ret) { + blk_mq_unfreeze_queue(ns->disk->queue); + goto out; + } + set_capacity_and_notify(ns->disk, capacity); /* @@ -2133,7 +2190,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) ns->head->features |= NVME_NS_DEAC; set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); - blk_queue_write_cache(ns->disk->queue, vwc, vwc); set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue); @@ -2193,14 +2249,6 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) struct queue_limits lim; blk_mq_freeze_queue(ns->head->disk->queue); - if (unsupported) - ns->head->disk->flags |= GENHD_FL_HIDDEN; - else - nvme_init_integrity(ns->head->disk, ns->head); - set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); - set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); - nvme_mpath_revalidate_paths(ns); - /* * queue_limits mixes values that are the hardware limitations * for bio splitting with what is the device configuration. @@ -2223,13 +2271,48 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) lim.io_opt = ns_lim->io_opt; queue_limits_stack_bdev(&lim, ns->disk->part0, 0, ns->head->disk->disk_name); + if (unsupported) + ns->head->disk->flags |= GENHD_FL_HIDDEN; + else + nvme_init_integrity(ns->head->disk, ns->head, &lim); ret = queue_limits_commit_update(ns->head->disk->queue, &lim); + + set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); + set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); + nvme_mpath_revalidate_paths(ns); + blk_mq_unfreeze_queue(ns->head->disk->queue); } return ret; } +int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16], + enum blk_unique_id type) +{ + struct nvme_ns_ids *ids = &ns->head->ids; + + if (type != BLK_UID_EUI64) + return -EINVAL; + + if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) { + memcpy(id, &ids->nguid, sizeof(ids->nguid)); + return sizeof(ids->nguid); + } + if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) { + memcpy(id, &ids->eui64, sizeof(ids->eui64)); + return sizeof(ids->eui64); + } + + return -EINVAL; +} + +static int nvme_get_unique_id(struct gendisk *disk, u8 id[16], + enum blk_unique_id type) +{ + return nvme_ns_get_unique_id(disk->private_data, id, type); +} + #ifdef CONFIG_BLK_SED_OPAL static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) @@ -2285,6 +2368,7 @@ const struct block_device_operations nvme_bdev_ops = { .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, + .get_unique_id = nvme_get_unique_id, .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; @@ -3721,6 +3805,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) { + struct queue_limits lim = { }; struct nvme_ns *ns; struct gendisk *disk; int node = ctrl->numa_node; @@ -3729,7 +3814,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (!ns) return; - disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns); + if (ctrl->opts && ctrl->opts->data_digest) + lim.features |= BLK_FEAT_STABLE_WRITES; + if (ctrl->ops->supports_pci_p2pdma && + ctrl->ops->supports_pci_p2pdma(ctrl)) + lim.features |= BLK_FEAT_PCI_P2PDMA; + + disk = blk_mq_alloc_disk(ctrl->tagset, &lim, 
ns); if (IS_ERR(disk)) goto out_free_ns; disk->fops = &nvme_bdev_ops; @@ -3737,15 +3828,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) ns->disk = disk; ns->queue = disk->queue; - - if (ctrl->opts && ctrl->opts->data_digest) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); - - blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); - if (ctrl->ops->supports_pci_p2pdma && - ctrl->ops->supports_pci_p2pdma(ctrl)) - blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); - ns->ctrl = ctrl; kref_init(&ns->kref); @@ -3887,7 +3969,7 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid) static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info) { - int ret = NVME_SC_INVALID_NS | NVME_SC_DNR; + int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR; if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) { dev_err(ns->ctrl->device, @@ -3903,7 +3985,7 @@ out: * * TODO: we should probably schedule a delayed retry here. */ - if (ret > 0 && (ret & NVME_SC_DNR)) + if (ret > 0 && (ret & NVME_STATUS_DNR)) nvme_ns_remove(ns); } @@ -4095,7 +4177,7 @@ static void nvme_scan_work(struct work_struct *work) * they report) but don't actually support it. */ ret = nvme_scan_ns_list(ctrl); - if (ret > 0 && ret & NVME_SC_DNR) + if (ret > 0 && ret & NVME_STATUS_DNR) nvme_scan_ns_sequential(ctrl); } mutex_unlock(&ctrl->scan_lock); @@ -4489,13 +4571,15 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, return ret; if (ctrl->ops->flags & NVME_F_FABRICS) { - ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL); + struct queue_limits lim = { + .features = BLK_FEAT_SKIP_TAGSET_QUIESCE, + }; + + ctrl->connect_q = blk_mq_alloc_queue(set, &lim, NULL); if (IS_ERR(ctrl->connect_q)) { ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; } - blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, - ctrl->connect_q); } ctrl->tagset = set; @@ -4613,6 +4697,9 @@ static void nvme_free_ctrl(struct device *dev) * Initialize a NVMe controller structures. This needs to be called during * earliest initialization so that we have the initialized structured around * during probing. + * + * On success, the caller must use the nvme_put_ctrl() to release this when + * needed, which also invokes the ops->free_ctrl() callback. */ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks) @@ -4661,6 +4748,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, goto out; ctrl->instance = ret; + ret = nvme_auth_init_ctrl(ctrl); + if (ret) + goto out_release_instance; + + nvme_mpath_init_ctrl(ctrl); + device_initialize(&ctrl->ctrl_device); ctrl->device = &ctrl->ctrl_device; ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt), @@ -4673,16 +4766,36 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->device->groups = nvme_dev_attr_groups; ctrl->device->release = nvme_free_ctrl; dev_set_drvdata(ctrl->device, ctrl); + + return ret; + +out_release_instance: + ida_free(&nvme_instance_ida, ctrl->instance); +out: + if (ctrl->discard_page) + __free_page(ctrl->discard_page); + cleanup_srcu_struct(&ctrl->srcu); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_init_ctrl); + +/* + * On success, returns with an elevated controller reference and caller must + * use nvme_uninit_ctrl() to properly free resources associated with the ctrl. 
+ */ +int nvme_add_ctrl(struct nvme_ctrl *ctrl) +{ + int ret; + ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); if (ret) - goto out_release_instance; + return ret; - nvme_get_ctrl(ctrl); cdev_init(&ctrl->cdev, &nvme_dev_fops); - ctrl->cdev.owner = ops->module; + ctrl->cdev.owner = ctrl->ops->module; ret = cdev_device_add(&ctrl->cdev, ctrl->device); if (ret) - goto out_free_name; + return ret; /* * Initialize latency tolerance controls. The sysfs files won't @@ -4693,28 +4806,11 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, min(default_ps_max_latency_us, (unsigned long)S32_MAX)); nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); - nvme_mpath_init_ctrl(ctrl); - ret = nvme_auth_init_ctrl(ctrl); - if (ret) - goto out_free_cdev; + nvme_get_ctrl(ctrl); return 0; -out_free_cdev: - nvme_fault_inject_fini(&ctrl->fault_inject); - dev_pm_qos_hide_latency_tolerance(ctrl->device); - cdev_device_del(&ctrl->cdev, ctrl->device); -out_free_name: - nvme_put_ctrl(ctrl); - kfree_const(ctrl->device->kobj.name); -out_release_instance: - ida_free(&nvme_instance_ida, ctrl->instance); -out: - if (ctrl->discard_page) - __free_page(ctrl->discard_page); - cleanup_srcu_struct(&ctrl->srcu); - return ret; } -EXPORT_SYMBOL_GPL(nvme_init_ctrl); +EXPORT_SYMBOL_GPL(nvme_add_ctrl); /* let I/O to all namespaces fail in preparation for surprise removal */ void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index ceb9c0ed3120..44e342a46f39 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -187,7 +187,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) if (unlikely(ret != 0)) dev_err(ctrl->device, "Property Get error: %d, offset %#x\n", - ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + ret > 0 ? ret & ~NVME_STATUS_DNR : ret, off); return ret; } @@ -233,7 +233,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) if (unlikely(ret != 0)) dev_err(ctrl->device, "Property Get error: %d, offset %#x\n", - ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + ret > 0 ? ret & ~NVME_STATUS_DNR : ret, off); return ret; } EXPORT_SYMBOL_GPL(nvmf_reg_read64); @@ -275,11 +275,26 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) if (unlikely(ret)) dev_err(ctrl->device, "Property Set error: %d, offset %#x\n", - ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + ret > 0 ? ret & ~NVME_STATUS_DNR : ret, off); return ret; } EXPORT_SYMBOL_GPL(nvmf_reg_write32); +int nvmf_subsystem_reset(struct nvme_ctrl *ctrl) +{ + int ret; + + if (!nvme_wait_reset(ctrl)) + return -EBUSY; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, NVME_SUBSYS_RESET); + if (ret) + return ret; + + return nvme_try_sched_reset(ctrl); +} +EXPORT_SYMBOL_GPL(nvmf_subsystem_reset); + /** * nvmf_log_connect_error() - Error-parsing-diagnostic print out function for * connect() errors. 
@@ -295,7 +310,7 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, int errval, int offset, struct nvme_command *cmd, struct nvmf_connect_data *data) { - int err_sctype = errval & ~NVME_SC_DNR; + int err_sctype = errval & ~NVME_STATUS_DNR; if (errval < 0) { dev_err(ctrl->device, @@ -573,7 +588,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); */ bool nvmf_should_reconnect(struct nvme_ctrl *ctrl, int status) { - if (status > 0 && (status & NVME_SC_DNR)) + if (status > 0 && (status & NVME_STATUS_DNR)) return false; if (status == -EKEYREJECTED) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 602135910ae9..21d75dc4a3a0 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -217,6 +217,7 @@ static inline unsigned int nvmf_nr_io_queues(struct nvmf_ctrl_options *opts) int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); +int nvmf_subsystem_reset(struct nvme_ctrl *ctrl); int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); int nvmf_register_transport(struct nvmf_transport_ops *ops); diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c index 1ba10a5c656d..1d1b6441a339 100644 --- a/drivers/nvme/host/fault_inject.c +++ b/drivers/nvme/host/fault_inject.c @@ -75,7 +75,7 @@ void nvme_should_fail(struct request *req) /* inject status code and DNR bit */ status = fault_inject->status; if (fault_inject->dont_retry) - status |= NVME_SC_DNR; + status |= NVME_STATUS_DNR; nvme_req(req)->status = status; } } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f0b081332749..b81af7919e94 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3132,7 +3132,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ctrl->ctrl.icdoff) { dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n", ctrl->ctrl.icdoff); - ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + ret = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out_stop_keep_alive; } @@ -3140,7 +3140,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (!nvme_ctrl_sgl_supported(&ctrl->ctrl)) { dev_err(ctrl->ctrl.device, "Mandatory sgls are not supported!\n"); - ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + ret = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out_stop_keep_alive; } @@ -3325,7 +3325,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay); } else { if (portptr->port_state == FC_OBJSTATE_ONLINE) { - if (status > 0 && (status & NVME_SC_DNR)) + if (status > 0 && (status & NVME_STATUS_DNR)) dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: reconnect failure\n", ctrl->cnum); @@ -3382,6 +3382,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, + .subsystem_reset = nvmf_subsystem_reset, .free_ctrl = nvme_fc_free_ctrl, .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_delete_ctrl, @@ -3444,12 +3445,11 @@ nvme_fc_existing_controller(struct nvme_fc_rport *rport, return found; } -static struct nvme_ctrl * -nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, +static struct nvme_fc_ctrl * +nvme_fc_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, struct nvme_fc_lport *lport, struct nvme_fc_rport *rport) { struct 
nvme_fc_ctrl *ctrl; - unsigned long flags; int ret, idx, ctrl_loss_tmo; if (!(rport->remoteport.port_role & @@ -3538,7 +3538,35 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, if (lport->dev) ctrl->ctrl.numa_node = dev_to_node(lport->dev); - /* at this point, teardown path changes to ref counting on nvme ctrl */ + return ctrl; + +out_free_queues: + kfree(ctrl->queues); +out_free_ida: + put_device(ctrl->dev); + ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum); +out_free_ctrl: + kfree(ctrl); +out_fail: + /* exit via here doesn't follow ctlr ref points */ + return ERR_PTR(ret); +} + +static struct nvme_ctrl * +nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, + struct nvme_fc_lport *lport, struct nvme_fc_rport *rport) +{ + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + int ret; + + ctrl = nvme_fc_alloc_ctrl(dev, opts, lport, rport); + if (IS_ERR(ctrl)) + return ERR_CAST(ctrl); + + ret = nvme_add_ctrl(&ctrl->ctrl); + if (ret) + goto out_put_ctrl; ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, &nvme_fc_admin_mq_ops, @@ -3584,6 +3612,7 @@ fail_ctrl: /* initiate nvme ctrl ref counting teardown */ nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: /* Remove core ctrl ref. */ nvme_put_ctrl(&ctrl->ctrl); @@ -3597,20 +3626,8 @@ fail_ctrl: nvme_fc_rport_get(rport); return ERR_PTR(-EIO); - -out_free_queues: - kfree(ctrl->queues); -out_free_ida: - put_device(ctrl->dev); - ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum); -out_free_ctrl: - kfree(ctrl); -out_fail: - /* exit via here doesn't follow ctlr ref points */ - return ERR_PTR(ret); } - struct nvmet_fc_traddr { u64 nn; u64 pn; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d8b6b4648eaf..91d9eb3c22ef 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath, static const char *nvme_iopolicy_names[] = { [NVME_IOPOLICY_NUMA] = "numa", [NVME_IOPOLICY_RR] = "round-robin", + [NVME_IOPOLICY_QD] = "queue-depth", }; static int iopolicy = NVME_IOPOLICY_NUMA; @@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp) iopolicy = NVME_IOPOLICY_NUMA; else if (!strncmp(val, "round-robin", 11)) iopolicy = NVME_IOPOLICY_RR; + else if (!strncmp(val, "queue-depth", 11)) + iopolicy = NVME_IOPOLICY_QD; else return -EINVAL; @@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp) module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy, &iopolicy, 0644); MODULE_PARM_DESC(iopolicy, - "Default multipath I/O policy; 'numa' (default) or 'round-robin'"); + "Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'"); void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys) { @@ -83,7 +86,7 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; - u16 status = nvme_req(req)->status & 0x7ff; + u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK; unsigned long flags; struct bio *bio; @@ -128,6 +131,11 @@ void nvme_mpath_start_request(struct request *rq) struct nvme_ns *ns = rq->q->queuedata; struct gendisk *disk = ns->head->disk; + if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) { + atomic_inc(&ns->ctrl->nr_active); + nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE; + } + if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq)) return; @@ -141,6 +149,9 @@ void nvme_mpath_end_request(struct request *rq) { struct nvme_ns 
*ns = rq->q->queuedata; + if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE) + atomic_dec_if_positive(&ns->ctrl->nr_active); + if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS)) return; bdev_end_io_acct(ns->head->disk->part0, req_op(rq), @@ -291,10 +302,15 @@ static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings); } -static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, - int node, struct nvme_ns *old) +static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head) { struct nvme_ns *ns, *found = NULL; + int node = numa_node_id(); + struct nvme_ns *old = srcu_dereference(head->current_path[node], + &head->srcu); + + if (unlikely(!old)) + return __nvme_find_path(head, node); if (list_is_singular(&head->list)) { if (nvme_path_is_disabled(old)) @@ -334,13 +350,49 @@ out: return found; } +static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head) +{ + struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns; + unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX; + unsigned int depth; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + if (nvme_path_is_disabled(ns)) + continue; + + depth = atomic_read(&ns->ctrl->nr_active); + + switch (ns->ana_state) { + case NVME_ANA_OPTIMIZED: + if (depth < min_depth_opt) { + min_depth_opt = depth; + best_opt = ns; + } + break; + case NVME_ANA_NONOPTIMIZED: + if (depth < min_depth_nonopt) { + min_depth_nonopt = depth; + best_nonopt = ns; + } + break; + default: + break; + } + + if (min_depth_opt == 0) + return best_opt; + } + + return best_opt ? best_opt : best_nonopt; +} + static inline bool nvme_path_is_optimized(struct nvme_ns *ns) { return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE && ns->ana_state == NVME_ANA_OPTIMIZED; } -inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) +static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head) { int node = numa_node_id(); struct nvme_ns *ns; @@ -348,14 +400,23 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) ns = srcu_dereference(head->current_path[node], &head->srcu); if (unlikely(!ns)) return __nvme_find_path(head, node); - - if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) - return nvme_round_robin_path(head, node, ns); if (unlikely(!nvme_path_is_optimized(ns))) return __nvme_find_path(head, node); return ns; } +inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) +{ + switch (READ_ONCE(head->subsys->iopolicy)) { + case NVME_IOPOLICY_QD: + return nvme_queue_depth_path(head); + case NVME_IOPOLICY_RR: + return nvme_round_robin_path(head); + default: + return nvme_numa_path(head); + } +} + static bool nvme_available_path(struct nvme_ns_head *head) { struct nvme_ns *ns; @@ -427,6 +488,21 @@ static void nvme_ns_head_release(struct gendisk *disk) nvme_put_ns_head(disk->private_data); } +static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16], + enum blk_unique_id type) +{ + struct nvme_ns_head *head = disk->private_data; + struct nvme_ns *ns; + int srcu_idx, ret = -EWOULDBLOCK; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (ns) + ret = nvme_ns_get_unique_id(ns, id, type); + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + #ifdef CONFIG_BLK_DEV_ZONED static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) @@ -454,6 +530,7 @@ const struct block_device_operations nvme_ns_head_ops = { .ioctl = 
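/*
 * Minimal model of the queue-depth I/O policy added above
 * (nvme_queue_depth_path): pick the path whose controller has the fewest
 * in-flight requests, preferring ANA-optimized paths and only falling
 * back to non-optimized ones.  The struct, names and sample numbers are
 * illustrative, not kernel code.
 */
#include <limits.h>
#include <stdio.h>

enum ana_state { ANA_OPTIMIZED, ANA_NONOPTIMIZED, ANA_INACCESSIBLE };

struct path {
	const char *name;
	enum ana_state state;
	unsigned int nr_active;		/* in-flight requests on this path */
};

static const struct path *pick_path(const struct path *p, int n)
{
	const struct path *best_opt = NULL, *best_nonopt = NULL;
	unsigned int min_opt = UINT_MAX, min_nonopt = UINT_MAX;

	for (int i = 0; i < n; i++) {
		if (p[i].state == ANA_OPTIMIZED && p[i].nr_active < min_opt) {
			min_opt = p[i].nr_active;
			best_opt = &p[i];
		} else if (p[i].state == ANA_NONOPTIMIZED &&
			   p[i].nr_active < min_nonopt) {
			min_nonopt = p[i].nr_active;
			best_nonopt = &p[i];
		}
	}
	return best_opt ? best_opt : best_nonopt;
}

int main(void)
{
	const struct path paths[] = {
		{ "nvme0c0n1", ANA_OPTIMIZED,    12 },
		{ "nvme0c1n1", ANA_OPTIMIZED,     3 },
		{ "nvme0c2n1", ANA_NONOPTIMIZED,  0 },
	};

	/* prints the least-busy optimized path, "nvme0c1n1" */
	printf("%s\n", pick_path(paths, 3)->name);
	return 0;
}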
nvme_ns_head_ioctl, .compat_ioctl = blkdev_compat_ptr_ioctl, .getgeo = nvme_getgeo, + .get_unique_id = nvme_ns_head_get_unique_id, .report_zones = nvme_ns_head_report_zones, .pr_ops = &nvme_pr_ops, }; @@ -521,7 +598,6 @@ static void nvme_requeue_work(struct work_struct *work) int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) { struct queue_limits lim; - bool vwc = false; mutex_init(&head->lock); bio_list_init(&head->requeue_list); @@ -539,6 +615,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) blk_set_stacking_limits(&lim); lim.dma_alignment = 3; + lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL; if (head->ids.csi != NVME_CSI_ZNS) lim.max_zone_append_sectors = 0; @@ -549,24 +626,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) head->disk->private_data = head; sprintf(head->disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, head->instance); - - blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue); - /* - * This assumes all controllers that refer to a namespace either - * support poll queues or not. That is not a strict guarantee, - * but if the assumption is wrong the effect is only suboptimal - * performance but not correctness problem. - */ - if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL && - ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue); - - /* we need to propagate up the VMC settings */ - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - vwc = true; - blk_queue_write_cache(head->disk->queue, vwc, vwc); return 0; } @@ -803,6 +862,29 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev, nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); } +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, + int iopolicy) +{ + struct nvme_ctrl *ctrl; + int old_iopolicy = READ_ONCE(subsys->iopolicy); + + if (old_iopolicy == iopolicy) + return; + + WRITE_ONCE(subsys->iopolicy, iopolicy); + + /* iopolicy changes clear the mpath by design */ + mutex_lock(&nvme_subsystems_lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvme_mpath_clear_ctrl_paths(ctrl); + mutex_unlock(&nvme_subsystems_lock); + + pr_notice("subsysnqn %s iopolicy changed from %s to %s\n", + subsys->subnqn, + nvme_iopolicy_names[old_iopolicy], + nvme_iopolicy_names[iopolicy]); +} + static ssize_t nvme_subsys_iopolicy_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -812,7 +894,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev, for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { if (sysfs_streq(buf, nvme_iopolicy_names[i])) { - WRITE_ONCE(subsys->iopolicy, i); + nvme_subsys_iopolicy_update(subsys, i); return count; } } @@ -875,9 +957,6 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) nvme_mpath_set_live(ns); } - if (blk_queue_stable_writes(ns->queue) && ns->head->disk) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, - ns->head->disk->queue); #ifdef CONFIG_BLK_DEV_ZONED if (blk_queue_is_zoned(ns->queue) && ns->head->disk) ns->head->disk->nr_zones = ns->disk->nr_zones; @@ -923,6 +1002,9 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) return 0; + /* initialize this in the identify path to cover controller resets */ + atomic_set(&ctrl->nr_active, 0); + if 
(!ctrl->max_namespaces || ctrl->max_namespaces > le32_to_cpu(id->nn)) { dev_err(ctrl->device, diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 68b400f9c42d..f900e44243ae 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -49,6 +49,7 @@ extern unsigned int admin_timeout; extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; +extern struct mutex nvme_subsystems_lock; /* * List of workarounds for devices that required behavior not specified in @@ -195,6 +196,7 @@ enum { NVME_REQ_CANCELLED = (1 << 0), NVME_REQ_USERCMD = (1 << 1), NVME_MPATH_IO_STATS = (1 << 2), + NVME_MPATH_CNT_ACTIVE = (1 << 3), }; static inline struct nvme_request *nvme_req(struct request *req) @@ -360,6 +362,7 @@ struct nvme_ctrl { size_t ana_log_size; struct timer_list anatt_timer; struct work_struct ana_work; + atomic_t nr_active; #endif #ifdef CONFIG_NVME_HOST_AUTH @@ -408,6 +411,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl) enum nvme_iopolicy { NVME_IOPOLICY_NUMA, NVME_IOPOLICY_RR, + NVME_IOPOLICY_QD, }; struct nvme_subsystem { @@ -551,6 +555,7 @@ struct nvme_ctrl_ops { int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl); + int (*subsystem_reset)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl); void (*stop_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); @@ -649,18 +654,9 @@ int nvme_try_sched_reset(struct nvme_ctrl *ctrl); static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) { - int ret; - - if (!ctrl->subsystem) + if (!ctrl->subsystem || !ctrl->ops->subsystem_reset) return -ENOTTY; - if (!nvme_wait_reset(ctrl)) - return -EBUSY; - - ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); - if (ret) - return ret; - - return nvme_try_sched_reset(ctrl); + return ctrl->ops->subsystem_reset(ctrl); } /* @@ -689,7 +685,7 @@ static inline u32 nvme_bytes_to_numd(size_t len) static inline bool nvme_is_ana_error(u16 status) { - switch (status & 0x7ff) { + switch (status & NVME_SCT_SC_MASK) { case NVME_SC_ANA_TRANSITION: case NVME_SC_ANA_INACCESSIBLE: case NVME_SC_ANA_PERSISTENT_LOSS: @@ -702,7 +698,7 @@ static inline bool nvme_is_ana_error(u16 status) static inline bool nvme_is_path_error(u16 status) { /* check for a status code type of 'path related status' */ - return (status & 0x700) == 0x300; + return (status & NVME_SCT_MASK) == NVME_SCT_PATH; } /* @@ -792,6 +788,7 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown); int nvme_enable_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks); +int nvme_add_ctrl(struct nvme_ctrl *ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); @@ -877,7 +874,7 @@ enum { NVME_SUBMIT_NOWAIT = (__force nvme_submit_flags_t)(1 << 1), /* Set BLK_MQ_REQ_RESERVED when allocating request */ NVME_SUBMIT_RESERVED = (__force nvme_submit_flags_t)(1 << 2), - /* Retry command when NVME_SC_DNR is not set in the result */ + /* Retry command when NVME_STATUS_DNR is not set in the result */ NVME_SUBMIT_RETRY = (__force nvme_submit_flags_t)(1 << 3), }; @@ -1062,6 +1059,9 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk) } #endif /* CONFIG_NVME_MULTIPATH */ +int 
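/*
 * Aside: the literal 0x4E564D65 that nvme_reset_subsystem() used to write
 * to NVME_REG_NSSR (and that the new per-transport ->subsystem_reset
 * callbacks now write as NVME_SUBSYS_RESET) is simply the ASCII string
 * "NVMe".  Standalone check:
 */
#include <stdio.h>

int main(void)
{
	unsigned int nssr = 0x4E564D65;

	printf("%c%c%c%c\n", nssr >> 24, (nssr >> 16) & 0xff,
	       (nssr >> 8) & 0xff, nssr & 0xff);	/* prints "NVMe" */
	return 0;
}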
nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16], + enum blk_unique_id type); + struct nvme_zone_info { u64 zone_size; unsigned int max_open_zones; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 102a9fb0c65f..21f8e1b9801f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -826,9 +826,9 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct bio_vec bv = rq_integrity_vec(req); - iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req), - rq_dma_dir(req), 0); + iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0); if (dma_mapping_error(dev->dev, iod->meta_dma)) return BLK_STS_IOERR; cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); @@ -967,7 +967,7 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req) struct nvme_iod *iod = blk_mq_rq_to_pdu(req); dma_unmap_page(dev->dev, iod->meta_dma, - rq_integrity_vec(req)->bv_len, rq_dma_dir(req)); + rq_integrity_vec(req).bv_len, rq_dma_dir(req)); } if (blk_rq_nr_phys_segments(req)) @@ -1143,6 +1143,41 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) spin_unlock(&nvmeq->sq_lock); } +static int nvme_pci_subsystem_reset(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + int ret = 0; + + /* + * Taking the shutdown_lock ensures the BAR mapping is not being + * altered by reset_work. Holding this lock before the RESETTING state + * change, if successful, also ensures nvme_remove won't be able to + * proceed to iounmap until we're done. + */ + mutex_lock(&dev->shutdown_lock); + if (!dev->bar_mapped_size) { + ret = -ENODEV; + goto unlock; + } + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) { + ret = -EBUSY; + goto unlock; + } + + writel(NVME_SUBSYS_RESET, dev->bar + NVME_REG_NSSR); + nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE); + + /* + * Read controller status to flush the previous write and trigger a + * pcie read error. 
+ */ + readl(dev->bar + NVME_REG_CSTS); +unlock: + mutex_unlock(&dev->shutdown_lock); + return ret; +} + static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) { struct nvme_command c = { }; @@ -2859,6 +2894,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read64 = nvme_pci_reg_read64, .free_ctrl = nvme_pci_free_ctrl, .submit_async_event = nvme_pci_submit_async_event, + .subsystem_reset = nvme_pci_subsystem_reset, .get_address = nvme_pci_get_address, .print_device_info = nvme_pci_print_device_info, .supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma, @@ -3015,6 +3051,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (IS_ERR(dev)) return PTR_ERR(dev); + result = nvme_add_ctrl(&dev->ctrl); + if (result) + goto out_put_ctrl; + result = nvme_dev_map(dev); if (result) goto out_uninit_ctrl; @@ -3101,6 +3141,7 @@ out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: nvme_uninit_ctrl(&dev->ctrl); +out_put_ctrl: nvme_put_ctrl(&dev->ctrl); return result; } diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c index 8fa1ffcdaed4..7347ddf85f00 100644 --- a/drivers/nvme/host/pr.c +++ b/drivers/nvme/host/pr.c @@ -72,12 +72,12 @@ static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c, return nvme_submit_sync_cmd(ns->queue, c, data, data_len); } -static int nvme_sc_to_pr_err(int nvme_sc) +static int nvme_status_to_pr_err(int status) { - if (nvme_is_path_error(nvme_sc)) + if (nvme_is_path_error(status)) return PR_STS_PATH_FAILED; - switch (nvme_sc & 0x7ff) { + switch (status & NVME_SCT_SC_MASK) { case NVME_SC_SUCCESS: return PR_STS_SUCCESS; case NVME_SC_RESERVATION_CONFLICT: @@ -121,7 +121,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10, if (ret < 0) return ret; - return nvme_sc_to_pr_err(ret); + return nvme_status_to_pr_err(ret); } static int nvme_pr_register(struct block_device *bdev, u64 old, @@ -196,7 +196,7 @@ retry: if (ret < 0) return ret; - return nvme_sc_to_pr_err(ret); + return nvme_status_to_pr_err(ret); } static int nvme_pr_read_keys(struct block_device *bdev, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 51a62b0c645a..2eb33842f971 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2201,6 +2201,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, + .subsystem_reset = nvmf_subsystem_reset, .free_ctrl = nvme_rdma_free_ctrl, .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = nvme_rdma_delete_ctrl, @@ -2237,12 +2238,11 @@ nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts) return found; } -static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, +static struct nvme_rdma_ctrl *nvme_rdma_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { struct nvme_rdma_ctrl *ctrl; int ret; - bool changed; ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); if (!ctrl) @@ -2304,6 +2304,30 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, if (ret) goto out_kfree_queues; + return ctrl; + +out_kfree_queues: + kfree(ctrl->queues); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + bool changed; + int ret; + + ctrl = nvme_rdma_alloc_ctrl(dev, opts); + if (IS_ERR(ctrl)) + return ERR_CAST(ctrl); + + ret = nvme_add_ctrl(&ctrl->ctrl); + if (ret) + goto 
out_put_ctrl; + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); WARN_ON_ONCE(!changed); @@ -2322,15 +2346,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); -out_kfree_queues: - kfree(ctrl->queues); -out_free_ctrl: - kfree(ctrl); - return ERR_PTR(ret); } static struct nvmf_transport_ops nvme_rdma_transport = { diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8b5e4327fe83..a2a47d3ab99f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2662,6 +2662,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, + .subsystem_reset = nvmf_subsystem_reset, .free_ctrl = nvme_tcp_free_ctrl, .submit_async_event = nvme_tcp_submit_async_event, .delete_ctrl = nvme_tcp_delete_ctrl, @@ -2686,7 +2687,7 @@ nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) return found; } -static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, +static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { struct nvme_tcp_ctrl *ctrl; @@ -2761,6 +2762,28 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, if (ret) goto out_kfree_queues; + return ctrl; +out_kfree_queues: + kfree(ctrl->queues); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ctrl *ctrl; + int ret; + + ctrl = nvme_tcp_alloc_ctrl(dev, opts); + if (IS_ERR(ctrl)) + return ERR_CAST(ctrl); + + ret = nvme_add_ctrl(&ctrl->ctrl); + if (ret) + goto out_put_ctrl; + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { WARN_ON_ONCE(1); ret = -EINTR; @@ -2782,15 +2805,11 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); -out_kfree_queues: - kfree(ctrl->queues); -out_free_ctrl: - kfree(ctrl); - return ERR_PTR(ret); } static struct nvmf_transport_ops nvme_tcp_transport = { diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 77aa0f440a6d..9a06f9d98cd6 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -108,13 +108,12 @@ free_data: void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, struct nvme_zone_info *zi) { - lim->zoned = 1; + lim->features |= BLK_FEAT_ZONED; lim->max_open_zones = zi->max_open_zones; lim->max_active_zones = zi->max_active_zones; lim->max_zone_append_sectors = ns->ctrl->max_zone_append; lim->chunk_sectors = ns->head->zsze = nvme_lba_to_sect(ns->head, zi->zone_size); - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue); } static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 872dd1a0acd8..46be031f91b4 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -6,7 +6,6 @@ config NVME_TARGET depends on CONFIGFS_FS select NVME_KEYRING if NVME_TARGET_TCP_TLS select KEYS if NVME_TARGET_TCP_TLS - select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY select SGL_ALLOC help This enabled target side support for the NVMe protocol, that is @@ -18,6 +17,15 @@ config NVME_TARGET To configure the NVMe target you probably want to 
use the nvmetcli tool from http://git.infradead.org/users/hch/nvmetcli.git. +config NVME_TARGET_DEBUGFS + bool "NVMe Target debugfs support" + depends on NVME_TARGET + help + This enables debugfs support to display the connected controllers + to each subsystem + + If unsure, say N. + config NVME_TARGET_PASSTHRU bool "NVMe Target Passthrough support" depends on NVME_TARGET diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index c66820102493..c402c44350b2 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ discovery.o io-cmd-file.o io-cmd-bdev.o +nvmet-$(CONFIG_NVME_TARGET_DEBUGFS) += debugfs.o nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o nvmet-$(CONFIG_BLK_DEV_ZONED) += zns.o nvmet-$(CONFIG_NVME_TARGET_AUTH) += fabrics-cmd-auth.o auth.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index f5b7054a4a05..f7e1156ac7ec 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -344,7 +344,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) pr_debug("unhandled lid %d on qid %d\n", req->cmd->get_log_page.lid, req->sq->qid); req->error_loc = offsetof(struct nvme_get_log_page_command, lid); - nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); + nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_STATUS_DNR); } static void nvmet_execute_identify_ctrl(struct nvmet_req *req) @@ -496,7 +496,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { req->error_loc = offsetof(struct nvme_identify, nsid); - status = NVME_SC_INVALID_NS | NVME_SC_DNR; + status = NVME_SC_INVALID_NS | NVME_STATUS_DNR; goto out; } @@ -662,7 +662,7 @@ static void nvmet_execute_identify_desclist(struct nvmet_req *req) if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off, off) != NVME_IDENTIFY_DATA_SIZE - off) - status = NVME_SC_INTERNAL | NVME_SC_DNR; + status = NVME_SC_INTERNAL | NVME_STATUS_DNR; out: nvmet_req_complete(req, status); @@ -724,7 +724,7 @@ static void nvmet_execute_identify(struct nvmet_req *req) pr_debug("unhandled identify cns %d on qid %d\n", req->cmd->identify.cns, req->sq->qid); req->error_loc = offsetof(struct nvme_identify, cns); - nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); + nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_STATUS_DNR); } /* @@ -807,7 +807,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) if (val32 & ~mask) { req->error_loc = offsetof(struct nvme_common_command, cdw11); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } WRITE_ONCE(req->sq->ctrl->aen_enabled, val32); @@ -833,7 +833,7 @@ void nvmet_execute_set_features(struct nvmet_req *req) ncqr = (cdw11 >> 16) & 0xffff; nsqr = cdw11 & 0xffff; if (ncqr == 0xffff || nsqr == 0xffff) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; break; } nvmet_set_result(req, @@ -846,14 +846,14 @@ void nvmet_execute_set_features(struct nvmet_req *req) status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL); break; case NVME_FEAT_HOST_ID: - status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + status = NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR; break; case NVME_FEAT_WRITE_PROTECT: status = nvmet_set_feat_write_protect(req); break; default: req->error_loc = offsetof(struct 
nvme_common_command, cdw10); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; break; } @@ -939,7 +939,7 @@ void nvmet_execute_get_features(struct nvmet_req *req) if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) { req->error_loc = offsetof(struct nvme_common_command, cdw11); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; break; } @@ -952,7 +952,7 @@ void nvmet_execute_get_features(struct nvmet_req *req) default: req->error_loc = offsetof(struct nvme_common_command, cdw10); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; break; } @@ -969,7 +969,7 @@ void nvmet_execute_async_event(struct nvmet_req *req) mutex_lock(&ctrl->lock); if (ctrl->nr_async_event_cmds >= NVMET_ASYNC_EVENTS) { mutex_unlock(&ctrl->lock); - nvmet_req_complete(req, NVME_SC_ASYNC_LIMIT | NVME_SC_DNR); + nvmet_req_complete(req, NVME_SC_ASYNC_LIMIT | NVME_STATUS_DNR); return; } ctrl->async_event_cmds[ctrl->nr_async_event_cmds++] = req; @@ -1006,7 +1006,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (nvme_is_fabrics(cmd)) return nvmet_parse_fabrics_admin_cmd(req); if (unlikely(!nvmet_check_auth_status(req))) - return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR; + return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR; if (nvmet_is_disc_subsys(nvmet_req_subsys(req))) return nvmet_parse_discovery_cmd(req); diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 7d2633940f9b..8bc3f431c77f 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -314,7 +314,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, req->sq->dhchap_c1, challenge, shash_len); if (ret) - goto out_free_response; + goto out_free_challenge; } pr_debug("ctrl %d qid %d host response seq %u transaction %d\n", @@ -325,7 +325,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, GFP_KERNEL); if (!shash) { ret = -ENOMEM; - goto out_free_response; + goto out_free_challenge; } shash->tfm = shash_tfm; ret = crypto_shash_init(shash); @@ -361,9 +361,10 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, goto out; ret = crypto_shash_final(shash, response); out: + kfree(shash); +out_free_challenge: if (challenge != req->sq->dhchap_c1) kfree(challenge); - kfree(shash); out_free_response: nvme_auth_free_key(transformed_key); out_free_tfm: @@ -427,14 +428,14 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, req->sq->dhchap_c2, challenge, shash_len); if (ret) - goto out_free_response; + goto out_free_challenge; } shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm), GFP_KERNEL); if (!shash) { ret = -ENOMEM; - goto out_free_response; + goto out_free_challenge; } shash->tfm = shash_tfm; @@ -471,9 +472,10 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, goto out; ret = crypto_shash_final(shash, response); out: + kfree(shash); +out_free_challenge: if (challenge != req->sq->dhchap_c2) kfree(challenge); - kfree(shash); out_free_response: nvme_auth_free_key(transformed_key); out_free_tfm: diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 4ff460ba2826..ed2424f8a396 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -16,6 +16,7 @@ #include "trace.h" #include "nvmet.h" +#include "debugfs.h" struct kmem_cache *nvmet_bvec_cache; struct workqueue_struct *buffered_io_wq; @@ -55,18 +56,18 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno) return 
NVME_SC_SUCCESS; case -ENOSPC: req->error_loc = offsetof(struct nvme_rw_command, length); - return NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; + return NVME_SC_CAP_EXCEEDED | NVME_STATUS_DNR; case -EREMOTEIO: req->error_loc = offsetof(struct nvme_rw_command, slba); - return NVME_SC_LBA_RANGE | NVME_SC_DNR; + return NVME_SC_LBA_RANGE | NVME_STATUS_DNR; case -EOPNOTSUPP: req->error_loc = offsetof(struct nvme_common_command, opcode); switch (req->cmd->common.opcode) { case nvme_cmd_dsm: case nvme_cmd_write_zeroes: - return NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR; + return NVME_SC_ONCS_NOT_SUPPORTED | NVME_STATUS_DNR; default: - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } break; case -ENODATA: @@ -76,7 +77,7 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno) fallthrough; default: req->error_loc = offsetof(struct nvme_common_command, opcode); - return NVME_SC_INTERNAL | NVME_SC_DNR; + return NVME_SC_INTERNAL | NVME_STATUS_DNR; } } @@ -86,7 +87,7 @@ u16 nvmet_report_invalid_opcode(struct nvmet_req *req) req->sq->qid); req->error_loc = offsetof(struct nvme_common_command, opcode); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, @@ -97,7 +98,7 @@ u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, { if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { req->error_loc = offsetof(struct nvme_common_command, dptr); - return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR; } return 0; } @@ -106,7 +107,7 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) { if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { req->error_loc = offsetof(struct nvme_common_command, dptr); - return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR; } return 0; } @@ -115,7 +116,7 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len) { if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) { req->error_loc = offsetof(struct nvme_common_command, dptr); - return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR; } return 0; } @@ -145,7 +146,7 @@ static void nvmet_async_events_failall(struct nvmet_ctrl *ctrl) while (ctrl->nr_async_event_cmds) { req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; mutex_unlock(&ctrl->lock); - nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR); + nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_STATUS_DNR); mutex_lock(&ctrl->lock); } mutex_unlock(&ctrl->lock); @@ -444,7 +445,7 @@ u16 nvmet_req_find_ns(struct nvmet_req *req) req->error_loc = offsetof(struct nvme_common_command, nsid); if (nvmet_subsys_nsid_exists(subsys, nsid)) return NVME_SC_INTERNAL_PATH_ERROR; - return NVME_SC_INVALID_NS | NVME_SC_DNR; + return NVME_SC_INVALID_NS | NVME_STATUS_DNR; } percpu_ref_get(&req->ns->ref); @@ -904,7 +905,7 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) return nvmet_parse_fabrics_io_cmd(req); if (unlikely(!nvmet_check_auth_status(req))) - return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR; + return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR; ret = nvmet_check_ctrl_status(req); if (unlikely(ret)) @@ -967,7 +968,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, /* no support for fused commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | 
NVME_CMD_FUSE_SECOND))) { req->error_loc = offsetof(struct nvme_common_command, flags); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto fail; } @@ -978,7 +979,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, */ if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) { req->error_loc = offsetof(struct nvme_common_command, flags); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto fail; } @@ -996,7 +997,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, trace_nvmet_req_init(req, req->cmd); if (unlikely(!percpu_ref_tryget_live(&sq->ref))) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto fail; } @@ -1023,7 +1024,7 @@ bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len) { if (unlikely(len != req->transfer_len)) { req->error_loc = offsetof(struct nvme_common_command, dptr); - nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); + nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR); return false; } @@ -1035,7 +1036,7 @@ bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len) { if (unlikely(data_len > req->transfer_len)) { req->error_loc = offsetof(struct nvme_common_command, dptr); - nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); + nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR); return false; } @@ -1304,18 +1305,18 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req) if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { pr_err("got cmd %d while CC.EN == 0 on qid = %d\n", req->cmd->common.opcode, req->sq->qid); - return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + return NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR; } if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n", req->cmd->common.opcode, req->sq->qid); - return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + return NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR; } if (unlikely(!nvmet_check_auth_status(req))) { pr_warn("qid %d not authenticated\n", req->sq->qid); - return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR; + return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR; } return 0; } @@ -1389,7 +1390,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, int ret; u16 status; - status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR; subsys = nvmet_find_get_subsys(req->port, subsysnqn); if (!subsys) { pr_warn("connect request for invalid subsystem %s!\n", @@ -1405,7 +1406,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, hostnqn, subsysnqn); req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); up_read(&nvmet_config_sem); - status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR; + status = NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvme_common_command, dptr); goto out_put_subsystem; } @@ -1456,7 +1457,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, subsys->cntlid_min, subsys->cntlid_max, GFP_KERNEL); if (ret < 0) { - status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR; goto out_free_sqs; } ctrl->cntlid = ret; @@ -1479,6 +1480,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, mutex_lock(&subsys->lock); list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); nvmet_setup_p2p_ns_map(ctrl, req); + 
nvmet_debugfs_ctrl_setup(ctrl); mutex_unlock(&subsys->lock); *ctrlp = ctrl; @@ -1513,6 +1515,8 @@ static void nvmet_ctrl_free(struct kref *ref) nvmet_destroy_auth(ctrl); + nvmet_debugfs_ctrl_free(ctrl); + ida_free(&cntlid_ida, ctrl->cntlid); nvmet_async_events_free(ctrl); @@ -1539,6 +1543,14 @@ void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error); +ssize_t nvmet_ctrl_host_traddr(struct nvmet_ctrl *ctrl, + char *traddr, size_t traddr_len) +{ + if (!ctrl->ops->host_traddr) + return -EOPNOTSUPP; + return ctrl->ops->host_traddr(ctrl, traddr, traddr_len); +} + static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, const char *subsysnqn) { @@ -1633,8 +1645,14 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, INIT_LIST_HEAD(&subsys->ctrls); INIT_LIST_HEAD(&subsys->hosts); + ret = nvmet_debugfs_subsys_setup(subsys); + if (ret) + goto free_subsysnqn; + return subsys; +free_subsysnqn: + kfree(subsys->subsysnqn); free_fr: kfree(subsys->firmware_rev); free_mn: @@ -1651,6 +1669,8 @@ static void nvmet_subsys_free(struct kref *ref) WARN_ON_ONCE(!xa_empty(&subsys->namespaces)); + nvmet_debugfs_subsys_free(subsys); + xa_destroy(&subsys->namespaces); nvmet_passthru_subsys_free(subsys); @@ -1705,11 +1725,18 @@ static int __init nvmet_init(void) if (error) goto out_free_nvmet_work_queue; - error = nvmet_init_configfs(); + error = nvmet_init_debugfs(); if (error) goto out_exit_discovery; + + error = nvmet_init_configfs(); + if (error) + goto out_exit_debugfs; + return 0; +out_exit_debugfs: + nvmet_exit_debugfs(); out_exit_discovery: nvmet_exit_discovery(); out_free_nvmet_work_queue: @@ -1726,6 +1753,7 @@ out_destroy_bvec_cache: static void __exit nvmet_exit(void) { nvmet_exit_configfs(); + nvmet_exit_debugfs(); nvmet_exit_discovery(); ida_destroy(&cntlid_ida); destroy_workqueue(nvmet_wq); diff --git a/drivers/nvme/target/debugfs.c b/drivers/nvme/target/debugfs.c new file mode 100644 index 000000000000..cb2befc8619e --- /dev/null +++ b/drivers/nvme/target/debugfs.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DebugFS interface for the NVMe target. 
+ * Copyright (c) 2022-2024 Shadow + * Copyright (c) 2024 SUSE LLC + */ + +#include <linux/debugfs.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> + +#include "nvmet.h" +#include "debugfs.h" + +struct dentry *nvmet_debugfs; + +#define NVMET_DEBUGFS_ATTR(field) \ + static int field##_open(struct inode *inode, struct file *file) \ + { return single_open(file, field##_show, inode->i_private); } \ + \ + static const struct file_operations field##_fops = { \ + .open = field##_open, \ + .read = seq_read, \ + .release = single_release, \ + } + +#define NVMET_DEBUGFS_RW_ATTR(field) \ + static int field##_open(struct inode *inode, struct file *file) \ + { return single_open(file, field##_show, inode->i_private); } \ + \ + static const struct file_operations field##_fops = { \ + .open = field##_open, \ + .read = seq_read, \ + .write = field##_write, \ + .release = single_release, \ + } + +static int nvmet_ctrl_hostnqn_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + + seq_puts(m, ctrl->hostnqn); + return 0; +} +NVMET_DEBUGFS_ATTR(nvmet_ctrl_hostnqn); + +static int nvmet_ctrl_kato_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + + seq_printf(m, "%d\n", ctrl->kato); + return 0; +} +NVMET_DEBUGFS_ATTR(nvmet_ctrl_kato); + +static int nvmet_ctrl_port_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + + seq_printf(m, "%d\n", le16_to_cpu(ctrl->port->disc_addr.portid)); + return 0; +} +NVMET_DEBUGFS_ATTR(nvmet_ctrl_port); + +static const char *const csts_state_names[] = { + [NVME_CSTS_RDY] = "ready", + [NVME_CSTS_CFS] = "fatal", + [NVME_CSTS_NSSRO] = "reset", + [NVME_CSTS_SHST_OCCUR] = "shutdown", + [NVME_CSTS_SHST_CMPLT] = "completed", + [NVME_CSTS_PP] = "paused", +}; + +static int nvmet_ctrl_state_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + bool sep = false; + int i; + + for (i = 0; i < 7; i++) { + int state = BIT(i); + + if (!(ctrl->csts & state)) + continue; + if (sep) + seq_puts(m, "|"); + sep = true; + if (csts_state_names[state]) + seq_puts(m, csts_state_names[state]); + else + seq_printf(m, "%d", state); + } + if (sep) + seq_printf(m, "\n"); + return 0; +} + +static ssize_t nvmet_ctrl_state_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + struct nvmet_ctrl *ctrl = m->private; + char reset[16]; + + if (count >= sizeof(reset)) + return -EINVAL; + if (copy_from_user(reset, buf, count)) + return -EFAULT; + if (!memcmp(reset, "fatal", 5)) + nvmet_ctrl_fatal_error(ctrl); + else + return -EINVAL; + return count; +} +NVMET_DEBUGFS_RW_ATTR(nvmet_ctrl_state); + +static int nvmet_ctrl_host_traddr_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + ssize_t size; + char buf[NVMF_TRADDR_SIZE + 1]; + + size = nvmet_ctrl_host_traddr(ctrl, buf, NVMF_TRADDR_SIZE); + if (size < 0) { + buf[0] = '\0'; + size = 0; + } + buf[size] = '\0'; + seq_printf(m, "%s\n", buf); + return 0; +} +NVMET_DEBUGFS_ATTR(nvmet_ctrl_host_traddr); + +int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl) +{ + char name[32]; + struct dentry *parent = ctrl->subsys->debugfs_dir; + int ret; + + if (!parent) + return -ENODEV; + snprintf(name, sizeof(name), "ctrl%d", ctrl->cntlid); + ctrl->debugfs_dir = debugfs_create_dir(name, parent); + if (IS_ERR(ctrl->debugfs_dir)) { + ret = PTR_ERR(ctrl->debugfs_dir); + ctrl->debugfs_dir = NULL; + return ret; + } + debugfs_create_file("port", S_IRUSR, 
ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_port_fops); + debugfs_create_file("hostnqn", S_IRUSR, ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_hostnqn_fops); + debugfs_create_file("kato", S_IRUSR, ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_kato_fops); + debugfs_create_file("state", S_IRUSR | S_IWUSR, ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_state_fops); + debugfs_create_file("host_traddr", S_IRUSR, ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_host_traddr_fops); + return 0; +} + +void nvmet_debugfs_ctrl_free(struct nvmet_ctrl *ctrl) +{ + debugfs_remove_recursive(ctrl->debugfs_dir); +} + +int nvmet_debugfs_subsys_setup(struct nvmet_subsys *subsys) +{ + int ret = 0; + + subsys->debugfs_dir = debugfs_create_dir(subsys->subsysnqn, + nvmet_debugfs); + if (IS_ERR(subsys->debugfs_dir)) { + ret = PTR_ERR(subsys->debugfs_dir); + subsys->debugfs_dir = NULL; + } + return ret; +} + +void nvmet_debugfs_subsys_free(struct nvmet_subsys *subsys) +{ + debugfs_remove_recursive(subsys->debugfs_dir); +} + +int __init nvmet_init_debugfs(void) +{ + struct dentry *parent; + + parent = debugfs_create_dir("nvmet", NULL); + if (IS_ERR(parent)) { + pr_warn("%s: failed to create debugfs directory\n", "nvmet"); + return PTR_ERR(parent); + } + nvmet_debugfs = parent; + return 0; +} + +void nvmet_exit_debugfs(void) +{ + debugfs_remove_recursive(nvmet_debugfs); +} diff --git a/drivers/nvme/target/debugfs.h b/drivers/nvme/target/debugfs.h new file mode 100644 index 000000000000..cfb8bbf6a297 --- /dev/null +++ b/drivers/nvme/target/debugfs.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DebugFS interface for the NVMe target. + * Copyright (c) 2022-2024 Shadow + * Copyright (c) 2024 SUSE LLC + */ +#ifndef NVMET_DEBUGFS_H +#define NVMET_DEBUGFS_H + +#include <linux/types.h> + +#ifdef CONFIG_NVME_TARGET_DEBUGFS +int nvmet_debugfs_subsys_setup(struct nvmet_subsys *subsys); +void nvmet_debugfs_subsys_free(struct nvmet_subsys *subsys); +int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl); +void nvmet_debugfs_ctrl_free(struct nvmet_ctrl *ctrl); + +int __init nvmet_init_debugfs(void); +void nvmet_exit_debugfs(void); +#else +static inline int nvmet_debugfs_subsys_setup(struct nvmet_subsys *subsys) +{ + return 0; +} +static inline void nvmet_debugfs_subsys_free(struct nvmet_subsys *subsys){} + +static inline int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl) +{ + return 0; +} +static inline void nvmet_debugfs_ctrl_free(struct nvmet_ctrl *ctrl) {} + +static inline int __init nvmet_init_debugfs(void) +{ + return 0; +} + +static inline void nvmet_exit_debugfs(void) {} + +#endif + +#endif /* NVMET_DEBUGFS_H */ diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index ce54da8c6b36..28843df5fa7c 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -179,7 +179,7 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) if (req->cmd->get_log_page.lid != NVME_LOG_DISC) { req->error_loc = offsetof(struct nvme_get_log_page_command, lid); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out; } @@ -187,7 +187,7 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) if (offset & 0x3) { req->error_loc = offsetof(struct nvme_get_log_page_command, lpo); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out; } @@ -256,7 +256,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) if (req->cmd->identify.cns != NVME_ID_CNS_CTRL) { req->error_loc 
= offsetof(struct nvme_identify, cns); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out; } @@ -320,7 +320,7 @@ static void nvmet_execute_disc_set_features(struct nvmet_req *req) default: req->error_loc = offsetof(struct nvme_common_command, cdw10); - stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + stat = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; break; } @@ -345,7 +345,7 @@ static void nvmet_execute_disc_get_features(struct nvmet_req *req) default: req->error_loc = offsetof(struct nvme_common_command, cdw10); - stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + stat = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; break; } @@ -361,7 +361,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) cmd->common.opcode); req->error_loc = offsetof(struct nvme_common_command, opcode); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } switch (cmd->common.opcode) { @@ -386,7 +386,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) default: pr_debug("unhandled cmd %d\n", cmd->common.opcode); req->error_loc = offsetof(struct nvme_common_command, opcode); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } } diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index cb34d644ed08..3f2857c17d95 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -189,26 +189,26 @@ void nvmet_execute_auth_send(struct nvmet_req *req) u8 dhchap_status; if (req->cmd->auth_send.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvmf_auth_send_command, secp); goto done; } if (req->cmd->auth_send.spsp0 != 0x01) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvmf_auth_send_command, spsp0); goto done; } if (req->cmd->auth_send.spsp1 != 0x01) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvmf_auth_send_command, spsp1); goto done; } tl = le32_to_cpu(req->cmd->auth_send.tl); if (!tl) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvmf_auth_send_command, tl); goto done; @@ -437,26 +437,26 @@ void nvmet_execute_auth_receive(struct nvmet_req *req) u16 status = 0; if (req->cmd->auth_receive.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvmf_auth_receive_command, secp); goto done; } if (req->cmd->auth_receive.spsp0 != 0x01) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvmf_auth_receive_command, spsp0); goto done; } if (req->cmd->auth_receive.spsp1 != 0x01) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvmf_auth_receive_command, spsp1); goto done; } al = le32_to_cpu(req->cmd->auth_receive.al); if (!al) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvmf_auth_receive_command, al); goto done; diff --git 
a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 69d77d34bec1..c4b2eddd5666 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -18,7 +18,7 @@ static void nvmet_execute_prop_set(struct nvmet_req *req) if (req->cmd->prop_set.attrib & 1) { req->error_loc = offsetof(struct nvmf_property_set_command, attrib); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out; } @@ -29,7 +29,7 @@ static void nvmet_execute_prop_set(struct nvmet_req *req) default: req->error_loc = offsetof(struct nvmf_property_set_command, offset); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } out: nvmet_req_complete(req, status); @@ -50,7 +50,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req) val = ctrl->cap; break; default: - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; break; } } else { @@ -65,7 +65,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req) val = ctrl->csts; break; default: - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; break; } } @@ -105,7 +105,7 @@ u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req) pr_debug("received unknown capsule type 0x%x\n", cmd->fabrics.fctype); req->error_loc = offsetof(struct nvmf_common_command, fctype); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } return 0; @@ -128,7 +128,7 @@ u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req) pr_debug("received unknown capsule type 0x%x\n", cmd->fabrics.fctype); req->error_loc = offsetof(struct nvmf_common_command, fctype); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } return 0; @@ -147,14 +147,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) pr_warn("queue size zero!\n"); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); - ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR; goto err; } if (ctrl->sqs[qid] != NULL) { pr_warn("qid %u has already been created\n", qid); req->error_loc = offsetof(struct nvmf_connect_command, qid); - return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + return NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR; } /* for fabrics, this value applies to only the I/O Submission Queues */ @@ -163,14 +163,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) sqsize, mqes, ctrl->cntlid); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); - return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + return NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR; } old = cmpxchg(&req->sq->ctrl, NULL, ctrl); if (old) { pr_warn("queue already connected!\n"); req->error_loc = offsetof(struct nvmf_connect_command, opcode); - return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR; } /* note: convert queue size from 0's-based value to 1's-based value */ @@ -230,14 +230,14 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) pr_warn("invalid connect version (%d).\n", le16_to_cpu(c->recfmt)); req->error_loc = offsetof(struct nvmf_connect_command, recfmt); - status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + status = NVME_SC_CONNECT_FORMAT | 
NVME_STATUS_DNR; goto out; } if (unlikely(d->cntlid != cpu_to_le16(0xffff))) { pr_warn("connect attempt for invalid controller ID %#x\n", d->cntlid); - status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR; req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid); goto out; } @@ -257,7 +257,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) dhchap_status); nvmet_ctrl_put(ctrl); if (dhchap_status == NVME_AUTH_DHCHAP_FAILURE_FAILED) - status = (NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR); + status = (NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR); else status = NVME_SC_INTERNAL; goto out; @@ -305,7 +305,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) if (c->recfmt != 0) { pr_warn("invalid connect version (%d).\n", le16_to_cpu(c->recfmt)); - status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + status = NVME_SC_CONNECT_FORMAT | NVME_STATUS_DNR; goto out; } @@ -314,13 +314,13 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) ctrl = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, le16_to_cpu(d->cntlid), req); if (!ctrl) { - status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR; goto out; } if (unlikely(qid > ctrl->subsys->max_qid)) { pr_warn("invalid queue id (%d)\n", qid); - status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR; req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(qid); goto out_ctrl_put; } @@ -350,13 +350,13 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req) pr_debug("invalid command 0x%x on unconnected queue.\n", cmd->fabrics.opcode); req->error_loc = offsetof(struct nvme_common_command, opcode); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { pr_debug("invalid capsule type 0x%x on unconnected queue.\n", cmd->fabrics.fctype); req->error_loc = offsetof(struct nvmf_common_command, fctype); - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } if (cmd->connect.qid == 0) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 381b4394731f..3ef4beacde32 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2934,6 +2934,38 @@ nvmet_fc_discovery_chg(struct nvmet_port *port) tgtport->ops->discovery_event(&tgtport->fc_target_port); } +static ssize_t +nvmet_fc_host_traddr(struct nvmet_ctrl *ctrl, + char *traddr, size_t traddr_size) +{ + struct nvmet_sq *sq = ctrl->sqs[0]; + struct nvmet_fc_tgt_queue *queue = + container_of(sq, struct nvmet_fc_tgt_queue, nvme_sq); + struct nvmet_fc_tgtport *tgtport = queue->assoc ? queue->assoc->tgtport : NULL; + struct nvmet_fc_hostport *hostport = queue->assoc ? 
queue->assoc->hostport : NULL; + u64 wwnn, wwpn; + ssize_t ret = 0; + + if (!tgtport || !nvmet_fc_tgtport_get(tgtport)) + return -ENODEV; + if (!hostport || !nvmet_fc_hostport_get(hostport)) { + ret = -ENODEV; + goto out_put; + } + + if (tgtport->ops->host_traddr) { + ret = tgtport->ops->host_traddr(hostport->hosthandle, &wwnn, &wwpn); + if (ret) + goto out_put_host; + ret = snprintf(traddr, traddr_size, "nn-0x%llx:pn-0x%llx", wwnn, wwpn); + } +out_put_host: + nvmet_fc_hostport_put(hostport); +out_put: + nvmet_fc_tgtport_put(tgtport); + return ret; +} + static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_FC, @@ -2943,6 +2975,7 @@ static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { .queue_response = nvmet_fc_fcp_nvme_cmd_done, .delete_ctrl = nvmet_fc_delete_ctrl, .discovery_chg = nvmet_fc_discovery_chg, + .host_traddr = nvmet_fc_host_traddr, }; static int __init nvmet_fc_init_module(void) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 913cd2ec7a6f..e1abb27927ff 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -492,6 +492,16 @@ fcloop_t2h_host_release(void *hosthandle) /* host handle ignored for now */ } +static int +fcloop_t2h_host_traddr(void *hosthandle, u64 *wwnn, u64 *wwpn) +{ + struct fcloop_rport *rport = hosthandle; + + *wwnn = rport->lport->localport->node_name; + *wwpn = rport->lport->localport->port_name; + return 0; +} + /* * Simulate reception of RSCN and converting it to a initiator transport * call to rescan a remote port. @@ -1074,6 +1084,7 @@ static struct nvmet_fc_target_template tgttemplate = { .ls_req = fcloop_t2h_ls_req, .ls_abort = fcloop_t2h_ls_abort, .host_release = fcloop_t2h_host_release, + .host_traddr = fcloop_t2h_host_traddr, .max_hw_queues = FCLOOP_HW_QUEUES, .max_sgl_segments = FCLOOP_SGL_SEGS, .max_dif_sgl_segments = FCLOOP_SGL_SEGS, diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 6426aac2634a..0bda83d0fc3e 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -61,15 +61,17 @@ static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns) { struct blk_integrity *bi = bdev_get_integrity(ns->bdev); - if (bi) { + if (!bi) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC) { ns->metadata_size = bi->tuple_size; - if (bi->profile == &t10_pi_type1_crc) + if (bi->flags & BLK_INTEGRITY_REF_TAG) ns->pi_type = NVME_NS_DPS_PI_TYPE1; - else if (bi->profile == &t10_pi_type3_crc) - ns->pi_type = NVME_NS_DPS_PI_TYPE3; else - /* Unsupported metadata type */ - ns->metadata_size = 0; + ns->pi_type = NVME_NS_DPS_PI_TYPE3; + } else { + ns->metadata_size = 0; } } @@ -102,7 +104,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) ns->pi_type = 0; ns->metadata_size = 0; - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY_T10)) + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) nvmet_bdev_ns_enable_integrity(ns); if (bdev_is_zoned(ns->bdev)) { @@ -135,11 +137,11 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) */ switch (blk_sts) { case BLK_STS_NOSPC: - status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; + status = NVME_SC_CAP_EXCEEDED | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvme_rw_command, length); break; case BLK_STS_TARGET: - status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + status = NVME_SC_LBA_RANGE | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvme_rw_command, slba); break; case BLK_STS_NOTSUPP: @@ -147,10 +149,10 @@ u16 blk_to_nvme_status(struct nvmet_req 
*req, blk_status_t blk_sts) switch (req->cmd->common.opcode) { case nvme_cmd_dsm: case nvme_cmd_write_zeroes: - status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR; + status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_STATUS_DNR; break; default: - status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } break; case BLK_STS_MEDIUM: @@ -159,7 +161,7 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) break; case BLK_STS_IOERR: default: - status = NVME_SC_INTERNAL | NVME_SC_DNR; + status = NVME_SC_INTERNAL | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvme_common_command, opcode); } @@ -356,7 +358,7 @@ u16 nvmet_bdev_flush(struct nvmet_req *req) return 0; if (blkdev_issue_flush(req->ns->bdev)) - return NVME_SC_INTERNAL | NVME_SC_DNR; + return NVME_SC_INTERNAL | NVME_STATUS_DNR; return 0; } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index e589915ddef8..e32790d8fc26 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -555,6 +555,10 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, goto out; } + ret = nvme_add_ctrl(&ctrl->ctrl); + if (ret) + goto out_put_ctrl; + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) WARN_ON_ONCE(1); @@ -611,6 +615,7 @@ out_free_queues: kfree(ctrl->queues); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); out: if (ret > 0) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 2f22b07eab29..190f55e6d753 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -230,7 +230,9 @@ struct nvmet_ctrl { struct device *p2p_client; struct radix_tree_root p2p_ns_map; - +#ifdef CONFIG_NVME_TARGET_DEBUGFS + struct dentry *debugfs_dir; +#endif spinlock_t error_lock; u64 err_counter; struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; @@ -262,7 +264,9 @@ struct nvmet_subsys { struct list_head hosts; bool allow_any_host; - +#ifdef CONFIG_NVME_TARGET_DEBUGFS + struct dentry *debugfs_dir; +#endif u16 max_qid; u64 ver; @@ -350,6 +354,8 @@ struct nvmet_fabrics_ops { void (*delete_ctrl)(struct nvmet_ctrl *ctrl); void (*disc_traddr)(struct nvmet_req *req, struct nvmet_port *port, char *traddr); + ssize_t (*host_traddr)(struct nvmet_ctrl *ctrl, + char *traddr, size_t traddr_len); u16 (*install_queue)(struct nvmet_sq *nvme_sq); void (*discovery_chg)(struct nvmet_port *port); u8 (*get_mdts)(const struct nvmet_ctrl *ctrl); @@ -498,6 +504,8 @@ struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, struct nvmet_req *req); void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); u16 nvmet_check_ctrl_status(struct nvmet_req *req); +ssize_t nvmet_ctrl_host_traddr(struct nvmet_ctrl *ctrl, + char *traddr, size_t traddr_len); struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, enum nvme_subsys_type type); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index f003782d4ecf..24d0e2418d2e 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -306,7 +306,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) ns = nvme_find_get_ns(ctrl, nsid); if (unlikely(!ns)) { pr_err("failed to get passthru ns nsid:%u\n", nsid); - status = NVME_SC_INVALID_NS | NVME_SC_DNR; + status = NVME_SC_INVALID_NS | NVME_STATUS_DNR; goto out; } @@ -426,7 +426,7 @@ u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) * emulated in the future if regular targets grow support for * this feature. 
*/ - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } return nvmet_setup_passthru_command(req); @@ -478,7 +478,7 @@ static u16 nvmet_passthru_get_set_features(struct nvmet_req *req) case NVME_FEAT_RESV_PERSIST: /* No reservations, see nvmet_parse_passthru_io_cmd() */ default: - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; } } @@ -546,7 +546,7 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) req->p.use_workqueue = true; return NVME_SC_SUCCESS; } - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; case NVME_ID_CNS_NS: req->execute = nvmet_passthru_execute_cmd; req->p.use_workqueue = true; @@ -558,7 +558,7 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) req->p.use_workqueue = true; return NVME_SC_SUCCESS; } - return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; default: return nvmet_setup_passthru_command(req); } diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 689bb5d3cfdc..1eff8ca6a5f1 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -852,12 +852,12 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) if (!nvme_is_write(rsp->req.cmd)) { rsp->req.error_loc = offsetof(struct nvme_common_command, opcode); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } if (off + len > rsp->queue->dev->inline_data_size) { pr_err("invalid inline data offset!\n"); - return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + return NVME_SC_SGL_INVALID_OFFSET | NVME_STATUS_DNR; } /* no data command? */ @@ -919,7 +919,7 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) pr_err("invalid SGL subtype: %#x\n", sgl->type); rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } case NVME_KEY_SGL_FMT_DATA_DESC: switch (sgl->type & 0xf) { @@ -931,12 +931,12 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) pr_err("invalid SGL subtype: %#x\n", sgl->type); rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } default: pr_err("invalid SGL type: %#x\n", sgl->type); rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); - return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; + return NVME_SC_SGL_INVALID_TYPE | NVME_STATUS_DNR; } } @@ -2000,6 +2000,17 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, } } +static ssize_t nvmet_rdma_host_port_addr(struct nvmet_ctrl *ctrl, + char *traddr, size_t traddr_len) +{ + struct nvmet_sq *nvme_sq = ctrl->sqs[0]; + struct nvmet_rdma_queue *queue = + container_of(nvme_sq, struct nvmet_rdma_queue, nvme_sq); + + return snprintf(traddr, traddr_len, "%pISc", + (struct sockaddr *)&queue->cm_id->route.addr.dst_addr); +} + static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) { if (ctrl->pi_support) @@ -2024,6 +2035,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .queue_response = nvmet_rdma_queue_response, .delete_ctrl = nvmet_rdma_delete_ctrl, .disc_traddr = nvmet_rdma_disc_port_addr, + .host_traddr = nvmet_rdma_host_port_addr, .get_mdts = nvmet_rdma_get_mdts, .get_max_queue_size = nvmet_rdma_get_max_queue_size, }; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 
380f22ee3ebb..5bff0d5464d1 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -416,10 +416,10 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET)) { if (!nvme_is_write(cmd->req.cmd)) - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; if (len > cmd->req.port->inline_data_size) - return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + return NVME_SC_SGL_INVALID_OFFSET | NVME_STATUS_DNR; cmd->pdu_len = len; } cmd->req.transfer_len += len; @@ -2167,6 +2167,19 @@ static void nvmet_tcp_disc_port_addr(struct nvmet_req *req, } } +static ssize_t nvmet_tcp_host_port_addr(struct nvmet_ctrl *ctrl, + char *traddr, size_t traddr_len) +{ + struct nvmet_sq *sq = ctrl->sqs[0]; + struct nvmet_tcp_queue *queue = + container_of(sq, struct nvmet_tcp_queue, nvme_sq); + + if (queue->sockaddr_peer.ss_family == AF_UNSPEC) + return -EINVAL; + return snprintf(traddr, traddr_len, "%pISc", + (struct sockaddr *)&queue->sockaddr_peer); +} + static const struct nvmet_fabrics_ops nvmet_tcp_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_TCP, @@ -2177,6 +2190,7 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops = { .delete_ctrl = nvmet_tcp_delete_ctrl, .install_queue = nvmet_tcp_install_queue, .disc_traddr = nvmet_tcp_disc_port_addr, + .host_traddr = nvmet_tcp_host_port_addr, }; static int __init nvmet_tcp_init(void) diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 0021d06041c1..af9e13be7678 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -100,7 +100,7 @@ void nvmet_execute_identify_ns_zns(struct nvmet_req *req) if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { req->error_loc = offsetof(struct nvme_identify, nsid); - status = NVME_SC_INVALID_NS | NVME_SC_DNR; + status = NVME_SC_INVALID_NS | NVME_STATUS_DNR; goto out; } @@ -121,7 +121,7 @@ void nvmet_execute_identify_ns_zns(struct nvmet_req *req) } if (!bdev_is_zoned(req->ns->bdev)) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvme_identify, nsid); goto out; } @@ -158,17 +158,17 @@ static u16 nvmet_bdev_validate_zone_mgmt_recv(struct nvmet_req *req) if (sect >= get_capacity(req->ns->bdev->bd_disk)) { req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, slba); - return NVME_SC_LBA_RANGE | NVME_SC_DNR; + return NVME_SC_LBA_RANGE | NVME_STATUS_DNR; } if (out_bufsize < sizeof(struct nvme_zone_report)) { req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, numd); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) { req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, zra); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } switch (req->cmd->zmr.pr) { @@ -177,7 +177,7 @@ static u16 nvmet_bdev_validate_zone_mgmt_recv(struct nvmet_req *req) break; default: req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, pr); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } switch (req->cmd->zmr.zrasf) { @@ -193,7 +193,7 @@ static u16 nvmet_bdev_validate_zone_mgmt_recv(struct nvmet_req *req) default: req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, zrasf); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } return NVME_SC_SUCCESS; @@ -341,7 
+341,7 @@ static u16 blkdev_zone_mgmt_errno_to_nvme_status(int ret) return NVME_SC_SUCCESS; case -EINVAL: case -EIO: - return NVME_SC_ZONE_INVALID_TRANSITION | NVME_SC_DNR; + return NVME_SC_ZONE_INVALID_TRANSITION | NVME_STATUS_DNR; default: return NVME_SC_INTERNAL; } @@ -463,7 +463,7 @@ static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req) default: /* this is needed to quiet compiler warning */ req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa); - return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; } return NVME_SC_SUCCESS; @@ -481,7 +481,7 @@ static void nvmet_bdev_zmgmt_send_work(struct work_struct *w) if (op == REQ_OP_LAST) { req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa); - status = NVME_SC_ZONE_INVALID_TRANSITION | NVME_SC_DNR; + status = NVME_SC_ZONE_INVALID_TRANSITION | NVME_STATUS_DNR; goto out; } @@ -493,13 +493,13 @@ static void nvmet_bdev_zmgmt_send_work(struct work_struct *w) if (sect >= get_capacity(bdev->bd_disk)) { req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba); - status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + status = NVME_SC_LBA_RANGE | NVME_STATUS_DNR; goto out; } if (sect & (zone_sectors - 1)) { req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out; } @@ -551,13 +551,13 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) if (sect >= get_capacity(req->ns->bdev->bd_disk)) { req->error_loc = offsetof(struct nvme_rw_command, slba); - status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + status = NVME_SC_LBA_RANGE | NVME_STATUS_DNR; goto out; } if (sect & (bdev_zone_sectors(req->ns->bdev) - 1)) { req->error_loc = offsetof(struct nvme_rw_command, slba); - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out; } @@ -590,7 +590,7 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) } if (total_len != nvmet_rw_data_len(req)) { - status = NVME_SC_INTERNAL | NVME_SC_DNR; + status = NVME_SC_INTERNAL | NVME_STATUS_DNR; goto out_put_bio; } diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index e1ec3b7200d7..f8dd7eb40fbe 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -396,10 +396,9 @@ static int nvmem_sysfs_setup_compat(struct nvmem_device *nvmem, if (!config->base_dev) return -EINVAL; - if (config->type == NVMEM_TYPE_FRAM) - bin_attr_nvmem_eeprom_compat.attr.name = "fram"; - nvmem->eeprom = bin_attr_nvmem_eeprom_compat; + if (config->type == NVMEM_TYPE_FRAM) + nvmem->eeprom.attr.name = "fram"; nvmem->eeprom.attr.mode = nvmem_bin_attr_get_umode(nvmem); nvmem->eeprom.size = nvmem->size; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -463,7 +462,7 @@ static int nvmem_populate_sysfs_cells(struct nvmem_device *nvmem) "%s@%x,%x", entry->name, entry->offset, entry->bit_offset); - attrs[i].attr.mode = 0444; + attrs[i].attr.mode = 0444 & nvmem_bin_attr_get_umode(nvmem); attrs[i].size = entry->bytes; attrs[i].read = &nvmem_cell_attr_read; attrs[i].private = entry; diff --git a/drivers/nvmem/meson-efuse.c b/drivers/nvmem/meson-efuse.c index 33678d0af2c2..6c2f80e166e2 100644 --- a/drivers/nvmem/meson-efuse.c +++ b/drivers/nvmem/meson-efuse.c @@ -18,18 +18,24 @@ static int meson_efuse_read(void *context, unsigned int offset, void *val, size_t bytes) { struct meson_sm_firmware *fw = context; + int ret; - return meson_sm_call_read(fw, (u8 *)val, bytes, SM_EFUSE_READ, offset, - bytes, 0, 0, 0); + ret = 
meson_sm_call_read(fw, (u8 *)val, bytes, SM_EFUSE_READ, offset, + bytes, 0, 0, 0); + + return ret < 0 ? ret : 0; } static int meson_efuse_write(void *context, unsigned int offset, void *val, size_t bytes) { struct meson_sm_firmware *fw = context; + int ret; + + ret = meson_sm_call_write(fw, (u8 *)val, bytes, SM_EFUSE_WRITE, offset, + bytes, 0, 0, 0); - return meson_sm_call_write(fw, (u8 *)val, bytes, SM_EFUSE_WRITE, offset, - bytes, 0, 0, 0); + return ret < 0 ? ret : 0; } static const struct of_device_id meson_efuse_match[] = { diff --git a/drivers/nvmem/rmem.c b/drivers/nvmem/rmem.c index 752d0bf4445e..7f907c5a445e 100644 --- a/drivers/nvmem/rmem.c +++ b/drivers/nvmem/rmem.c @@ -46,7 +46,10 @@ static int rmem_read(void *context, unsigned int offset, memunmap(addr); - return count; + if (count < 0) + return count; + + return count == bytes ? 0 : -EIO; } static int rmem_probe(struct platform_device *pdev) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 462375b293e4..c94203ce65bb 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -81,7 +81,8 @@ EXPORT_SYMBOL_GPL(of_irq_find_parent); /* * These interrupt controllers abuse interrupt-map for unspeakable * reasons and rely on the core code to *ignore* it (the drivers do - * their own parsing of the property). + * their own parsing of the property). The PAsemi entry covers a + * non-sensical interrupt-map that is better left ignored. * * If you think of adding to the list for something *new*, think * again. There is a high chance that you will be sent back to the @@ -95,6 +96,7 @@ static const char * const of_irq_imap_abusers[] = { "fsl,ls1043a-extirq", "fsl,ls1088a-extirq", "renesas,rza1-irqc", + "pasemi,rootbus", NULL, }; @@ -293,20 +295,8 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) imaplen -= imap - oldimap; pr_debug(" -> imaplen=%d\n", imaplen); } - if (!match) { - if (intc) { - /* - * The PASEMI Nemo is a known offender, so - * let's only warn for anyone else. 
- */ - WARN(!IS_ENABLED(CONFIG_PPC_PASEMI), - "%pOF interrupt-map failed, using interrupt-controller\n", - ipar); - return 0; - } - + if (!match) goto fail; - } /* * Successfully parsed an interrupt-map translation; copy new diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c index 78c490e0505a..0a02e85a8951 100644 --- a/drivers/perf/riscv_pmu.c +++ b/drivers/perf/riscv_pmu.c @@ -167,7 +167,7 @@ u64 riscv_pmu_event_update(struct perf_event *event) unsigned long cmask; u64 oldval, delta; - if (!rvpmu->ctr_read) + if (!rvpmu->ctr_read || (hwc->state & PERF_HES_UPTODATE)) return 0; cmask = riscv_pmu_ctr_get_width_mask(event); diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index a2e4005e1fd0..4e842dcedfba 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -20,6 +20,7 @@ #include <linux/cpu_pm.h> #include <linux/sched/clock.h> #include <linux/soc/andes/irq.h> +#include <linux/workqueue.h> #include <asm/errata_list.h> #include <asm/sbi.h> @@ -114,7 +115,7 @@ struct sbi_pmu_event_data { }; }; -static const struct sbi_pmu_event_data pmu_hw_event_map[] = { +static struct sbi_pmu_event_data pmu_hw_event_map[] = { [PERF_COUNT_HW_CPU_CYCLES] = {.hw_gen_event = { SBI_PMU_HW_CPU_CYCLES, SBI_PMU_EVENT_TYPE_HW, 0}}, @@ -148,7 +149,7 @@ static const struct sbi_pmu_event_data pmu_hw_event_map[] = { }; #define C(x) PERF_COUNT_HW_CACHE_##x -static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX] +static struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = { [C(L1D)] = { @@ -293,6 +294,34 @@ static const struct sbi_pmu_event_data pmu_cache_event_map[PERF_COUNT_HW_CACHE_M }, }; +static void pmu_sbi_check_event(struct sbi_pmu_event_data *edata) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, + 0, cmask, 0, edata->event_idx, 0, 0); + if (!ret.error) { + sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, + ret.value, 0x1, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0); + } else if (ret.error == SBI_ERR_NOT_SUPPORTED) { + /* This event cannot be monitored by any counter */ + edata->event_idx = -EINVAL; + } +} + +static void pmu_sbi_check_std_events(struct work_struct *work) +{ + for (int i = 0; i < ARRAY_SIZE(pmu_hw_event_map); i++) + pmu_sbi_check_event(&pmu_hw_event_map[i]); + + for (int i = 0; i < ARRAY_SIZE(pmu_cache_event_map); i++) + for (int j = 0; j < ARRAY_SIZE(pmu_cache_event_map[i]); j++) + for (int k = 0; k < ARRAY_SIZE(pmu_cache_event_map[i][j]); k++) + pmu_sbi_check_event(&pmu_cache_event_map[i][j][k]); +} + +static DECLARE_WORK(check_std_events_work, pmu_sbi_check_std_events); + static int pmu_sbi_ctr_get_width(int idx) { return pmu_ctr_list[idx].width; @@ -478,6 +507,12 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) u64 raw_config_val; int ret; + /* + * Ensure we are finished checking standard hardware events for + * validity before allowing userspace to configure any events. + */ + flush_work(&check_std_events_work); + switch (type) { case PERF_TYPE_HARDWARE: if (config >= PERF_COUNT_HW_MAX) @@ -762,7 +797,7 @@ static inline void pmu_sbi_stop_all(struct riscv_pmu *pmu) * which may include counters that are not enabled yet. 
*/ sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, - 0, pmu->cmask, 0, 0, 0, 0); + 0, pmu->cmask, SBI_PMU_STOP_FLAG_RESET, 0, 0, 0); } static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu) @@ -1359,6 +1394,9 @@ static int pmu_sbi_device_probe(struct platform_device *pdev) if (ret) goto out_unregister; + /* Asynchronously check which standard events are available */ + schedule_work(&check_std_events_work); + return 0; out_unregister: diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 3a8d8df89186..78a5aac2dcfd 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -3271,7 +3271,7 @@ static const char *find_hci_method(acpi_handle handle) */ #define QUIRK_HCI_HOTKEY_QUICKSTART BIT(1) -static const struct dmi_system_id toshiba_dmi_quirks[] = { +static const struct dmi_system_id toshiba_dmi_quirks[] __initconst = { { /* Toshiba Portégé R700 */ /* https://bugzilla.kernel.org/show_bug.cgi?id=21012 */ @@ -3299,6 +3299,7 @@ static const struct dmi_system_id toshiba_dmi_quirks[] = { }, .driver_data = (void *)(QUIRK_TURN_ON_PANEL_ON_RESUME | QUIRK_HCI_HOTKEY_QUICKSTART), }, + { } }; static int toshiba_acpi_add(struct acpi_device *acpi_dev) @@ -3306,8 +3307,6 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev) struct toshiba_acpi_dev *dev; const char *hci_method; u32 dummy; - const struct dmi_system_id *dmi_id; - long quirks = 0; int ret = 0; if (toshiba_acpi) @@ -3460,16 +3459,6 @@ iio_error: } #endif - dmi_id = dmi_first_match(toshiba_dmi_quirks); - if (dmi_id) - quirks = (long)dmi_id->driver_data; - - if (turn_on_panel_on_resume == -1) - turn_on_panel_on_resume = !!(quirks & QUIRK_TURN_ON_PANEL_ON_RESUME); - - if (hci_hotkey_quickstart == -1) - hci_hotkey_quickstart = !!(quirks & QUIRK_HCI_HOTKEY_QUICKSTART); - toshiba_wwan_available(dev); if (dev->wwan_supported) toshiba_acpi_setup_wwan_rfkill(dev); @@ -3618,10 +3607,27 @@ static struct acpi_driver toshiba_acpi_driver = { .drv.pm = &toshiba_acpi_pm, }; +static void __init toshiba_dmi_init(void) +{ + const struct dmi_system_id *dmi_id; + long quirks = 0; + + dmi_id = dmi_first_match(toshiba_dmi_quirks); + if (dmi_id) + quirks = (long)dmi_id->driver_data; + + if (turn_on_panel_on_resume == -1) + turn_on_panel_on_resume = !!(quirks & QUIRK_TURN_ON_PANEL_ON_RESUME); + + if (hci_hotkey_quickstart == -1) + hci_hotkey_quickstart = !!(quirks & QUIRK_HCI_HOTKEY_QUICKSTART); +} + static int __init toshiba_acpi_init(void) { int ret; + toshiba_dmi_init(); toshiba_proc_dir = proc_mkdir(PROC_TOSHIBA, acpi_root_dir); if (!toshiba_proc_dir) { pr_err("Unable to create proc dir " PROC_TOSHIBA "\n"); diff --git a/drivers/pmdomain/qcom/rpmhpd.c b/drivers/pmdomain/qcom/rpmhpd.c index de9121ef4216..d2cb4271a1ca 100644 --- a/drivers/pmdomain/qcom/rpmhpd.c +++ b/drivers/pmdomain/qcom/rpmhpd.c @@ -40,6 +40,7 @@ * @addr: Resource address as looped up using resource name from * cmd-db * @state_synced: Indicator that sync_state has been invoked for the rpmhpd resource + * @skip_retention_level: Indicate that retention level should not be used for the power domain */ struct rpmhpd { struct device *dev; @@ -56,6 +57,7 @@ struct rpmhpd { const char *res_name; u32 addr; bool state_synced; + bool skip_retention_level; }; struct rpmhpd_desc { @@ -173,6 +175,7 @@ static struct rpmhpd mxc = { .pd = { .name = "mxc", }, .peer = &mxc_ao, .res_name = "mxc.lvl", + .skip_retention_level = true, }; static struct rpmhpd mxc_ao = { @@ -180,6 +183,7 @@ static struct rpmhpd mxc_ao = { 
.active_only = true, .peer = &mxc, .res_name = "mxc.lvl", + .skip_retention_level = true, }; static struct rpmhpd nsp = { @@ -819,6 +823,9 @@ static int rpmhpd_update_level_mapping(struct rpmhpd *rpmhpd) return -EINVAL; for (i = 0; i < rpmhpd->level_count; i++) { + if (rpmhpd->skip_retention_level && buf[i] == RPMH_REGULATOR_LEVEL_RETENTION) + continue; + rpmhpd->level[i] = buf[i]; /* Remember the first corner with non-zero level */ diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index 7112f5932609..6bb5d9e372e4 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -68,6 +68,7 @@ config RESET_BRCMSTB_RESCAL config RESET_GPIO tristate "GPIO reset controller" + depends on GPIOLIB help This enables a generic reset controller for resets attached via GPIOs. Typically for OF platforms this driver expects "reset-gpios" diff --git a/drivers/reset/hisilicon/hi6220_reset.c b/drivers/reset/hisilicon/hi6220_reset.c index 5c3267acd2b1..65aa5ff5ed82 100644 --- a/drivers/reset/hisilicon/hi6220_reset.c +++ b/drivers/reset/hisilicon/hi6220_reset.c @@ -219,4 +219,5 @@ static int __init hi6220_reset_init(void) postcore_initcall(hi6220_reset_init); +MODULE_DESCRIPTION("Hisilicon Hi6220 reset controller driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 2f16f543079b..a76c6af9ea63 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -4906,7 +4906,7 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req) ccw++; if (dst) { if (ccw->flags & CCW_FLAG_IDA) - cda = *((char **)dma32_to_virt(ccw->cda)); + cda = dma64_to_virt(*((dma64_t *)dma32_to_virt(ccw->cda))); else cda = dma32_to_virt(ccw->cda); if (dst != cda) { @@ -5525,7 +5525,7 @@ dasd_eckd_dump_ccw_range(struct dasd_device *device, struct ccw1 *from, /* get pointer to data (consider IDALs) */ if (from->flags & CCW_FLAG_IDA) - datap = (char *)*((addr_t *)dma32_to_virt(from->cda)); + datap = dma64_to_virt(*((dma64_t *)dma32_to_virt(from->cda))); else datap = dma32_to_virt(from->cda); diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 361e9bd75257..9f2023a077c2 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -585,7 +585,7 @@ dasd_fba_free_cp(struct dasd_ccw_req *cqr, struct request *req) ccw++; if (dst) { if (ccw->flags & CCW_FLAG_IDA) - cda = *((char **)dma32_to_virt(ccw->cda)); + cda = dma64_to_virt(*((dma64_t *)dma32_to_virt(ccw->cda))); else cda = dma32_to_virt(ccw->cda); if (dst != cda) { diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 4533dd055ca8..1aa426b1dedd 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -68,7 +68,6 @@ int dasd_gendisk_alloc(struct dasd_block *block) blk_mq_free_tag_set(&block->tag_set); return PTR_ERR(gdp); } - blk_queue_flag_set(QUEUE_FLAG_NONROT, gdp->queue); /* Initialize gendisk structure. 
*/ gdp->major = DASD_MAJOR; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 6d1689a2717e..d5a5d11ae0dc 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -548,6 +548,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char { struct queue_limits lim = { .logical_block_size = 4096, + .features = BLK_FEAT_DAX, }; int rc, i, j, num_of_segments; struct dcssblk_dev_info *dev_info; @@ -643,7 +644,6 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->gd->fops = &dcssblk_devops; dev_info->gd->private_data = dev_info; dev_info->gd->flags |= GENHD_FL_NO_PART; - blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->gd->queue); seg_byte_size = (dev_info->end - dev_info->start + 1); set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 1d456a5a3bfb..3fcfe029db1b 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -439,7 +439,6 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) .logical_block_size = 1 << 12, }; unsigned int devindex; - struct request_queue *rq; int len, ret; lim.max_segments = min(scmdev->nr_max_block, @@ -474,10 +473,6 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) ret = PTR_ERR(bdev->gendisk); goto out_tag; } - rq = bdev->rq = bdev->gendisk->queue; - blk_queue_flag_set(QUEUE_FLAG_NONROT, rq); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, rq); - bdev->gendisk->private_data = scmdev; bdev->gendisk->fops = &scm_blk_devops; bdev->gendisk->major = scm_major; diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 6e5c508b1e07..5f6e10225627 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -490,13 +490,14 @@ static int ccwchain_fetch_tic(struct ccw1 *ccw, struct channel_program *cp) { struct ccwchain *iter; - u32 cda, ccw_head; + u32 offset, ccw_head; list_for_each_entry(iter, &cp->ccwchain_list, next) { ccw_head = iter->ch_iova; if (is_cpa_within_range(ccw->cda, ccw_head, iter->ch_len)) { - cda = (u64)iter->ch_ccw + dma32_to_u32(ccw->cda) - ccw_head; - ccw->cda = u32_to_dma32(cda); + /* Calculate offset of TIC target */ + offset = dma32_to_u32(ccw->cda) - ccw_head; + ccw->cda = virt_to_dma32((void *)iter->ch_ccw + offset); return 0; } } @@ -914,7 +915,7 @@ void cp_update_scsw(struct channel_program *cp, union scsw *scsw) * in the ioctl directly. Path status changes etc. */ list_for_each_entry(chain, &cp->ccwchain_list, next) { - ccw_head = (u32)(u64)chain->ch_ccw; + ccw_head = dma32_to_u32(virt_to_dma32(chain->ch_ccw)); /* * On successful execution, cpa points just beyond the end * of the chain. 
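The s390 block hunks above (dasd_genhd, dcssblk, scm_blk) share one theme: queue flags such as QUEUE_FLAG_NONROT and QUEUE_FLAG_DAX are no longer toggled on the live request queue after the gendisk exists; features are instead declared in the queue_limits passed at allocation time, as dcssblk now does with BLK_FEAT_DAX. The fragment below is a minimal illustrative sketch of that allocation-time style and is not part of the patch: the .features and .logical_block_size values are the ones visible in the dcssblk hunk, while the blk_alloc_disk() call and the example_ name are assumptions about the surrounding code that the hunk does not show.

/*
 * Minimal sketch only, not taken from the patch: the queue_limits-based
 * style the hunks above convert to.
 */
#include <linux/blkdev.h>
#include <linux/numa.h>

static struct gendisk *example_alloc_dax_disk(void)
{
	/*
	 * Block size and features are declared up front, so no later
	 * blk_queue_flag_set(QUEUE_FLAG_DAX, gd->queue) call is needed.
	 */
	struct queue_limits lim = {
		.logical_block_size	= 4096,
		.features		= BLK_FEAT_DAX,
	};

	/* Returns an ERR_PTR on failure, a ready-to-use gendisk otherwise. */
	return blk_alloc_disk(&lim, NUMA_NO_NODE);
}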
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 065db86d6021..37c24ffea65c 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -82,7 +82,6 @@ comment "SCSI support type (disk, tape, CD-ROM)" config BLK_DEV_SD tristate "SCSI disk support" depends on SCSI - select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY help If you want to use SCSI hard disks, Fibre Channel disks, Serial ATA (SATA) or Parallel ATA (PATA) hard disks, diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 60688f18fac6..c708e1059638 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -1057,15 +1057,15 @@ static umode_t iscsi_sw_tcp_attr_is_visible(int param_type, int param) return 0; } -static int iscsi_sw_tcp_slave_configure(struct scsi_device *sdev) +static int iscsi_sw_tcp_device_configure(struct scsi_device *sdev, + struct queue_limits *lim) { struct iscsi_sw_tcp_host *tcp_sw_host = iscsi_host_priv(sdev->host); struct iscsi_session *session = tcp_sw_host->session; struct iscsi_conn *conn = session->leadconn; if (conn->datadgst_en) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, - sdev->request_queue); + lim->features |= BLK_FEAT_STABLE_WRITES; return 0; } @@ -1083,7 +1083,7 @@ static const struct scsi_host_template iscsi_sw_tcp_sht = { .eh_device_reset_handler= iscsi_eh_device_reset, .eh_target_reset_handler = iscsi_eh_recover_target, .dma_boundary = PAGE_SIZE - 1, - .slave_configure = iscsi_sw_tcp_slave_configure, + .device_configure = iscsi_sw_tcp_device_configure, .proc_name = "iscsi_tcp", .this_id = -1, .track_queue_depth = 1, diff --git a/drivers/scsi/libsas/sas_internal.h b/drivers/scsi/libsas/sas_internal.h index 85948963fb97..03d6ec1eb970 100644 --- a/drivers/scsi/libsas/sas_internal.h +++ b/drivers/scsi/libsas/sas_internal.h @@ -145,6 +145,20 @@ static inline void sas_fail_probe(struct domain_device *dev, const char *func, i func, dev->parent ? 
"exp-attached" : "direct-attached", SAS_ADDR(dev->sas_addr), err); + + /* + * If the device probe failed, the expander phy attached address + * needs to be reset so that the phy will not be treated as flutter + * in the next revalidation + */ + if (dev->parent && !dev_is_expander(dev->dev_type)) { + struct sas_phy *phy = dev->phy; + struct domain_device *parent = dev->parent; + struct ex_phy *ex_phy = &parent->ex_dev.ex_phy[phy->number]; + + memset(ex_phy->attached_sas_addr, 0, SAS_ADDR_SIZE); + } + sas_unregister_dev(dev->port, dev); } diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index 5297cacc8beb..0cef5d089f34 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -1363,6 +1363,16 @@ lpfc_nvmet_ls_abort(struct nvmet_fc_target_port *targetport, atomic_inc(&lpfc_nvmet->xmt_ls_abort); } +static int +lpfc_nvmet_host_traddr(void *hosthandle, u64 *wwnn, u64 *wwpn) +{ + struct lpfc_nodelist *ndlp = hosthandle; + + *wwnn = wwn_to_u64(ndlp->nlp_nodename.u.wwn); + *wwpn = wwn_to_u64(ndlp->nlp_portname.u.wwn); + return 0; +} + static void lpfc_nvmet_host_release(void *hosthandle) { @@ -1413,6 +1423,7 @@ static struct nvmet_fc_target_template lpfc_tgttemplate = { .ls_req = lpfc_nvmet_ls_req, .ls_abort = lpfc_nvmet_ls_abort, .host_release = lpfc_nvmet_host_release, + .host_traddr = lpfc_nvmet_host_traddr, .max_hw_queues = 1, .max_sgl_segments = LPFC_NVMET_DEFAULT_SEGS, diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 88acefbf9aea..6c79c350a4d5 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -1981,8 +1981,6 @@ megasas_set_nvme_device_properties(struct scsi_device *sdev, lim->max_hw_sectors = max_io_size / 512; lim->virt_boundary_mask = mr_nvme_pg_size - 1; - - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, sdev->request_queue); } /* diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 870ec2cb4af4..97c2472cd434 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -2680,12 +2680,6 @@ scsih_device_configure(struct scsi_device *sdev, struct queue_limits *lim) pcie_device_put(pcie_device); spin_unlock_irqrestore(&ioc->pcie_device_lock, flags); mpt3sas_scsih_change_queue_depth(sdev, qdepth); - /* Enable QUEUE_FLAG_NOMERGES flag, so that IOs won't be - ** merged and can eliminate holes created during merging - ** operation. 
- **/ - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, - sdev->request_queue); lim->virt_boundary_mask = ioc->page_size - 1; return 0; } diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index acf0592d63da..a9d8a9c62663 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -69,6 +69,8 @@ static const char *sdebug_version_date = "20210520"; /* Additional Sense Code (ASC) */ #define NO_ADDITIONAL_SENSE 0x0 +#define OVERLAP_ATOMIC_COMMAND_ASC 0x0 +#define OVERLAP_ATOMIC_COMMAND_ASCQ 0x23 #define LOGICAL_UNIT_NOT_READY 0x4 #define LOGICAL_UNIT_COMMUNICATION_FAILURE 0x8 #define UNRECOVERED_READ_ERR 0x11 @@ -103,6 +105,7 @@ static const char *sdebug_version_date = "20210520"; #define READ_BOUNDARY_ASCQ 0x7 #define ATTEMPT_ACCESS_GAP 0x9 #define INSUFF_ZONE_ASCQ 0xe +/* see drivers/scsi/sense_codes.h */ /* Additional Sense Code Qualifier (ASCQ) */ #define ACK_NAK_TO 0x3 @@ -152,6 +155,12 @@ static const char *sdebug_version_date = "20210520"; #define DEF_VIRTUAL_GB 0 #define DEF_VPD_USE_HOSTNO 1 #define DEF_WRITESAME_LENGTH 0xFFFF +#define DEF_ATOMIC_WR 0 +#define DEF_ATOMIC_WR_MAX_LENGTH 8192 +#define DEF_ATOMIC_WR_ALIGN 2 +#define DEF_ATOMIC_WR_GRAN 2 +#define DEF_ATOMIC_WR_MAX_LENGTH_BNDRY (DEF_ATOMIC_WR_MAX_LENGTH) +#define DEF_ATOMIC_WR_MAX_BNDRY 128 #define DEF_STRICT 0 #define DEF_STATISTICS false #define DEF_SUBMIT_QUEUES 1 @@ -374,7 +383,9 @@ struct sdebug_host_info { /* There is an xarray of pointers to this struct's objects, one per host */ struct sdeb_store_info { - rwlock_t macc_lck; /* for atomic media access on this store */ + rwlock_t macc_data_lck; /* for media data access on this store */ + rwlock_t macc_meta_lck; /* for atomic media meta access on this store */ + rwlock_t macc_sector_lck; /* per-sector media data access on this store */ u8 *storep; /* user data storage (ram) */ struct t10_pi_tuple *dif_storep; /* protection info */ void *map_storep; /* provisioning map */ @@ -398,12 +409,20 @@ struct sdebug_defer { enum sdeb_defer_type defer_t; }; +struct sdebug_device_access_info { + bool atomic_write; + u64 lba; + u32 num; + struct scsi_cmnd *self; +}; + struct sdebug_queued_cmd { /* corresponding bit set in in_use_bm[] in owning struct sdebug_queue * instance indicates this slot is in use. 
*/ struct sdebug_defer sd_dp; struct scsi_cmnd *scmd; + struct sdebug_device_access_info *i; }; struct sdebug_scsi_cmd { @@ -463,7 +482,8 @@ enum sdeb_opcode_index { SDEB_I_PRE_FETCH = 29, /* 10, 16 */ SDEB_I_ZONE_OUT = 30, /* 0x94+SA; includes no data xfer */ SDEB_I_ZONE_IN = 31, /* 0x95+SA; all have data-in */ - SDEB_I_LAST_ELEM_P1 = 32, /* keep this last (previous + 1) */ + SDEB_I_ATOMIC_WRITE_16 = 32, + SDEB_I_LAST_ELEM_P1 = 33, /* keep this last (previous + 1) */ }; @@ -497,7 +517,8 @@ static const unsigned char opcode_ind_arr[256] = { 0, 0, 0, SDEB_I_VERIFY, SDEB_I_PRE_FETCH, SDEB_I_SYNC_CACHE, 0, SDEB_I_WRITE_SAME, SDEB_I_ZONE_OUT, SDEB_I_ZONE_IN, 0, 0, - 0, 0, 0, 0, 0, 0, SDEB_I_SERV_ACT_IN_16, SDEB_I_SERV_ACT_OUT_16, + 0, 0, 0, 0, + SDEB_I_ATOMIC_WRITE_16, 0, SDEB_I_SERV_ACT_IN_16, SDEB_I_SERV_ACT_OUT_16, /* 0xa0; 0xa0->0xbf: 12 byte cdbs */ SDEB_I_REPORT_LUNS, SDEB_I_ATA_PT, 0, SDEB_I_MAINT_IN, SDEB_I_MAINT_OUT, 0, 0, 0, @@ -547,6 +568,7 @@ static int resp_write_buffer(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_sync_cache(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_pre_fetch(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_report_zones(struct scsi_cmnd *, struct sdebug_dev_info *); +static int resp_atomic_write(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_open_zone(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_close_zone(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_finish_zone(struct scsi_cmnd *, struct sdebug_dev_info *); @@ -788,6 +810,11 @@ static const struct opcode_info_t opcode_info_arr[SDEB_I_LAST_ELEM_P1 + 1] = { resp_report_zones, zone_in_iarr, /* ZONE_IN(16), REPORT ZONES) */ {16, 0x0 /* SA */, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xbf, 0xc7} }, +/* 31 */ + {0, 0x0, 0x0, F_D_OUT | FF_MEDIA_IO, + resp_atomic_write, NULL, /* ATOMIC WRITE 16 */ + {16, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} }, /* sentinel */ {0xff, 0, 0, 0, NULL, NULL, /* terminating element */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }, @@ -835,6 +862,13 @@ static unsigned int sdebug_unmap_granularity = DEF_UNMAP_GRANULARITY; static unsigned int sdebug_unmap_max_blocks = DEF_UNMAP_MAX_BLOCKS; static unsigned int sdebug_unmap_max_desc = DEF_UNMAP_MAX_DESC; static unsigned int sdebug_write_same_length = DEF_WRITESAME_LENGTH; +static unsigned int sdebug_atomic_wr = DEF_ATOMIC_WR; +static unsigned int sdebug_atomic_wr_max_length = DEF_ATOMIC_WR_MAX_LENGTH; +static unsigned int sdebug_atomic_wr_align = DEF_ATOMIC_WR_ALIGN; +static unsigned int sdebug_atomic_wr_gran = DEF_ATOMIC_WR_GRAN; +static unsigned int sdebug_atomic_wr_max_length_bndry = + DEF_ATOMIC_WR_MAX_LENGTH_BNDRY; +static unsigned int sdebug_atomic_wr_max_bndry = DEF_ATOMIC_WR_MAX_BNDRY; static int sdebug_uuid_ctl = DEF_UUID_CTL; static bool sdebug_random = DEF_RANDOM; static bool sdebug_per_host_store = DEF_PER_HOST_STORE; @@ -926,6 +960,7 @@ static const int device_qfull_result = static const int condition_met_result = SAM_STAT_CONDITION_MET; static struct dentry *sdebug_debugfs_root; +static ASYNC_DOMAIN_EXCLUSIVE(sdebug_async_domain); static void sdebug_err_free(struct rcu_head *head) { @@ -1148,6 +1183,8 @@ static int sdebug_target_alloc(struct scsi_target *starget) if (!targetip) return -ENOMEM; + async_synchronize_full_domain(&sdebug_async_domain); + targetip->debugfs_entry = debugfs_create_dir(dev_name(&starget->dev), sdebug_debugfs_root); @@ 
-1174,7 +1211,8 @@ static void sdebug_target_destroy(struct scsi_target *starget) targetip = (struct sdebug_target_info *)starget->hostdata; if (targetip) { starget->hostdata = NULL; - async_schedule(sdebug_tartget_cleanup_async, targetip); + async_schedule_domain(sdebug_tartget_cleanup_async, targetip, + &sdebug_async_domain); } } @@ -1188,6 +1226,11 @@ static inline bool scsi_debug_lbp(void) (sdebug_lbpu || sdebug_lbpws || sdebug_lbpws10); } +static inline bool scsi_debug_atomic_write(void) +{ + return sdebug_fake_rw == 0 && sdebug_atomic_wr; +} + static void *lba2fake_store(struct sdeb_store_info *sip, unsigned long long lba) { @@ -1815,6 +1858,14 @@ static int inquiry_vpd_b0(unsigned char *arr) /* Maximum WRITE SAME Length */ put_unaligned_be64(sdebug_write_same_length, &arr[32]); + if (sdebug_atomic_wr) { + put_unaligned_be32(sdebug_atomic_wr_max_length, &arr[40]); + put_unaligned_be32(sdebug_atomic_wr_align, &arr[44]); + put_unaligned_be32(sdebug_atomic_wr_gran, &arr[48]); + put_unaligned_be32(sdebug_atomic_wr_max_length_bndry, &arr[52]); + put_unaligned_be32(sdebug_atomic_wr_max_bndry, &arr[56]); + } + return 0x3c; /* Mandatory page length for Logical Block Provisioning */ } @@ -3377,16 +3428,238 @@ static inline struct sdeb_store_info *devip2sip(struct sdebug_dev_info *devip, return xa_load(per_store_ap, devip->sdbg_host->si_idx); } +static inline void +sdeb_read_lock(rwlock_t *lock) +{ + if (sdebug_no_rwlock) + __acquire(lock); + else + read_lock(lock); +} + +static inline void +sdeb_read_unlock(rwlock_t *lock) +{ + if (sdebug_no_rwlock) + __release(lock); + else + read_unlock(lock); +} + +static inline void +sdeb_write_lock(rwlock_t *lock) +{ + if (sdebug_no_rwlock) + __acquire(lock); + else + write_lock(lock); +} + +static inline void +sdeb_write_unlock(rwlock_t *lock) +{ + if (sdebug_no_rwlock) + __release(lock); + else + write_unlock(lock); +} + +static inline void +sdeb_data_read_lock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_read_lock(&sip->macc_data_lck); +} + +static inline void +sdeb_data_read_unlock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_read_unlock(&sip->macc_data_lck); +} + +static inline void +sdeb_data_write_lock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_write_lock(&sip->macc_data_lck); +} + +static inline void +sdeb_data_write_unlock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_write_unlock(&sip->macc_data_lck); +} + +static inline void +sdeb_data_sector_read_lock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_read_lock(&sip->macc_sector_lck); +} + +static inline void +sdeb_data_sector_read_unlock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_read_unlock(&sip->macc_sector_lck); +} + +static inline void +sdeb_data_sector_write_lock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_write_lock(&sip->macc_sector_lck); +} + +static inline void +sdeb_data_sector_write_unlock(struct sdeb_store_info *sip) +{ + BUG_ON(!sip); + + sdeb_write_unlock(&sip->macc_sector_lck); +} + +/* + * Atomic locking: + * We simplify the atomic model to allow only 1x atomic write and many non- + * atomic reads or writes for all LBAs. + + * A RW lock has a similar bahaviour: + * Only 1x writer and many readers. + + * So use a RW lock for per-device read and write locking: + * An atomic access grabs the lock as a writer and non-atomic grabs the lock + * as a reader. 
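The comment above describes the simplified atomic model: a single atomic write excludes all other accessors, while any number of non-atomic reads and writes may run concurrently at the device level (per-sector exclusion is handled separately through macc_sector_lck). A minimal sketch of how that maps onto the new rwlock_t fields, assuming only the helpers and fields introduced in this hunk:

    /* Sketch: device-level exclusion in the scsi_debug atomic model. */
    static void example_store_access(struct sdeb_store_info *sip, bool atomic)
    {
            if (atomic)
                    write_lock(&sip->macc_data_lck);   /* sole owner: atomic write */
            else
                    read_lock(&sip->macc_data_lck);    /* shared: non-atomic I/O */

            /* ... copy sectors, taking macc_sector_lck per sector ... */

            if (atomic)
                    write_unlock(&sip->macc_data_lck);
            else
                    read_unlock(&sip->macc_data_lck);
    }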
+ */ + +static inline void +sdeb_data_lock(struct sdeb_store_info *sip, bool atomic) +{ + if (atomic) + sdeb_data_write_lock(sip); + else + sdeb_data_read_lock(sip); +} + +static inline void +sdeb_data_unlock(struct sdeb_store_info *sip, bool atomic) +{ + if (atomic) + sdeb_data_write_unlock(sip); + else + sdeb_data_read_unlock(sip); +} + +/* Allow many reads but only 1x write per sector */ +static inline void +sdeb_data_sector_lock(struct sdeb_store_info *sip, bool do_write) +{ + if (do_write) + sdeb_data_sector_write_lock(sip); + else + sdeb_data_sector_read_lock(sip); +} + +static inline void +sdeb_data_sector_unlock(struct sdeb_store_info *sip, bool do_write) +{ + if (do_write) + sdeb_data_sector_write_unlock(sip); + else + sdeb_data_sector_read_unlock(sip); +} + +static inline void +sdeb_meta_read_lock(struct sdeb_store_info *sip) +{ + if (sdebug_no_rwlock) { + if (sip) + __acquire(&sip->macc_meta_lck); + else + __acquire(&sdeb_fake_rw_lck); + } else { + if (sip) + read_lock(&sip->macc_meta_lck); + else + read_lock(&sdeb_fake_rw_lck); + } +} + +static inline void +sdeb_meta_read_unlock(struct sdeb_store_info *sip) +{ + if (sdebug_no_rwlock) { + if (sip) + __release(&sip->macc_meta_lck); + else + __release(&sdeb_fake_rw_lck); + } else { + if (sip) + read_unlock(&sip->macc_meta_lck); + else + read_unlock(&sdeb_fake_rw_lck); + } +} + +static inline void +sdeb_meta_write_lock(struct sdeb_store_info *sip) +{ + if (sdebug_no_rwlock) { + if (sip) + __acquire(&sip->macc_meta_lck); + else + __acquire(&sdeb_fake_rw_lck); + } else { + if (sip) + write_lock(&sip->macc_meta_lck); + else + write_lock(&sdeb_fake_rw_lck); + } +} + +static inline void +sdeb_meta_write_unlock(struct sdeb_store_info *sip) +{ + if (sdebug_no_rwlock) { + if (sip) + __release(&sip->macc_meta_lck); + else + __release(&sdeb_fake_rw_lck); + } else { + if (sip) + write_unlock(&sip->macc_meta_lck); + else + write_unlock(&sdeb_fake_rw_lck); + } +} + /* Returns number of bytes copied or -1 if error. */ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp, - u32 sg_skip, u64 lba, u32 num, bool do_write, - u8 group_number) + u32 sg_skip, u64 lba, u32 num, u8 group_number, + bool do_write, bool atomic) { int ret; - u64 block, rest = 0; + u64 block; enum dma_data_direction dir; struct scsi_data_buffer *sdb = &scp->sdb; u8 *fsp; + int i; + + /* + * Even though reads are inherently atomic (in this driver), we expect + * the atomic flag only for writes. 
+ */ + if (!do_write && atomic) + return -1; if (do_write) { dir = DMA_TO_DEVICE; @@ -3406,21 +3679,26 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp, fsp = sip->storep; block = do_div(lba, sdebug_store_sectors); - if (block + num > sdebug_store_sectors) - rest = block + num - sdebug_store_sectors; - ret = sg_copy_buffer(sdb->table.sgl, sdb->table.nents, + /* Only allow 1x atomic write or multiple non-atomic writes at any given time */ + sdeb_data_lock(sip, atomic); + for (i = 0; i < num; i++) { + /* We shouldn't need to lock for atomic writes, but do it anyway */ + sdeb_data_sector_lock(sip, do_write); + ret = sg_copy_buffer(sdb->table.sgl, sdb->table.nents, fsp + (block * sdebug_sector_size), - (num - rest) * sdebug_sector_size, sg_skip, do_write); - if (ret != (num - rest) * sdebug_sector_size) - return ret; - - if (rest) { - ret += sg_copy_buffer(sdb->table.sgl, sdb->table.nents, - fsp, rest * sdebug_sector_size, - sg_skip + ((num - rest) * sdebug_sector_size), - do_write); + sdebug_sector_size, sg_skip, do_write); + sdeb_data_sector_unlock(sip, do_write); + if (ret != sdebug_sector_size) { + ret += (i * sdebug_sector_size); + break; + } + sg_skip += sdebug_sector_size; + if (++block >= sdebug_store_sectors) + block = 0; } + ret = num * sdebug_sector_size; + sdeb_data_unlock(sip, atomic); return ret; } @@ -3596,70 +3874,6 @@ static int prot_verify_read(struct scsi_cmnd *scp, sector_t start_sec, return ret; } -static inline void -sdeb_read_lock(struct sdeb_store_info *sip) -{ - if (sdebug_no_rwlock) { - if (sip) - __acquire(&sip->macc_lck); - else - __acquire(&sdeb_fake_rw_lck); - } else { - if (sip) - read_lock(&sip->macc_lck); - else - read_lock(&sdeb_fake_rw_lck); - } -} - -static inline void -sdeb_read_unlock(struct sdeb_store_info *sip) -{ - if (sdebug_no_rwlock) { - if (sip) - __release(&sip->macc_lck); - else - __release(&sdeb_fake_rw_lck); - } else { - if (sip) - read_unlock(&sip->macc_lck); - else - read_unlock(&sdeb_fake_rw_lck); - } -} - -static inline void -sdeb_write_lock(struct sdeb_store_info *sip) -{ - if (sdebug_no_rwlock) { - if (sip) - __acquire(&sip->macc_lck); - else - __acquire(&sdeb_fake_rw_lck); - } else { - if (sip) - write_lock(&sip->macc_lck); - else - write_lock(&sdeb_fake_rw_lck); - } -} - -static inline void -sdeb_write_unlock(struct sdeb_store_info *sip) -{ - if (sdebug_no_rwlock) { - if (sip) - __release(&sip->macc_lck); - else - __release(&sdeb_fake_rw_lck); - } else { - if (sip) - write_unlock(&sip->macc_lck); - else - write_unlock(&sdeb_fake_rw_lck); - } -} - static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) { bool check_prot; @@ -3669,6 +3883,7 @@ static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) u64 lba; struct sdeb_store_info *sip = devip2sip(devip, true); u8 *cmd = scp->cmnd; + bool meta_data_locked = false; switch (cmd[0]) { case READ_16: @@ -3727,6 +3942,10 @@ static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) atomic_set(&sdeb_inject_pending, 0); } + /* + * When checking device access params, for reads we only check data + * versus what is set at init time, so no need to lock. 
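The reworked do_device_access() now takes the group number plus two booleans (do_write, atomic) instead of a single write flag. Condensed from the call sites further down in this file, the three combinations in use look like this (sketch only, argument names as in the patch):

    /* READ path, resp_read_dt0():          shared device-level lock    */
    ret = do_device_access(sip, scp, 0, lba, num, 0,     false, false);

    /* WRITE path, resp_write_dt0():        shared device-level lock    */
    ret = do_device_access(sip, scp, 0, lba, num, group, true,  false);

    /* WRITE ATOMIC(16), resp_atomic_write(): exclusive device-level lock */
    ret = do_device_access(sip, scp, 0, lba, len, 0,     true,  true);

resp_write_scat() also passes atomic=true, which (as its comment notes) keeps scattered writes as close as possible to the pre-atomic behaviour.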
+ */ ret = check_device_access_params(scp, lba, num, false); if (ret) return ret; @@ -3746,29 +3965,33 @@ static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) return check_condition_result; } - sdeb_read_lock(sip); + if (sdebug_dev_is_zoned(devip) || + (sdebug_dix && scsi_prot_sg_count(scp))) { + sdeb_meta_read_lock(sip); + meta_data_locked = true; + } /* DIX + T10 DIF */ if (unlikely(sdebug_dix && scsi_prot_sg_count(scp))) { switch (prot_verify_read(scp, lba, num, ei_lba)) { case 1: /* Guard tag error */ if (cmd[1] >> 5 != 3) { /* RDPROTECT != 3 */ - sdeb_read_unlock(sip); + sdeb_meta_read_unlock(sip); mk_sense_buffer(scp, ABORTED_COMMAND, 0x10, 1); return check_condition_result; } else if (scp->prot_flags & SCSI_PROT_GUARD_CHECK) { - sdeb_read_unlock(sip); + sdeb_meta_read_unlock(sip); mk_sense_buffer(scp, ILLEGAL_REQUEST, 0x10, 1); return illegal_condition_result; } break; case 3: /* Reference tag error */ if (cmd[1] >> 5 != 3) { /* RDPROTECT != 3 */ - sdeb_read_unlock(sip); + sdeb_meta_read_unlock(sip); mk_sense_buffer(scp, ABORTED_COMMAND, 0x10, 3); return check_condition_result; } else if (scp->prot_flags & SCSI_PROT_REF_CHECK) { - sdeb_read_unlock(sip); + sdeb_meta_read_unlock(sip); mk_sense_buffer(scp, ILLEGAL_REQUEST, 0x10, 3); return illegal_condition_result; } @@ -3776,8 +3999,9 @@ static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) } } - ret = do_device_access(sip, scp, 0, lba, num, false, 0); - sdeb_read_unlock(sip); + ret = do_device_access(sip, scp, 0, lba, num, 0, false, false); + if (meta_data_locked) + sdeb_meta_read_unlock(sip); if (unlikely(ret == -1)) return DID_ERROR << 16; @@ -3967,6 +4191,7 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) u64 lba; struct sdeb_store_info *sip = devip2sip(devip, true); u8 *cmd = scp->cmnd; + bool meta_data_locked = false; switch (cmd[0]) { case WRITE_16: @@ -4025,10 +4250,17 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) "to DIF device\n"); } - sdeb_write_lock(sip); + if (sdebug_dev_is_zoned(devip) || + (sdebug_dix && scsi_prot_sg_count(scp)) || + scsi_debug_lbp()) { + sdeb_meta_write_lock(sip); + meta_data_locked = true; + } + ret = check_device_access_params(scp, lba, num, true); if (ret) { - sdeb_write_unlock(sip); + if (meta_data_locked) + sdeb_meta_write_unlock(sip); return ret; } @@ -4037,22 +4269,22 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) switch (prot_verify_write(scp, lba, num, ei_lba)) { case 1: /* Guard tag error */ if (scp->prot_flags & SCSI_PROT_GUARD_CHECK) { - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); mk_sense_buffer(scp, ILLEGAL_REQUEST, 0x10, 1); return illegal_condition_result; } else if (scp->cmnd[1] >> 5 != 3) { /* WRPROTECT != 3 */ - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); mk_sense_buffer(scp, ABORTED_COMMAND, 0x10, 1); return check_condition_result; } break; case 3: /* Reference tag error */ if (scp->prot_flags & SCSI_PROT_REF_CHECK) { - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); mk_sense_buffer(scp, ILLEGAL_REQUEST, 0x10, 3); return illegal_condition_result; } else if (scp->cmnd[1] >> 5 != 3) { /* WRPROTECT != 3 */ - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); mk_sense_buffer(scp, ABORTED_COMMAND, 0x10, 3); return check_condition_result; } @@ -4060,13 +4292,16 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) } } - ret = do_device_access(sip, scp, 0, lba, num, true, 
group); + ret = do_device_access(sip, scp, 0, lba, num, group, true, false); if (unlikely(scsi_debug_lbp())) map_region(sip, lba, num); + /* If ZBC zone then bump its write pointer */ if (sdebug_dev_is_zoned(devip)) zbc_inc_wp(devip, lba, num); - sdeb_write_unlock(sip); + if (meta_data_locked) + sdeb_meta_write_unlock(sip); + if (unlikely(-1 == ret)) return DID_ERROR << 16; else if (unlikely(sdebug_verbose && @@ -4176,7 +4411,8 @@ static int resp_write_scat(struct scsi_cmnd *scp, goto err_out; } - sdeb_write_lock(sip); + /* Just keep it simple and always lock for now */ + sdeb_meta_write_lock(sip); sg_off = lbdof_blen; /* Spec says Buffer xfer Length field in number of LBs in dout */ cum_lb = 0; @@ -4219,7 +4455,11 @@ static int resp_write_scat(struct scsi_cmnd *scp, } } - ret = do_device_access(sip, scp, sg_off, lba, num, true, group); + /* + * Write ranges atomically to keep as close to pre-atomic + * writes behaviour as possible. + */ + ret = do_device_access(sip, scp, sg_off, lba, num, group, true, true); /* If ZBC zone then bump its write pointer */ if (sdebug_dev_is_zoned(devip)) zbc_inc_wp(devip, lba, num); @@ -4258,7 +4498,7 @@ static int resp_write_scat(struct scsi_cmnd *scp, } ret = 0; err_out_unlock: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); err_out: kfree(lrdp); return ret; @@ -4277,14 +4517,16 @@ static int resp_write_same(struct scsi_cmnd *scp, u64 lba, u32 num, scp->device->hostdata, true); u8 *fs1p; u8 *fsp; + bool meta_data_locked = false; - sdeb_write_lock(sip); + if (sdebug_dev_is_zoned(devip) || scsi_debug_lbp()) { + sdeb_meta_write_lock(sip); + meta_data_locked = true; + } ret = check_device_access_params(scp, lba, num, true); - if (ret) { - sdeb_write_unlock(sip); - return ret; - } + if (ret) + goto out; if (unmap && scsi_debug_lbp()) { unmap_region(sip, lba, num); @@ -4295,6 +4537,7 @@ static int resp_write_same(struct scsi_cmnd *scp, u64 lba, u32 num, /* if ndob then zero 1 logical block, else fetch 1 logical block */ fsp = sip->storep; fs1p = fsp + (block * lb_size); + sdeb_data_write_lock(sip); if (ndob) { memset(fs1p, 0, lb_size); ret = 0; @@ -4302,8 +4545,8 @@ static int resp_write_same(struct scsi_cmnd *scp, u64 lba, u32 num, ret = fetch_to_dev_buffer(scp, fs1p, lb_size); if (-1 == ret) { - sdeb_write_unlock(sip); - return DID_ERROR << 16; + ret = DID_ERROR << 16; + goto out; } else if (sdebug_verbose && !ndob && (ret < lb_size)) sdev_printk(KERN_INFO, scp->device, "%s: %s: lb size=%u, IO sent=%d bytes\n", @@ -4320,10 +4563,12 @@ static int resp_write_same(struct scsi_cmnd *scp, u64 lba, u32 num, /* If ZBC zone then bump its write pointer */ if (sdebug_dev_is_zoned(devip)) zbc_inc_wp(devip, lba, num); + sdeb_data_write_unlock(sip); + ret = 0; out: - sdeb_write_unlock(sip); - - return 0; + if (meta_data_locked) + sdeb_meta_write_unlock(sip); + return ret; } static int resp_write_same_10(struct scsi_cmnd *scp, @@ -4466,25 +4711,30 @@ static int resp_comp_write(struct scsi_cmnd *scp, return check_condition_result; } - sdeb_write_lock(sip); - ret = do_dout_fetch(scp, dnum, arr); if (ret == -1) { retval = DID_ERROR << 16; - goto cleanup; + goto cleanup_free; } else if (sdebug_verbose && (ret < (dnum * lb_size))) sdev_printk(KERN_INFO, scp->device, "%s: compare_write: cdb " "indicated=%u, IO sent=%d bytes\n", my_name, dnum * lb_size, ret); + + sdeb_data_write_lock(sip); + sdeb_meta_write_lock(sip); if (!comp_write_worker(sip, lba, num, arr, false)) { mk_sense_buffer(scp, MISCOMPARE, MISCOMPARE_VERIFY_ASC, 0); retval = check_condition_result; - goto 
cleanup; + goto cleanup_unlock; } + + /* Cover sip->map_storep (which map_region()) sets with data lock */ if (scsi_debug_lbp()) map_region(sip, lba, num); -cleanup: - sdeb_write_unlock(sip); +cleanup_unlock: + sdeb_meta_write_unlock(sip); + sdeb_data_write_unlock(sip); +cleanup_free: kfree(arr); return retval; } @@ -4528,7 +4778,7 @@ static int resp_unmap(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) desc = (void *)&buf[8]; - sdeb_write_lock(sip); + sdeb_meta_write_lock(sip); for (i = 0 ; i < descriptors ; i++) { unsigned long long lba = get_unaligned_be64(&desc[i].lba); @@ -4544,7 +4794,7 @@ static int resp_unmap(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) ret = 0; out: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); kfree(buf); return ret; @@ -4702,12 +4952,13 @@ static int resp_pre_fetch(struct scsi_cmnd *scp, rest = block + nblks - sdebug_store_sectors; /* Try to bring the PRE-FETCH range into CPU's cache */ - sdeb_read_lock(sip); + sdeb_data_read_lock(sip); prefetch_range(fsp + (sdebug_sector_size * block), (nblks - rest) * sdebug_sector_size); if (rest) prefetch_range(fsp, rest * sdebug_sector_size); - sdeb_read_unlock(sip); + + sdeb_data_read_unlock(sip); fini: if (cmd[1] & 0x2) res = SDEG_RES_IMMED_MASK; @@ -4866,7 +5117,7 @@ static int resp_verify(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) return check_condition_result; } /* Not changing store, so only need read access */ - sdeb_read_lock(sip); + sdeb_data_read_lock(sip); ret = do_dout_fetch(scp, a_num, arr); if (ret == -1) { @@ -4888,7 +5139,7 @@ static int resp_verify(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) goto cleanup; } cleanup: - sdeb_read_unlock(sip); + sdeb_data_read_unlock(sip); kfree(arr); return ret; } @@ -4934,7 +5185,7 @@ static int resp_report_zones(struct scsi_cmnd *scp, return check_condition_result; } - sdeb_read_lock(sip); + sdeb_meta_read_lock(sip); desc = arr + 64; for (lba = zs_lba; lba < sdebug_capacity; @@ -5032,11 +5283,70 @@ static int resp_report_zones(struct scsi_cmnd *scp, ret = fill_from_dev_buffer(scp, arr, min_t(u32, alloc_len, rep_len)); fini: - sdeb_read_unlock(sip); + sdeb_meta_read_unlock(sip); kfree(arr); return ret; } +static int resp_atomic_write(struct scsi_cmnd *scp, + struct sdebug_dev_info *devip) +{ + struct sdeb_store_info *sip; + u8 *cmd = scp->cmnd; + u16 boundary, len; + u64 lba, lba_tmp; + int ret; + + if (!scsi_debug_atomic_write()) { + mk_sense_invalid_opcode(scp); + return check_condition_result; + } + + sip = devip2sip(devip, true); + + lba = get_unaligned_be64(cmd + 2); + boundary = get_unaligned_be16(cmd + 10); + len = get_unaligned_be16(cmd + 12); + + lba_tmp = lba; + if (sdebug_atomic_wr_align && + do_div(lba_tmp, sdebug_atomic_wr_align)) { + /* Does not meet alignment requirement */ + mk_sense_buffer(scp, ILLEGAL_REQUEST, INVALID_FIELD_IN_CDB, 0); + return check_condition_result; + } + + if (sdebug_atomic_wr_gran && len % sdebug_atomic_wr_gran) { + /* Does not meet alignment requirement */ + mk_sense_buffer(scp, ILLEGAL_REQUEST, INVALID_FIELD_IN_CDB, 0); + return check_condition_result; + } + + if (boundary > 0) { + if (boundary > sdebug_atomic_wr_max_bndry) { + mk_sense_invalid_fld(scp, SDEB_IN_CDB, 12, -1); + return check_condition_result; + } + + if (len > sdebug_atomic_wr_max_length_bndry) { + mk_sense_invalid_fld(scp, SDEB_IN_CDB, 12, -1); + return check_condition_result; + } + } else { + if (len > sdebug_atomic_wr_max_length) { + mk_sense_invalid_fld(scp, SDEB_IN_CDB, 12, -1); + return check_condition_result; + } + } 
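For reference, the WRITE ATOMIC(16) CDB decoded above (and built by sd_setup_atomic_cmnd() in sd.c later in this patch) uses fixed offsets; the summary below is a sketch drawn from the parsing code, with the SBC field names for bytes 14 and 15 assumed:

    /*
     * WRITE ATOMIC(16) CDB, opcode 0x9c (see the slot added to
     * opcode_ind_arr above):
     *   byte   0       operation code
     *   byte   1       flags (sd.c writes protect | fua here)
     *   bytes  2..9    logical block address (big endian)
     *   bytes 10..11   atomic boundary (big endian)
     *   bytes 12..13   transfer length in logical blocks (big endian)
     *   byte  14       group number (sd.c writes 0)
     *   byte  15       control (sd.c writes 0)
     */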
+ + ret = do_device_access(sip, scp, 0, lba, len, 0, true, true); + if (unlikely(ret == -1)) + return DID_ERROR << 16; + if (unlikely(ret != len * sdebug_sector_size)) + return DID_ERROR << 16; + return 0; +} + /* Logic transplanted from tcmu-runner, file_zbc.c */ static void zbc_open_all(struct sdebug_dev_info *devip) { @@ -5063,8 +5373,7 @@ static int resp_open_zone(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) mk_sense_invalid_opcode(scp); return check_condition_result; } - - sdeb_write_lock(sip); + sdeb_meta_write_lock(sip); if (all) { /* Check if all closed zones can be open */ @@ -5113,7 +5422,7 @@ static int resp_open_zone(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) zbc_open_zone(devip, zsp, true); fini: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); return res; } @@ -5140,7 +5449,7 @@ static int resp_close_zone(struct scsi_cmnd *scp, return check_condition_result; } - sdeb_write_lock(sip); + sdeb_meta_write_lock(sip); if (all) { zbc_close_all(devip); @@ -5169,7 +5478,7 @@ static int resp_close_zone(struct scsi_cmnd *scp, zbc_close_zone(devip, zsp); fini: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); return res; } @@ -5212,7 +5521,7 @@ static int resp_finish_zone(struct scsi_cmnd *scp, return check_condition_result; } - sdeb_write_lock(sip); + sdeb_meta_write_lock(sip); if (all) { zbc_finish_all(devip); @@ -5241,7 +5550,7 @@ static int resp_finish_zone(struct scsi_cmnd *scp, zbc_finish_zone(devip, zsp, true); fini: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); return res; } @@ -5292,7 +5601,7 @@ static int resp_rwp_zone(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) return check_condition_result; } - sdeb_write_lock(sip); + sdeb_meta_write_lock(sip); if (all) { zbc_rwp_all(devip); @@ -5320,7 +5629,7 @@ static int resp_rwp_zone(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) zbc_rwp_zone(devip, zsp); fini: - sdeb_write_unlock(sip); + sdeb_meta_write_unlock(sip); return res; } @@ -6284,6 +6593,7 @@ module_param_named(lbprz, sdebug_lbprz, int, S_IRUGO); module_param_named(lbpu, sdebug_lbpu, int, S_IRUGO); module_param_named(lbpws, sdebug_lbpws, int, S_IRUGO); module_param_named(lbpws10, sdebug_lbpws10, int, S_IRUGO); +module_param_named(atomic_wr, sdebug_atomic_wr, int, S_IRUGO); module_param_named(lowest_aligned, sdebug_lowest_aligned, int, S_IRUGO); module_param_named(lun_format, sdebug_lun_am_i, int, S_IRUGO | S_IWUSR); module_param_named(max_luns, sdebug_max_luns, int, S_IRUGO | S_IWUSR); @@ -6318,6 +6628,11 @@ module_param_named(unmap_alignment, sdebug_unmap_alignment, int, S_IRUGO); module_param_named(unmap_granularity, sdebug_unmap_granularity, int, S_IRUGO); module_param_named(unmap_max_blocks, sdebug_unmap_max_blocks, int, S_IRUGO); module_param_named(unmap_max_desc, sdebug_unmap_max_desc, int, S_IRUGO); +module_param_named(atomic_wr_max_length, sdebug_atomic_wr_max_length, int, S_IRUGO); +module_param_named(atomic_wr_align, sdebug_atomic_wr_align, int, S_IRUGO); +module_param_named(atomic_wr_gran, sdebug_atomic_wr_gran, int, S_IRUGO); +module_param_named(atomic_wr_max_length_bndry, sdebug_atomic_wr_max_length_bndry, int, S_IRUGO); +module_param_named(atomic_wr_max_bndry, sdebug_atomic_wr_max_bndry, int, S_IRUGO); module_param_named(uuid_ctl, sdebug_uuid_ctl, int, S_IRUGO); module_param_named(virtual_gb, sdebug_virtual_gb, int, S_IRUGO | S_IWUSR); module_param_named(vpd_use_hostno, sdebug_vpd_use_hostno, int, @@ -6361,6 +6676,7 @@ MODULE_PARM_DESC(lbprz, MODULE_PARM_DESC(lbpu, "enable LBP, support UNMAP command 
(def=0)"); MODULE_PARM_DESC(lbpws, "enable LBP, support WRITE SAME(16) with UNMAP bit (def=0)"); MODULE_PARM_DESC(lbpws10, "enable LBP, support WRITE SAME(10) with UNMAP bit (def=0)"); +MODULE_PARM_DESC(atomic_write, "enable ATOMIC WRITE support, support WRITE ATOMIC(16) (def=0)"); MODULE_PARM_DESC(lowest_aligned, "lowest aligned lba (def=0)"); MODULE_PARM_DESC(lun_format, "LUN format: 0->peripheral (def); 1 --> flat address method"); MODULE_PARM_DESC(max_luns, "number of LUNs per target to simulate(def=1)"); @@ -6392,6 +6708,11 @@ MODULE_PARM_DESC(unmap_alignment, "lowest aligned thin provisioning lba (def=0)" MODULE_PARM_DESC(unmap_granularity, "thin provisioning granularity in blocks (def=1)"); MODULE_PARM_DESC(unmap_max_blocks, "max # of blocks can be unmapped in one cmd (def=0xffffffff)"); MODULE_PARM_DESC(unmap_max_desc, "max # of ranges that can be unmapped in one cmd (def=256)"); +MODULE_PARM_DESC(atomic_wr_max_length, "max # of blocks can be atomically written in one cmd (def=8192)"); +MODULE_PARM_DESC(atomic_wr_align, "minimum alignment of atomic write in blocks (def=2)"); +MODULE_PARM_DESC(atomic_wr_gran, "minimum granularity of atomic write in blocks (def=2)"); +MODULE_PARM_DESC(atomic_wr_max_length_bndry, "max # of blocks can be atomically written in one cmd with boundary set (def=8192)"); +MODULE_PARM_DESC(atomic_wr_max_bndry, "max # boundaries per atomic write (def=128)"); MODULE_PARM_DESC(uuid_ctl, "1->use uuid for lu name, 0->don't, 2->all use same (def=0)"); MODULE_PARM_DESC(virtual_gb, "virtual gigabyte (GiB) size (def=0 -> use dev_size_mb)"); @@ -7563,6 +7884,7 @@ static int __init scsi_debug_init(void) return -EINVAL; } } + xa_init_flags(per_store_ap, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); if (want_store) { idx = sdebug_add_store(); @@ -7770,7 +8092,9 @@ static int sdebug_add_store(void) map_region(sip, 0, 2); } - rwlock_init(&sip->macc_lck); + rwlock_init(&sip->macc_data_lck); + rwlock_init(&sip->macc_meta_lck); + rwlock_init(&sip->macc_sector_lck); return (int)n_idx; err: sdebug_erase_store((int)n_idx, sip); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ec39acc986d6..3958a6d14bf4 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -631,8 +631,7 @@ static bool scsi_end_request(struct request *req, blk_status_t error, if (blk_update_request(req, error, bytes)) return true; - // XXX: - if (blk_queue_add_random(q)) + if (q->limits.features & BLK_FEAT_ADD_RANDOM) add_disk_randomness(req->q->disk); WARN_ON_ONCE(!blk_rq_is_passthrough(req) && @@ -1140,9 +1139,9 @@ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) */ count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); - if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) { + if (blk_rq_bytes(rq) & rq->q->limits.dma_pad_mask) { unsigned int pad_len = - (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; + (rq->q->limits.dma_pad_mask & ~blk_rq_bytes(rq)) + 1; last_sg->length += pad_len; cmd->extra_len += pad_len; @@ -1987,7 +1986,7 @@ void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim) shost->dma_alignment, dma_get_cache_alignment() - 1); if (shost->no_highmem) - lim->bounce = BLK_BOUNCE_HIGH; + lim->features |= BLK_FEAT_BOUNCE_HIGH; dma_set_seg_boundary(dev, shost->dma_boundary); dma_set_max_seg_size(dev, shost->max_segment_size); diff --git a/drivers/scsi/scsi_trace.c b/drivers/scsi/scsi_trace.c index 41a950075913..3e47c4472a80 100644 --- a/drivers/scsi/scsi_trace.c +++ b/drivers/scsi/scsi_trace.c @@ -326,6 +326,26 @@ out: } static const char * 
+scsi_trace_atomic_write16_out(struct trace_seq *p, unsigned char *cdb, int len) +{ + const char *ret = trace_seq_buffer_ptr(p); + unsigned int boundary_size; + unsigned int nr_blocks; + sector_t lba; + + lba = get_unaligned_be64(&cdb[2]); + boundary_size = get_unaligned_be16(&cdb[10]); + nr_blocks = get_unaligned_be16(&cdb[12]); + + trace_seq_printf(p, "lba=%llu txlen=%u boundary_size=%u", + lba, nr_blocks, boundary_size); + + trace_seq_putc(p, 0); + + return ret; +} + +static const char * scsi_trace_varlen(struct trace_seq *p, unsigned char *cdb, int len) { switch (SERVICE_ACTION32(cdb)) { @@ -385,6 +405,8 @@ scsi_trace_parse_cdb(struct trace_seq *p, unsigned char *cdb, int len) return scsi_trace_zbc_in(p, cdb, len); case ZBC_OUT: return scsi_trace_zbc_out(p, cdb, len); + case WRITE_ATOMIC_16: + return scsi_trace_atomic_write16_out(p, cdb, len); default: return scsi_trace_misc(p, cdb, len); } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 6b64af7d4927..2e933fd1de70 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -102,12 +102,13 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #define SD_MINORS 16 -static void sd_config_discard(struct scsi_disk *, unsigned int); -static void sd_config_write_same(struct scsi_disk *); +static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, + unsigned int mode); +static void sd_config_write_same(struct scsi_disk *sdkp, + struct queue_limits *lim); static int sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); static void sd_shutdown(struct device *); -static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); static void scsi_disk_release(struct device *cdev); static DEFINE_IDA(sd_index_ida); @@ -120,17 +121,18 @@ static const char *sd_cache_types[] = { "write back, no read (daft)" }; -static void sd_set_flush_flag(struct scsi_disk *sdkp) +static void sd_set_flush_flag(struct scsi_disk *sdkp, + struct queue_limits *lim) { - bool wc = false, fua = false; - if (sdkp->WCE) { - wc = true; + lim->features |= BLK_FEAT_WRITE_CACHE; if (sdkp->DPOFUA) - fua = true; + lim->features |= BLK_FEAT_FUA; + else + lim->features &= ~BLK_FEAT_FUA; + } else { + lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); } - - blk_queue_write_cache(sdkp->disk->queue, wc, fua); } static ssize_t @@ -168,9 +170,18 @@ cache_type_store(struct device *dev, struct device_attribute *attr, wce = (ct & 0x02) && !sdkp->write_prot ? 
1 : 0; if (sdkp->cache_override) { + struct queue_limits lim; + sdkp->WCE = wce; sdkp->RCD = rcd; - sd_set_flush_flag(sdkp); + + lim = queue_limits_start_update(sdkp->disk->queue); + sd_set_flush_flag(sdkp, &lim); + blk_mq_freeze_queue(sdkp->disk->queue); + ret = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (ret) + return ret; return count; } @@ -457,16 +468,12 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; - int mode; + struct queue_limits lim; + int mode, err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (sd_is_zoned(sdkp)) { - sd_config_discard(sdkp, SD_LBP_DISABLE); - return count; - } - if (sdp->type != TYPE_DISK) return -EINVAL; @@ -474,8 +481,13 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, if (mode < 0) return -EINVAL; - sd_config_discard(sdkp, mode); - + lim = queue_limits_start_update(sdkp->disk->queue); + sd_config_discard(sdkp, &lim, mode); + blk_mq_freeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (err) + return err; return count; } static DEVICE_ATTR_RW(provisioning_mode); @@ -558,6 +570,7 @@ max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; + struct queue_limits lim; unsigned long max; int err; @@ -579,8 +592,13 @@ max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, sdkp->max_ws_blocks = max; } - sd_config_write_same(sdkp); - + lim = queue_limits_start_update(sdkp->disk->queue); + sd_config_write_same(sdkp, &lim); + blk_mq_freeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (err) + return err; return count; } static DEVICE_ATTR_RW(max_write_same_blocks); @@ -823,25 +841,28 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, return protect; } -static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) +static void sd_disable_discard(struct scsi_disk *sdkp) +{ + sdkp->provisioning_mode = SD_LBP_DISABLE; + blk_queue_disable_discard(sdkp->disk->queue); +} + +static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, + unsigned int mode) { - struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; - q->limits.discard_alignment = - sdkp->unmap_alignment * logical_block_size; - q->limits.discard_granularity = - max(sdkp->physical_block_size, - sdkp->unmap_granularity * logical_block_size); + lim->discard_alignment = sdkp->unmap_alignment * logical_block_size; + lim->discard_granularity = max(sdkp->physical_block_size, + sdkp->unmap_granularity * logical_block_size); sdkp->provisioning_mode = mode; switch (mode) { case SD_LBP_FULL: case SD_LBP_DISABLE: - blk_queue_max_discard_sectors(q, 0); - return; + break; case SD_LBP_UNMAP: max_blocks = min_not_zero(sdkp->max_unmap_blocks, @@ -872,7 +893,8 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) break; } - blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9)); + lim->max_hw_discard_sectors = max_blocks * + (logical_block_size >> SECTOR_SHIFT); } static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) @@ -918,6 
+940,64 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) return scsi_alloc_sgtables(cmd); } +static void sd_config_atomic(struct scsi_disk *sdkp, struct queue_limits *lim) +{ + unsigned int logical_block_size = sdkp->device->sector_size, + physical_block_size_sectors, max_atomic, unit_min, unit_max; + + if ((!sdkp->max_atomic && !sdkp->max_atomic_with_boundary) || + sdkp->protection_type == T10_PI_TYPE2_PROTECTION) + return; + + physical_block_size_sectors = sdkp->physical_block_size / + sdkp->device->sector_size; + + unit_min = rounddown_pow_of_two(sdkp->atomic_granularity ? + sdkp->atomic_granularity : + physical_block_size_sectors); + + /* + * Only use atomic boundary when we have the odd scenario of + * sdkp->max_atomic == 0, which the spec does permit. + */ + if (sdkp->max_atomic) { + max_atomic = sdkp->max_atomic; + unit_max = rounddown_pow_of_two(sdkp->max_atomic); + sdkp->use_atomic_write_boundary = 0; + } else { + max_atomic = sdkp->max_atomic_with_boundary; + unit_max = rounddown_pow_of_two(sdkp->max_atomic_boundary); + sdkp->use_atomic_write_boundary = 1; + } + + /* + * Ensure compliance with granularity and alignment. For now, keep it + * simple and just don't support atomic writes for values mismatched + * with max_{boundary}atomic, physical block size, and + * atomic_granularity itself. + * + * We're really being distrustful by checking unit_max also... + */ + if (sdkp->atomic_granularity > 1) { + if (unit_min > 1 && unit_min % sdkp->atomic_granularity) + return; + if (unit_max > 1 && unit_max % sdkp->atomic_granularity) + return; + } + + if (sdkp->atomic_alignment > 1) { + if (unit_min > 1 && unit_min % sdkp->atomic_alignment) + return; + if (unit_max > 1 && unit_max % sdkp->atomic_alignment) + return; + } + + lim->atomic_write_hw_max = max_atomic * logical_block_size; + lim->atomic_write_hw_boundary = 0; + lim->atomic_write_hw_unit_min = unit_min * logical_block_size; + lim->atomic_write_hw_unit_max = unit_max * logical_block_size; +} + static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) { @@ -1000,9 +1080,16 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) return sd_setup_write_same10_cmnd(cmd, false); } -static void sd_config_write_same(struct scsi_disk *sdkp) +static void sd_disable_write_same(struct scsi_disk *sdkp) +{ + sdkp->device->no_write_same = 1; + sdkp->max_ws_blocks = 0; + blk_queue_disable_write_zeroes(sdkp->disk->queue); +} + +static void sd_config_write_same(struct scsi_disk *sdkp, + struct queue_limits *lim) { - struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; if (sdkp->device->no_write_same) { @@ -1056,8 +1143,8 @@ static void sd_config_write_same(struct scsi_disk *sdkp) } out: - blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks * - (logical_block_size >> 9)); + lim->max_write_zeroes_sectors = + sdkp->max_ws_blocks * (logical_block_size >> SECTOR_SHIFT); } static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) @@ -1209,6 +1296,26 @@ static int sd_cdl_dld(struct scsi_disk *sdkp, struct scsi_cmnd *scmd) return (hint - IOPRIO_HINT_DEV_DURATION_LIMIT_1) + 1; } +static blk_status_t sd_setup_atomic_cmnd(struct scsi_cmnd *cmd, + sector_t lba, unsigned int nr_blocks, + bool boundary, unsigned char flags) +{ + cmd->cmd_len = 16; + cmd->cmnd[0] = WRITE_ATOMIC_16; + cmd->cmnd[1] = flags; + put_unaligned_be64(lba, &cmd->cmnd[2]); + put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); + if (boundary) + put_unaligned_be16(nr_blocks, 
&cmd->cmnd[10]); + else + put_unaligned_be16(0, &cmd->cmnd[10]); + put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); + cmd->cmnd[14] = 0; + cmd->cmnd[15] = 0; + + return BLK_STS_OK; +} + static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); @@ -1274,6 +1381,10 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { ret = sd_setup_rw32_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); + } else if (rq->cmd_flags & REQ_ATOMIC && write) { + ret = sd_setup_atomic_cmnd(cmd, lba, nr_blocks, + sdkp->use_atomic_write_boundary, + protect | fua); } else if (sdp->use_16_for_rw || (nr_blocks > 0xffff)) { ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); @@ -2247,15 +2358,14 @@ static int sd_done(struct scsi_cmnd *SCpnt) case 0x24: /* INVALID FIELD IN CDB */ switch (SCpnt->cmnd[0]) { case UNMAP: - sd_config_discard(sdkp, SD_LBP_DISABLE); + sd_disable_discard(sdkp); break; case WRITE_SAME_16: case WRITE_SAME: if (SCpnt->cmnd[1] & 8) { /* UNMAP */ - sd_config_discard(sdkp, SD_LBP_DISABLE); + sd_disable_discard(sdkp); } else { - sdkp->device->no_write_same = 1; - sd_config_write_same(sdkp); + sd_disable_write_same(sdkp); req->rq_flags |= RQF_QUIET; } break; @@ -2267,7 +2377,7 @@ static int sd_done(struct scsi_cmnd *SCpnt) } out: - if (sd_is_zoned(sdkp)) + if (sdkp->device->type == TYPE_ZBC) good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr); SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt, @@ -2461,11 +2571,13 @@ static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer return 0; } -static void sd_config_protection(struct scsi_disk *sdkp) +static void sd_config_protection(struct scsi_disk *sdkp, + struct queue_limits *lim) { struct scsi_device *sdp = sdkp->device; - sd_dif_config_host(sdkp); + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + sd_dif_config_host(sdkp, lim); if (!sdkp->protection_type) return; @@ -2514,7 +2626,7 @@ static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, #define READ_CAPACITY_RETRIES_ON_RESET 10 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, - unsigned char *buffer) + struct queue_limits *lim, unsigned char *buffer) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; @@ -2588,7 +2700,7 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, /* Lowest aligned logical block */ alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size; - blk_queue_alignment_offset(sdp->request_queue, alignment); + lim->alignment_offset = alignment; if (alignment && sdkp->first_scan) sd_printk(KERN_NOTICE, sdkp, "physical block alignment offset: %u\n", alignment); @@ -2599,7 +2711,7 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; - sd_config_discard(sdkp, SD_LBP_WS16); + sd_config_discard(sdkp, lim, SD_LBP_WS16); } sdkp->capacity = lba + 1; @@ -2702,13 +2814,14 @@ static int sd_try_rc16_first(struct scsi_device *sdp) * read disk capacity */ static void -sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer) +sd_read_capacity(struct scsi_disk *sdkp, struct queue_limits *lim, + unsigned char *buffer) { int sector_size; struct scsi_device *sdp = sdkp->device; if (sd_try_rc16_first(sdp)) { - sector_size = read_capacity_16(sdkp, sdp, buffer); + sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size == -EOVERFLOW) goto 
got_data; if (sector_size == -ENODEV) @@ -2728,7 +2841,7 @@ sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer) int old_sector_size = sector_size; sd_printk(KERN_NOTICE, sdkp, "Very big device. " "Trying to use READ CAPACITY(16).\n"); - sector_size = read_capacity_16(sdkp, sdp, buffer); + sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size < 0) { sd_printk(KERN_NOTICE, sdkp, "Using 0xffffffff as device size\n"); @@ -2787,9 +2900,8 @@ got_data: */ sector_size = 512; } - blk_queue_logical_block_size(sdp->request_queue, sector_size); - blk_queue_physical_block_size(sdp->request_queue, - sdkp->physical_block_size); + lim->logical_block_size = sector_size; + lim->physical_block_size = sdkp->physical_block_size; sdkp->device->sector_size = sector_size; if (sdkp->capacity > 0xffffffff) @@ -3195,11 +3307,30 @@ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer) return; } -/** - * sd_read_block_limits - Query disk device for preferred I/O sizes. - * @sdkp: disk to query +static unsigned int sd_discard_mode(struct scsi_disk *sdkp) +{ + if (!sdkp->lbpvpd) { + /* LBP VPD page not provided */ + if (sdkp->max_unmap_blocks) + return SD_LBP_UNMAP; + return SD_LBP_WS16; + } + + /* LBP VPD page tells us what to use */ + if (sdkp->lbpu && sdkp->max_unmap_blocks) + return SD_LBP_UNMAP; + if (sdkp->lbpws) + return SD_LBP_WS16; + if (sdkp->lbpws10) + return SD_LBP_WS10; + return SD_LBP_DISABLE; +} + +/* + * Query disk device for preferred I/O sizes. */ -static void sd_read_block_limits(struct scsi_disk *sdkp) +static void sd_read_block_limits(struct scsi_disk *sdkp, + struct queue_limits *lim) { struct scsi_vpd *vpd; @@ -3219,7 +3350,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) sdkp->max_ws_blocks = (u32)get_unaligned_be64(&vpd->data[36]); if (!sdkp->lbpme) - goto out; + goto config_atomic; lba_count = get_unaligned_be32(&vpd->data[20]); desc_count = get_unaligned_be32(&vpd->data[24]); @@ -3233,23 +3364,16 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); - if (!sdkp->lbpvpd) { /* LBP VPD page not provided */ - - if (sdkp->max_unmap_blocks) - sd_config_discard(sdkp, SD_LBP_UNMAP); - else - sd_config_discard(sdkp, SD_LBP_WS16); - - } else { /* LBP VPD page tells us what to use */ - if (sdkp->lbpu && sdkp->max_unmap_blocks) - sd_config_discard(sdkp, SD_LBP_UNMAP); - else if (sdkp->lbpws) - sd_config_discard(sdkp, SD_LBP_WS16); - else if (sdkp->lbpws10) - sd_config_discard(sdkp, SD_LBP_WS10); - else - sd_config_discard(sdkp, SD_LBP_DISABLE); - } + sd_config_discard(sdkp, lim, sd_discard_mode(sdkp)); + +config_atomic: + sdkp->max_atomic = get_unaligned_be32(&vpd->data[44]); + sdkp->atomic_alignment = get_unaligned_be32(&vpd->data[48]); + sdkp->atomic_granularity = get_unaligned_be32(&vpd->data[52]); + sdkp->max_atomic_with_boundary = get_unaligned_be32(&vpd->data[56]); + sdkp->max_atomic_boundary = get_unaligned_be32(&vpd->data[60]); + + sd_config_atomic(sdkp, lim); } out: @@ -3268,13 +3392,10 @@ static void sd_read_block_limits_ext(struct scsi_disk *sdkp) rcu_read_unlock(); } -/** - * sd_read_block_characteristics - Query block dev. 
characteristics - * @sdkp: disk to query - */ -static void sd_read_block_characteristics(struct scsi_disk *sdkp) +/* Query block device characteristics */ +static void sd_read_block_characteristics(struct scsi_disk *sdkp, + struct queue_limits *lim) { - struct request_queue *q = sdkp->disk->queue; struct scsi_vpd *vpd; u16 rot; @@ -3290,37 +3411,13 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp) sdkp->zoned = (vpd->data[8] >> 4) & 3; rcu_read_unlock(); - if (rot == 1) { - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); - } - - -#ifdef CONFIG_BLK_DEV_ZONED /* sd_probe rejects ZBD devices early otherwise */ - if (sdkp->device->type == TYPE_ZBC) { - /* - * Host-managed. - */ - disk_set_zoned(sdkp->disk); - - /* - * Per ZBC and ZAC specifications, writes in sequential write - * required zones of host-managed devices must be aligned to - * the device physical block size. - */ - blk_queue_zone_write_granularity(q, sdkp->physical_block_size); - } else { - /* - * Host-aware devices are treated as conventional. - */ - WARN_ON_ONCE(blk_queue_is_zoned(q)); - } -#endif /* CONFIG_BLK_DEV_ZONED */ + if (rot == 1) + lim->features &= ~(BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (!sdkp->first_scan) return; - if (blk_queue_is_zoned(q)) + if (sdkp->device->type == TYPE_ZBC) sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n"); else if (sdkp->zoned == 1) sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n"); @@ -3601,10 +3698,11 @@ static int sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; - struct request_queue *q = sdkp->disk->queue; sector_t old_capacity = sdkp->capacity; + struct queue_limits lim; unsigned char *buffer; - unsigned int dev_max, rw_max; + unsigned int dev_max; + int err; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); @@ -3625,12 +3723,14 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_spinup_disk(sdkp); + lim = queue_limits_start_update(sdkp->disk->queue); + /* * Without media there is no reason to ask; moreover, some devices * react badly if we do. */ if (sdkp->media_present) { - sd_read_capacity(sdkp, buffer); + sd_read_capacity(sdkp, &lim, buffer); /* * Some USB/UAS devices return generic values for mode pages * until the media has been accessed. Trigger a READ operation @@ -3644,15 +3744,14 @@ static int sd_revalidate_disk(struct gendisk *disk) * cause this to be updated correctly and any device which * doesn't support it should be treated as rotational. */ - blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); - blk_queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q); + lim.features |= (BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); - sd_read_block_limits(sdkp); + sd_read_block_limits(sdkp, &lim); sd_read_block_limits_ext(sdkp); - sd_read_block_characteristics(sdkp); - sd_zbc_read_zones(sdkp, buffer); + sd_read_block_characteristics(sdkp, &lim); + sd_zbc_read_zones(sdkp, &lim, buffer); sd_read_cpr(sdkp); } @@ -3664,64 +3763,50 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); - sd_config_protection(sdkp); + sd_config_protection(sdkp, &lim); } /* * We now have all cache related info, determine how we deal * with flush requests. 
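The sd and sr conversions in this patch all funnel queue changes through the same limits-update idiom instead of calling blk_queue_* setters directly. A condensed sketch of that pattern, using only the calls that appear in these hunks and a generic disk->queue for illustration:

    struct queue_limits lim;
    int err;

    lim = queue_limits_start_update(disk->queue);   /* snapshot current limits */
    /* ... adjust fields, e.g. lim.logical_block_size, lim.features ... */
    blk_mq_freeze_queue(disk->queue);               /* drain in-flight requests */
    err = queue_limits_commit_update(disk->queue, &lim);
    blk_mq_unfreeze_queue(disk->queue);
    if (err)
            return err;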
*/ - sd_set_flush_flag(sdkp); + sd_set_flush_flag(sdkp, &lim); /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; /* Some devices report a maximum block count for READ/WRITE requests. */ dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); - q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max); + lim.max_dev_sectors = logical_to_sectors(sdp, dev_max); if (sd_validate_min_xfer_size(sdkp)) - blk_queue_io_min(sdkp->disk->queue, - logical_to_bytes(sdp, sdkp->min_xfer_blocks)); + lim.io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); else - blk_queue_io_min(sdkp->disk->queue, 0); - - if (sd_validate_opt_xfer_size(sdkp, dev_max)) { - q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks); - rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks); - } else { - q->limits.io_opt = 0; - rw_max = min_not_zero(logical_to_sectors(sdp, dev_max), - (sector_t)BLK_DEF_MAX_SECTORS_CAP); - } + lim.io_min = 0; /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. */ - rw_max = min_not_zero(rw_max, sdp->host->opt_sectors); - - /* Do not exceed controller limit */ - rw_max = min(rw_max, queue_max_hw_sectors(q)); - - /* - * Only update max_sectors if previously unset or if the current value - * exceeds the capabilities of the hardware. - */ - if (sdkp->first_scan || - q->limits.max_sectors > q->limits.max_dev_sectors || - q->limits.max_sectors > q->limits.max_hw_sectors) { - q->limits.max_sectors = rw_max; - q->limits.max_user_sectors = rw_max; + lim.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; + if (sd_validate_opt_xfer_size(sdkp, dev_max)) { + lim.io_opt = min_not_zero(lim.io_opt, + logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); - sd_config_write_same(sdkp); + sd_config_write_same(sdkp, &lim); kfree(buffer); + blk_mq_freeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (err) + return err; + /* * For a zoned drive, revalidating the zones can be done only once * the gendisk capacity is set. 
So if this fails, set back the gendisk @@ -4119,8 +4204,6 @@ static int sd_resume(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); - sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); - if (opal_unlock_from_suspend(sdkp->opal_dev)) { sd_printk(KERN_NOTICE, sdkp, "OPAL unlock failed\n"); return -EIO; @@ -4137,12 +4220,13 @@ static int sd_resume_common(struct device *dev, bool runtime) if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; + sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); + if (!sd_do_start_stop(sdkp->device, runtime)) { sdkp->suspended = false; return 0; } - sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); ret = sd_start_stop_device(sdkp, 1); if (!ret) { sd_resume(dev); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 49dd600bfa48..36382eca941c 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -115,6 +115,13 @@ struct scsi_disk { u32 max_unmap_blocks; u32 unmap_granularity; u32 unmap_alignment; + + u32 max_atomic; + u32 atomic_alignment; + u32 atomic_granularity; + u32 max_atomic_with_boundary; + u32 max_atomic_boundary; + u32 index; unsigned int physical_block_size; unsigned int max_medium_access_timeouts; @@ -148,6 +155,7 @@ struct scsi_disk { unsigned security : 1; unsigned ignore_medium_access_errors : 1; unsigned rscs : 1; /* reduced stream control support */ + unsigned use_atomic_write_boundary : 1; }; #define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev) @@ -220,26 +228,12 @@ static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sec return sector >> (ilog2(sdev->sector_size) - 9); } -#ifdef CONFIG_BLK_DEV_INTEGRITY - -extern void sd_dif_config_host(struct scsi_disk *); - -#else /* CONFIG_BLK_DEV_INTEGRITY */ - -static inline void sd_dif_config_host(struct scsi_disk *disk) -{ -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - -static inline int sd_is_zoned(struct scsi_disk *sdkp) -{ - return sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC; -} +void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim); #ifdef CONFIG_BLK_DEV_ZONED -int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]); +int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, + u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all); @@ -250,7 +244,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, #else /* CONFIG_BLK_DEV_ZONED */ -static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) +static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, + struct queue_limits *lim, u8 buf[SD_BUF_SIZE]) { return 0; } diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 1df847b5f747..ae6ce6f5d622 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -24,14 +24,15 @@ /* * Configure exchange of protection information between OS and HBA. 
*/ -void sd_dif_config_host(struct scsi_disk *sdkp) +void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_device *sdp = sdkp->device; - struct gendisk *disk = sdkp->disk; u8 type = sdkp->protection_type; - struct blk_integrity bi; + struct blk_integrity *bi = &lim->integrity; int dif, dix; + memset(bi, 0, sizeof(*bi)); + dif = scsi_host_dif_capable(sdp->host, type); dix = scsi_host_dix_capable(sdp->host, type); @@ -39,45 +40,33 @@ void sd_dif_config_host(struct scsi_disk *sdkp) dif = 0; dix = 1; } - if (!dix) { - blk_integrity_unregister(disk); + if (!dix) return; - } - - memset(&bi, 0, sizeof(bi)); /* Enable DMA of protection information */ - if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) { - if (type == T10_PI_TYPE3_PROTECTION) - bi.profile = &t10_pi_type3_ip; - else - bi.profile = &t10_pi_type1_ip; + if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) + bi->csum_type = BLK_INTEGRITY_CSUM_IP; + else + bi->csum_type = BLK_INTEGRITY_CSUM_CRC; - bi.flags |= BLK_INTEGRITY_IP_CHECKSUM; - } else - if (type == T10_PI_TYPE3_PROTECTION) - bi.profile = &t10_pi_type3_crc; - else - bi.profile = &t10_pi_type1_crc; + if (type != T10_PI_TYPE3_PROTECTION) + bi->flags |= BLK_INTEGRITY_REF_TAG; - bi.tuple_size = sizeof(struct t10_pi_tuple); + bi->tuple_size = sizeof(struct t10_pi_tuple); if (dif && type) { - bi.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; if (!sdkp->ATO) - goto out; + return; if (type == T10_PI_TYPE3_PROTECTION) - bi.tag_size = sizeof(u16) + sizeof(u32); + bi->tag_size = sizeof(u16) + sizeof(u32); else - bi.tag_size = sizeof(u16); + bi->tag_size = sizeof(u16); } sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIX %s, application tag size %u bytes\n", - bi.profile->name, bi.tag_size); -out: - blk_integrity_register(disk, &bi); + blk_integrity_profile_name(bi), bi->tag_size); } - diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 806036e48abe..c8b9654d30f0 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -232,7 +232,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, int zone_idx = 0; int ret; - if (!sd_is_zoned(sdkp)) + if (sdkp->device->type != TYPE_ZBC) /* Not a zoned device */ return -EOPNOTSUPP; @@ -300,7 +300,7 @@ static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd) struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t sector = blk_rq_pos(rq); - if (!sd_is_zoned(sdkp)) + if (sdkp->device->type != TYPE_ZBC) /* Not a zoned device */ return BLK_STS_IOERR; @@ -521,7 +521,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, unsigned char *buf, static void sd_zbc_print_zones(struct scsi_disk *sdkp) { - if (!sd_is_zoned(sdkp) || !sdkp->capacity) + if (sdkp->device->type != TYPE_ZBC || !sdkp->capacity) return; if (sdkp->capacity & (sdkp->zone_info.zone_blocks - 1)) @@ -565,12 +565,6 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) sdkp->zone_info.zone_blocks = zone_blocks; sdkp->zone_info.nr_zones = nr_zones; - blk_queue_chunk_sectors(q, - logical_to_sectors(sdkp->device, zone_blocks)); - - /* Enable block layer zone append emulation */ - blk_queue_max_zone_append_sectors(q, 0); - flags = memalloc_noio_save(); ret = blk_revalidate_disk_zones(disk); memalloc_noio_restore(flags); @@ -588,27 +582,31 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) /** * sd_zbc_read_zones - Read zone information and update the request queue * @sdkp: SCSI disk pointer. 
+ * @lim: queue limits to read into * @buf: 512 byte buffer used for storing SCSI command output. * * Read zone information and update the request queue zone characteristics and * also the zoned device information in *sdkp. Called by sd_revalidate_disk() * before the gendisk capacity has been set. */ -int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) +int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, + u8 buf[SD_BUF_SIZE]) { - struct gendisk *disk = sdkp->disk; - struct request_queue *q = disk->queue; unsigned int nr_zones; u32 zone_blocks = 0; int ret; - if (!sd_is_zoned(sdkp)) { - /* - * Device managed or normal SCSI disk, no special handling - * required. - */ + if (sdkp->device->type != TYPE_ZBC) return 0; - } + + lim->features |= BLK_FEAT_ZONED; + + /* + * Per ZBC and ZAC specifications, writes in sequential write required + * zones of host-managed devices must be aligned to the device physical + * block size. + */ + lim->zone_write_granularity = sdkp->physical_block_size; /* READ16/WRITE16/SYNC16 is mandatory for ZBC devices */ sdkp->device->use_16_for_rw = 1; @@ -625,18 +623,20 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) if (ret != 0) goto err; - /* The drive satisfies the kernel restrictions: set it up */ - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - if (sdkp->zones_max_open == U32_MAX) - disk_set_max_open_zones(disk, 0); - else - disk_set_max_open_zones(disk, sdkp->zones_max_open); - disk_set_max_active_zones(disk, 0); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); - sdkp->early_zone_info.nr_zones = nr_zones; sdkp->early_zone_info.zone_blocks = zone_blocks; + /* The drive satisfies the kernel restrictions: set it up */ + if (sdkp->zones_max_open == U32_MAX) + lim->max_open_zones = 0; + else + lim->max_open_zones = sdkp->zones_max_open; + lim->max_active_zones = 0; + lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks); + /* Enable block layer zone append emulation */ + lim->max_zone_append_sectors = 0; + return 0; err: diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 7ab000942b97..3f491019103e 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -111,7 +111,7 @@ static struct lock_class_key sr_bio_compl_lkclass; static int sr_open(struct cdrom_device_info *, int); static void sr_release(struct cdrom_device_info *); -static void get_sectorsize(struct scsi_cd *); +static int get_sectorsize(struct scsi_cd *); static int get_capabilities(struct scsi_cd *); static unsigned int sr_check_events(struct cdrom_device_info *cdi, @@ -473,15 +473,15 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) return BLK_STS_IOERR; } -static void sr_revalidate_disk(struct scsi_cd *cd) +static int sr_revalidate_disk(struct scsi_cd *cd) { struct scsi_sense_hdr sshdr; /* if the unit is not ready, nothing more to do */ if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr)) - return; + return 0; sr_cd_check(&cd->cdi); - get_sectorsize(cd); + return get_sectorsize(cd); } static int sr_block_open(struct gendisk *disk, blk_mode_t mode) @@ -494,13 +494,16 @@ static int sr_block_open(struct gendisk *disk, blk_mode_t mode) return -ENXIO; scsi_autopm_get_device(sdev); - if (disk_check_media_change(disk)) - sr_revalidate_disk(cd); + if (disk_check_media_change(disk)) { + ret = sr_revalidate_disk(cd); + if (ret) + goto out; + } mutex_lock(&cd->lock); ret = cdrom_open(&cd->cdi, mode); mutex_unlock(&cd->lock); - +out: scsi_autopm_put_device(sdev); if (ret) 
scsi_device_put(cd->device); @@ -685,7 +688,9 @@ static int sr_probe(struct device *dev) blk_pm_runtime_init(sdev->request_queue, dev); dev_set_drvdata(dev, cd); - sr_revalidate_disk(cd); + error = sr_revalidate_disk(cd); + if (error) + goto unregister_cdrom; error = device_add_disk(&sdev->sdev_gendev, disk, NULL); if (error) @@ -714,13 +719,14 @@ fail: } -static void get_sectorsize(struct scsi_cd *cd) +static int get_sectorsize(struct scsi_cd *cd) { + struct request_queue *q = cd->device->request_queue; static const u8 cmd[10] = { READ_CAPACITY }; unsigned char buffer[8] = { }; - int the_result; + struct queue_limits lim; + int err; int sector_size; - struct request_queue *queue; struct scsi_failure failure_defs[] = { { .result = SCMD_FAILURE_RESULT_ANY, @@ -736,10 +742,10 @@ static void get_sectorsize(struct scsi_cd *cd) }; /* Do the command and wait.. */ - the_result = scsi_execute_cmd(cd->device, cmd, REQ_OP_DRV_IN, buffer, + err = scsi_execute_cmd(cd->device, cmd, REQ_OP_DRV_IN, buffer, sizeof(buffer), SR_TIMEOUT, MAX_RETRIES, &exec_args); - if (the_result) { + if (err) { cd->capacity = 0x1fffff; sector_size = 2048; /* A guess, just in case */ } else { @@ -789,10 +795,12 @@ static void get_sectorsize(struct scsi_cd *cd) set_capacity(cd->disk, cd->capacity); } - queue = cd->device->request_queue; - blk_queue_logical_block_size(queue, sector_size); - - return; + lim = queue_limits_start_update(q); + lim.logical_block_size = sector_size; + blk_mq_freeze_queue(q); + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + return err; } static int get_capabilities(struct scsi_cd *cd) diff --git a/drivers/soc/litex/Kconfig b/drivers/soc/litex/Kconfig index e6ba3573a772..f3f869639588 100644 --- a/drivers/soc/litex/Kconfig +++ b/drivers/soc/litex/Kconfig @@ -7,7 +7,7 @@ config LITEX config LITEX_SOC_CONTROLLER tristate "Enable LiteX SoC Controller driver" - depends on OF || COMPILE_TEST + depends on OF depends on HAS_IOMEM select LITEX help diff --git a/drivers/soc/litex/litex_soc_ctrl.c b/drivers/soc/litex/litex_soc_ctrl.c index 10813299aa10..72c44119dd54 100644 --- a/drivers/soc/litex/litex_soc_ctrl.c +++ b/drivers/soc/litex/litex_soc_ctrl.c @@ -82,13 +82,11 @@ static int litex_reset_handler(struct notifier_block *this, unsigned long mode, return NOTIFY_DONE; } -#ifdef CONFIG_OF static const struct of_device_id litex_soc_ctrl_of_match[] = { {.compatible = "litex,soc-controller"}, {}, }; MODULE_DEVICE_TABLE(of, litex_soc_ctrl_of_match); -#endif /* CONFIG_OF */ static int litex_soc_ctrl_probe(struct platform_device *pdev) { @@ -130,7 +128,7 @@ static void litex_soc_ctrl_remove(struct platform_device *pdev) static struct platform_driver litex_soc_ctrl_driver = { .driver = { .name = "litex-soc-controller", - .of_match_table = of_match_ptr(litex_soc_ctrl_of_match) + .of_match_table = litex_soc_ctrl_of_match, }, .probe = litex_soc_ctrl_probe, .remove_new = litex_soc_ctrl_remove, diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c index 40fb09d69014..65279243072c 100644 --- a/drivers/soc/qcom/pmic_glink.c +++ b/drivers/soc/qcom/pmic_glink.c @@ -348,11 +348,15 @@ static void pmic_glink_remove(struct platform_device *pdev) mutex_unlock(&__pmic_glink_lock); } +static const unsigned long pmic_glink_sc8280xp_client_mask = BIT(PMIC_GLINK_CLIENT_BATT) | + BIT(PMIC_GLINK_CLIENT_ALTMODE); + static const unsigned long pmic_glink_sm8450_client_mask = BIT(PMIC_GLINK_CLIENT_BATT) | BIT(PMIC_GLINK_CLIENT_ALTMODE) | BIT(PMIC_GLINK_CLIENT_UCSI); static const struct 
of_device_id pmic_glink_of_match[] = { + { .compatible = "qcom,sc8280xp-pmic-glink", .data = &pmic_glink_sc8280xp_client_mask }, { .compatible = "qcom,pmic-glink", .data = &pmic_glink_sm8450_client_mask }, {} }; diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c index e358ac5b4509..96a524772549 100644 --- a/drivers/spi/spi-axi-spi-engine.c +++ b/drivers/spi/spi-axi-spi-engine.c @@ -164,16 +164,20 @@ static void spi_engine_gen_xfer(struct spi_engine_program *p, bool dry, } static void spi_engine_gen_sleep(struct spi_engine_program *p, bool dry, - int delay_ns, u32 sclk_hz) + int delay_ns, int inst_ns, u32 sclk_hz) { unsigned int t; - /* negative delay indicates error, e.g. from spi_delay_to_ns() */ - if (delay_ns <= 0) + /* + * Negative delay indicates error, e.g. from spi_delay_to_ns(). And if + * delay is less than the instruction execution time, there is no need + * for an extra sleep instruction since the instruction execution time + * will already cover the required delay. + */ + if (delay_ns < 0 || delay_ns <= inst_ns) return; - /* rounding down since executing the instruction adds a couple of ticks delay */ - t = DIV_ROUND_DOWN_ULL((u64)delay_ns * sclk_hz, NSEC_PER_SEC); + t = DIV_ROUND_UP_ULL((u64)(delay_ns - inst_ns) * sclk_hz, NSEC_PER_SEC); while (t) { unsigned int n = min(t, 256U); @@ -220,10 +224,16 @@ static void spi_engine_compile_message(struct spi_message *msg, bool dry, struct spi_device *spi = msg->spi; struct spi_controller *host = spi->controller; struct spi_transfer *xfer; - int clk_div, new_clk_div; + int clk_div, new_clk_div, inst_ns; bool keep_cs = false; u8 bits_per_word = 0; + /* + * Take into account instruction execution time for more accurate sleep + * times, especially when the delay is small.
+ */ + inst_ns = DIV_ROUND_UP(NSEC_PER_SEC, host->max_speed_hz); + clk_div = 1; spi_engine_program_add_cmd(p, dry, @@ -252,7 +262,7 @@ static void spi_engine_compile_message(struct spi_message *msg, bool dry, spi_engine_gen_xfer(p, dry, xfer); spi_engine_gen_sleep(p, dry, spi_delay_to_ns(&xfer->delay, xfer), - xfer->effective_speed_hz); + inst_ns, xfer->effective_speed_hz); if (xfer->cs_change) { if (list_is_last(&xfer->transfer_list, &msg->transfers)) { @@ -262,7 +272,7 @@ static void spi_engine_compile_message(struct spi_message *msg, bool dry, spi_engine_gen_cs(p, dry, spi, false); spi_engine_gen_sleep(p, dry, spi_delay_to_ns( - &xfer->cs_change_delay, xfer), + &xfer->cs_change_delay, xfer), inst_ns, xfer->effective_speed_hz); if (!list_next_entry(xfer, transfer_list)->cs_off) diff --git a/drivers/spi/spi-davinci.c b/drivers/spi/spi-davinci.c index be3998104bfb..f7e8b5efa50e 100644 --- a/drivers/spi/spi-davinci.c +++ b/drivers/spi/spi-davinci.c @@ -984,6 +984,9 @@ static int davinci_spi_probe(struct platform_device *pdev) return ret; free_dma: + /* This bit needs to be cleared to disable dspi->clk */ + clear_io_bits(dspi->base + SPIGCR1, SPIGCR1_POWERDOWN_MASK); + if (dspi->dma_rx) { dma_release_channel(dspi->dma_rx); dma_release_channel(dspi->dma_tx); @@ -1013,6 +1016,9 @@ static void davinci_spi_remove(struct platform_device *pdev) spi_bitbang_stop(&dspi->bitbang); + /* This bit needs to be cleared to disable dspi->clk */ + clear_io_bits(dspi->base + SPIGCR1, SPIGCR1_POWERDOWN_MASK); + if (dspi->dma_rx) { dma_release_channel(dspi->dma_rx); dma_release_channel(dspi->dma_tx); diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 33164ebdb583..1439883326cf 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -1050,7 +1050,7 @@ static struct spi_imx_devtype_data imx35_cspi_devtype_data = { .rx_available = mx31_rx_available, .reset = mx31_reset, .fifo_size = 8, - .has_dmamode = true, + .has_dmamode = false, .dynamic_burst = false, .has_targetmode = false, .devtype = IMX35_CSPI, diff --git a/drivers/spi/spi-mux.c b/drivers/spi/spi-mux.c index 5d72e3d59df8..c02c4204442f 100644 --- a/drivers/spi/spi-mux.c +++ b/drivers/spi/spi-mux.c @@ -158,12 +158,14 @@ static int spi_mux_probe(struct spi_device *spi) /* supported modes are the same as our parent's */ ctlr->mode_bits = spi->controller->mode_bits; ctlr->flags = spi->controller->flags; + ctlr->bits_per_word_mask = spi->controller->bits_per_word_mask; ctlr->transfer_one_message = spi_mux_transfer_one_message; ctlr->setup = spi_mux_setup; ctlr->num_chipselect = mux_control_states(priv->mux); ctlr->bus_num = -1; ctlr->dev.of_node = spi->dev.of_node; ctlr->must_async = true; + ctlr->defer_optimize_message = true; ret = devm_spi_register_controller(&spi->dev, ctlr); if (ret) diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c index 7e3083b83534..002f29dbcea6 100644 --- a/drivers/spi/spi-omap2-mcspi.c +++ b/drivers/spi/spi-omap2-mcspi.c @@ -1277,24 +1277,11 @@ static int omap2_mcspi_prepare_message(struct spi_controller *ctlr, /* * Check if this transfer contains only one word; - * OR contains 1 to 4 words, with bits_per_word == 8 and no delay between each word - * OR contains 1 to 2 words, with bits_per_word == 16 and no delay between each word - * - * If one of the two last case is true, this also change the bits_per_word of this - * transfer to make it a bit faster.
- * It's not an issue to change the bits_per_word here even if the multi-mode is not - * applicable for this message, the signal on the wire will be the same. */ if (bits_per_word < 8 && tr->len == 1) { /* multi-mode is applicable, only one word (1..7 bits) */ - } else if (tr->word_delay.value == 0 && bits_per_word == 8 && tr->len <= 4) { - /* multi-mode is applicable, only one "bigger" word (8,16,24,32 bits) */ - tr->bits_per_word = tr->len * bits_per_word; - } else if (tr->word_delay.value == 0 && bits_per_word == 16 && tr->len <= 2) { - /* multi-mode is applicable, only one "bigger" word (16,32 bits) */ - tr->bits_per_word = tr->len * bits_per_word / 2; } else if (bits_per_word >= 8 && tr->len == bits_per_word / 8) { - /* multi-mode is applicable, only one word (9..15,17..32 bits) */ + /* multi-mode is applicable, only one word (8..32 bits) */ } else { /* multi-mode is not applicable: more than one word in the transfer */ mcspi->use_multi_mode = false; diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index fc13fa192189..0f04e832f9ec 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2151,7 +2151,8 @@ static void __spi_unoptimize_message(struct spi_message *msg) */ static void spi_maybe_unoptimize_message(struct spi_message *msg) { - if (!msg->pre_optimized && msg->optimized) + if (!msg->pre_optimized && msg->optimized && + !msg->spi->controller->defer_optimize_message) __spi_unoptimize_message(msg); } @@ -4294,6 +4295,11 @@ static int __spi_optimize_message(struct spi_device *spi, static int spi_maybe_optimize_message(struct spi_device *spi, struct spi_message *msg) { + if (spi->controller->defer_optimize_message) { + msg->spi = spi; + return 0; + } + if (msg->pre_optimized) return 0; @@ -4324,6 +4330,13 @@ int spi_optimize_message(struct spi_device *spi, struct spi_message *msg) { int ret; + /* + * Pre-optimization is not supported and optimization is deferred e.g. + * when using spi-mux. 
+ */ + if (spi->controller->defer_optimize_message) + return 0; + ret = __spi_optimize_message(spi, msg); if (ret) return ret; @@ -4350,6 +4363,9 @@ EXPORT_SYMBOL_GPL(spi_optimize_message); */ void spi_unoptimize_message(struct spi_message *msg) { + if (msg->spi->controller->defer_optimize_message) + return; + __spi_unoptimize_message(msg); msg->pre_optimized = false; } @@ -4432,8 +4448,6 @@ int spi_async(struct spi_device *spi, struct spi_message *message) spin_unlock_irqrestore(&ctlr->bus_lock_spinlock, flags); - spi_maybe_unoptimize_message(message); - return ret; } EXPORT_SYMBOL_GPL(spi_async); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 7f6ca8177845..a3e09adc4e76 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -148,35 +148,38 @@ static int iblock_configure_device(struct se_device *dev) dev->dev_attrib.is_nonrot = 1; bi = bdev_get_integrity(bd); - if (bi) { - struct bio_set *bs = &ib_dev->ibd_bio_set; - - if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-IP") || - !strcmp(bi->profile->name, "T10-DIF-TYPE1-IP")) { - pr_err("IBLOCK export of blk_integrity: %s not" - " supported\n", bi->profile->name); - ret = -ENOSYS; - goto out_blkdev_put; - } + if (!bi) + return 0; - if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-CRC")) { - dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE3_PROT; - } else if (!strcmp(bi->profile->name, "T10-DIF-TYPE1-CRC")) { + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + pr_err("IBLOCK export of blk_integrity: %s not supported\n", + blk_integrity_profile_name(bi)); + ret = -ENOSYS; + goto out_blkdev_put; + case BLK_INTEGRITY_CSUM_CRC: + if (bi->flags & BLK_INTEGRITY_REF_TAG) dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE1_PROT; - } + else + dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE3_PROT; + break; + default: + break; + } - if (dev->dev_attrib.pi_prot_type) { - if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) { - pr_err("Unable to allocate bioset for PI\n"); - ret = -ENOMEM; - goto out_blkdev_put; - } - pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", - &bs->bio_integrity_pool); + if (dev->dev_attrib.pi_prot_type) { + struct bio_set *bs = &ib_dev->ibd_bio_set; + + if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) { + pr_err("Unable to allocate bioset for PI\n"); + ret = -ENOMEM; + goto out_blkdev_put; } - dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; + pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", + &bs->bio_integrity_pool); } + dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; return 0; out_blkdev_put: diff --git a/drivers/tee/optee/ffa_abi.c b/drivers/tee/optee/ffa_abi.c index 3235e1c719e8..3e73efa51bba 100644 --- a/drivers/tee/optee/ffa_abi.c +++ b/drivers/tee/optee/ffa_abi.c @@ -660,7 +660,9 @@ static bool optee_ffa_api_is_compatbile(struct ffa_device *ffa_dev, const struct ffa_ops *ops) { const struct ffa_msg_ops *msg_ops = ops->msg_ops; - struct ffa_send_direct_data data = { OPTEE_FFA_GET_API_VERSION }; + struct ffa_send_direct_data data = { + .data0 = OPTEE_FFA_GET_API_VERSION, + }; int rc; msg_ops->mode_32bit_set(ffa_dev); @@ -677,7 +679,9 @@ static bool optee_ffa_api_is_compatbile(struct ffa_device *ffa_dev, return false; } - data = (struct ffa_send_direct_data){ OPTEE_FFA_GET_OS_VERSION }; + data = (struct ffa_send_direct_data){ + .data0 = OPTEE_FFA_GET_OS_VERSION, + }; rc = msg_ops->sync_send_receive(ffa_dev, &data); if (rc) { pr_err("Unexpected error %d\n", rc); @@ -698,7 +702,9 @@ 
static bool optee_ffa_exchange_caps(struct ffa_device *ffa_dev, unsigned int *rpc_param_count, unsigned int *max_notif_value) { - struct ffa_send_direct_data data = { OPTEE_FFA_EXCHANGE_CAPABILITIES }; + struct ffa_send_direct_data data = { + .data0 = OPTEE_FFA_EXCHANGE_CAPABILITIES, + }; int rc; rc = ops->msg_ops->sync_send_receive(ffa_dev, &data); diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 45f04a25255a..1b2345a697c5 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -759,6 +759,9 @@ static void power_allocator_manage(struct thermal_zone_device *tz) return; } + if (!params->trip_max) + return; + allocate_power(tz, params->trip_max->temperature); params->update_cdevs = true; } diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 1b0ab2790860..ecc748d15eb7 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -300,6 +300,8 @@ static void monitor_thermal_zone(struct thermal_zone_device *tz) thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies); else if (tz->polling_delay_jiffies) thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies); + else if (tz->temperature == THERMAL_TEMP_INVALID) + thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS)); } static struct thermal_governor *thermal_get_tz_governor(struct thermal_zone_device *tz) @@ -482,16 +484,14 @@ static void thermal_trip_crossed(struct thermal_zone_device *tz, thermal_governor_trip_crossed(governor, tz, trip, crossed_up); } -static int thermal_trip_notify_cmp(void *ascending, const struct list_head *a, +static int thermal_trip_notify_cmp(void *not_used, const struct list_head *a, const struct list_head *b) { struct thermal_trip_desc *tda = container_of(a, struct thermal_trip_desc, notify_list_node); struct thermal_trip_desc *tdb = container_of(b, struct thermal_trip_desc, notify_list_node); - int ret = tdb->notify_temp - tda->notify_temp; - - return ascending ? 
ret : -ret; + return tda->notify_temp - tdb->notify_temp; } void __thermal_zone_device_update(struct thermal_zone_device *tz, @@ -511,7 +511,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz, update_temperature(tz); if (tz->temperature == THERMAL_TEMP_INVALID) - return; + goto monitor; __thermal_zone_set_trips(tz); @@ -520,12 +520,12 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz, for_each_trip_desc(tz, td) handle_thermal_trip(tz, td, &way_up_list, &way_down_list); - list_sort(&way_up_list, &way_up_list, thermal_trip_notify_cmp); + list_sort(NULL, &way_up_list, thermal_trip_notify_cmp); list_for_each_entry(td, &way_up_list, notify_list_node) thermal_trip_crossed(tz, &td->trip, governor, true); list_sort(NULL, &way_down_list, thermal_trip_notify_cmp); - list_for_each_entry(td, &way_down_list, notify_list_node) + list_for_each_entry_reverse(td, &way_down_list, notify_list_node) thermal_trip_crossed(tz, &td->trip, governor, false); if (governor->manage) @@ -533,6 +533,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz, thermal_debug_update_trip_stats(tz); +monitor: monitor_thermal_zone(tz); } diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 66f67e54e0c8..94eeb4011a48 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -133,6 +133,12 @@ struct thermal_zone_device { struct thermal_trip_desc trips[] __counted_by(num_trips); }; +/* + * Default delay after a failing thermal zone temperature check before + * attempting to check it again. + */ +#define THERMAL_RECHECK_DELAY_MS 250 + /* Default Thermal Governor */ #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE) #define DEFAULT_THERMAL_GOVERNOR "step_wise" diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index ddac0a13cf84..1af9aed99c65 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -672,7 +672,8 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id) * https://www.ti.com/lit/pdf/sprz536 */ if (priv->habit & UART_RX_TIMEOUT_QUIRK && - (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT) { + (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT && + serial_port_in(port, UART_OMAP_RX_LVL) == 0) { unsigned char efr2, timeout_h, timeout_l; efr2 = serial_in(up, UART_OMAP_EFR2); diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index f4f40c9373c2..ff32cd2d2863 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -120,6 +120,7 @@ #define UCR4_OREN (1<<1) /* Receiver overrun interrupt enable */ #define UCR4_DREN (1<<0) /* Recv data ready interrupt enable */ #define UFCR_RXTL_SHF 0 /* Receiver trigger level shift */ +#define UFCR_RXTL_MASK 0x3F /* Receiver trigger 6 bits wide */ #define UFCR_DCEDTE (1<<6) /* DCE/DTE mode select */ #define UFCR_RFDIV (7<<7) /* Reference freq divider mask */ #define UFCR_RFDIV_REG(x) (((x) < 7 ? 6 - (x) : 6) << 7) @@ -1551,6 +1552,7 @@ static void imx_uart_shutdown(struct uart_port *port) struct imx_port *sport = (struct imx_port *)port; unsigned long flags; u32 ucr1, ucr2, ucr4, uts; + int loops; if (sport->dma_is_enabled) { dmaengine_terminate_sync(sport->dma_chan_tx); @@ -1613,6 +1615,56 @@ static void imx_uart_shutdown(struct uart_port *port) ucr4 &= ~UCR4_TCEN; imx_uart_writel(sport, ucr4, UCR4); + /* + * We have to ensure the tx state machine ends up in OFF. This + * is especially important for rs485 where we must not leave + * the RTS signal high, blocking the bus indefinitely. 
+ * + * All interrupts are now disabled, so imx_uart_stop_tx() will + * no longer be called from imx_uart_transmit_buffer(). It may + * still be called via the hrtimers, and if those are in play, + * we have to honour the delays. + */ + if (sport->tx_state == WAIT_AFTER_RTS || sport->tx_state == SEND) + imx_uart_stop_tx(port); + + /* + * In many cases (rs232 mode, or if tx_state was + * WAIT_AFTER_RTS, or if tx_state was SEND and there is no + * delay_rts_after_send), this will have moved directly to + * OFF. In rs485 mode, tx_state might already have been + * WAIT_AFTER_SEND and the hrtimer thus already started, or + * the above imx_uart_stop_tx() call could have started it. In + * those cases, we have to wait for the hrtimer to fire and + * complete the transition to OFF. + */ + loops = port->rs485.flags & SER_RS485_ENABLED ? + port->rs485.delay_rts_after_send : 0; + while (sport->tx_state != OFF && loops--) { + uart_port_unlock_irqrestore(&sport->port, flags); + msleep(1); + uart_port_lock_irqsave(&sport->port, &flags); + } + + if (sport->tx_state != OFF) { + dev_warn(sport->port.dev, "unexpected tx_state %d\n", + sport->tx_state); + /* + * This machine may be busted, but ensure the RTS + * signal is inactive in order not to block other + * devices. + */ + if (port->rs485.flags & SER_RS485_ENABLED) { + ucr2 = imx_uart_readl(sport, UCR2); + if (port->rs485.flags & SER_RS485_RTS_AFTER_SEND) + imx_uart_rts_active(sport, &ucr2); + else + imx_uart_rts_inactive(sport, &ucr2); + imx_uart_writel(sport, ucr2, UCR2); + } + sport->tx_state = OFF; + } + uart_port_unlock_irqrestore(&sport->port, flags); clk_disable_unprepare(sport->clk_per); @@ -1933,7 +1985,7 @@ static int imx_uart_rs485_config(struct uart_port *port, struct ktermios *termio struct serial_rs485 *rs485conf) { struct imx_port *sport = (struct imx_port *)port; - u32 ucr2; + u32 ucr2, ufcr; if (rs485conf->flags & SER_RS485_ENABLED) { /* Enable receiver if low-active RTS signal is requested */ @@ -1953,7 +2005,10 @@ static int imx_uart_rs485_config(struct uart_port *port, struct ktermios *termio /* Make sure Rx is enabled in case Tx is active with Rx disabled */ if (!(rs485conf->flags & SER_RS485_ENABLED) || rs485conf->flags & SER_RS485_RX_DURING_TX) { - imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT); + /* If the receiver trigger is 0, set it to a default value */ + ufcr = imx_uart_readl(sport, UFCR); + if ((ufcr & UFCR_RXTL_MASK) == 0) + imx_uart_setup_ufcr(sport, TXTL_DEFAULT, RXTL_DEFAULT); imx_uart_start_rx(port); } diff --git a/drivers/tty/serial/ma35d1_serial.c b/drivers/tty/serial/ma35d1_serial.c index 19f0a305cc43..3b4206e815fe 100644 --- a/drivers/tty/serial/ma35d1_serial.c +++ b/drivers/tty/serial/ma35d1_serial.c @@ -688,12 +688,13 @@ static int ma35d1serial_probe(struct platform_device *pdev) struct uart_ma35d1_port *up; int ret = 0; - if (pdev->dev.of_node) { - ret = of_alias_get_id(pdev->dev.of_node, "serial"); - if (ret < 0) { - dev_err(&pdev->dev, "failed to get alias/pdev id, errno %d\n", ret); - return ret; - } + if (!pdev->dev.of_node) + return -ENODEV; + + ret = of_alias_get_id(pdev->dev.of_node, "serial"); + if (ret < 0) { + dev_err(&pdev->dev, "failed to get alias/pdev id, errno %d\n", ret); + return ret; } up = &ma35d1serial_ports[ret]; up->port.line = ret; diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c index 2bd25afe0d92..69a632fefc41 100644 --- a/drivers/tty/serial/qcom_geni_serial.c +++ b/drivers/tty/serial/qcom_geni_serial.c @@ -649,15 +649,25 @@ static void 
qcom_geni_serial_start_tx_dma(struct uart_port *uport) static void qcom_geni_serial_start_tx_fifo(struct uart_port *uport) { + unsigned char c; u32 irq_en; - if (qcom_geni_serial_main_active(uport) || - !qcom_geni_serial_tx_empty(uport)) - return; + /* + * Start a new transfer in case the previous command was cancelled and + * left data in the FIFO which may prevent the watermark interrupt + * from triggering. Note that the stale data is discarded. + */ + if (!qcom_geni_serial_main_active(uport) && + !qcom_geni_serial_tx_empty(uport)) { + if (uart_fifo_out(uport, &c, 1) == 1) { + writel(M_CMD_DONE_EN, uport->membase + SE_GENI_M_IRQ_CLEAR); + qcom_geni_serial_setup_tx(uport, 1); + writel(c, uport->membase + SE_GENI_TX_FIFOn); + } + } irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN); irq_en |= M_TX_FIFO_WATERMARK_EN | M_CMD_DONE_EN; - writel(DEF_TX_WM, uport->membase + SE_GENI_TX_WATERMARK_REG); writel(irq_en, uport->membase + SE_GENI_M_IRQ_EN); } @@ -665,13 +675,17 @@ static void qcom_geni_serial_start_tx_fifo(struct uart_port *uport) static void qcom_geni_serial_stop_tx_fifo(struct uart_port *uport) { u32 irq_en; - struct qcom_geni_serial_port *port = to_dev_port(uport); irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN); irq_en &= ~(M_CMD_DONE_EN | M_TX_FIFO_WATERMARK_EN); writel(0, uport->membase + SE_GENI_TX_WATERMARK_REG); writel(irq_en, uport->membase + SE_GENI_M_IRQ_EN); - /* Possible stop tx is called multiple times. */ +} + +static void qcom_geni_serial_cancel_tx_cmd(struct uart_port *uport) +{ + struct qcom_geni_serial_port *port = to_dev_port(uport); + if (!qcom_geni_serial_main_active(uport)) return; @@ -684,6 +698,8 @@ static void qcom_geni_serial_stop_tx_fifo(struct uart_port *uport) writel(M_CMD_ABORT_EN, uport->membase + SE_GENI_M_IRQ_CLEAR); } writel(M_CMD_CANCEL_EN, uport->membase + SE_GENI_M_IRQ_CLEAR); + + port->tx_remaining = 0; } static void qcom_geni_serial_handle_rx_fifo(struct uart_port *uport, bool drop) @@ -862,7 +878,7 @@ static void qcom_geni_serial_send_chunk_fifo(struct uart_port *uport, memset(buf, 0, sizeof(buf)); tx_bytes = min(remaining, BYTES_PER_FIFO_WORD); - tx_bytes = uart_fifo_out(uport, buf, tx_bytes); + uart_fifo_out(uport, buf, tx_bytes); iowrite32_rep(uport->membase + SE_GENI_TX_FIFOn, buf, 1); @@ -890,13 +906,17 @@ static void qcom_geni_serial_handle_tx_fifo(struct uart_port *uport, else pending = kfifo_len(&tport->xmit_fifo); - /* All data has been transmitted and acknowledged as received */ - if (!pending && !status && done) { + /* All data has been transmitted or command has been cancelled */ + if (!pending && done) { qcom_geni_serial_stop_tx_fifo(uport); goto out_write_wakeup; } - avail = port->tx_fifo_depth - (status & TX_FIFO_WC); + if (active) + avail = port->tx_fifo_depth - (status & TX_FIFO_WC); + else + avail = port->tx_fifo_depth; + avail *= BYTES_PER_FIFO_WORD; chunk = min(avail, pending); @@ -1069,11 +1089,15 @@ static void qcom_geni_serial_shutdown(struct uart_port *uport) { disable_irq(uport->irq); - if (uart_console(uport)) - return; - qcom_geni_serial_stop_tx(uport); qcom_geni_serial_stop_rx(uport); + + qcom_geni_serial_cancel_tx_cmd(uport); +} + +static void qcom_geni_serial_flush_buffer(struct uart_port *uport) +{ + qcom_geni_serial_cancel_tx_cmd(uport); } static int qcom_geni_serial_port_setup(struct uart_port *uport) @@ -1532,6 +1556,7 @@ static const struct uart_ops qcom_geni_console_pops = { .request_port = qcom_geni_serial_request_port, .config_port = qcom_geni_serial_config_port, .shutdown = qcom_geni_serial_shutdown, + 
.flush_buffer = qcom_geni_serial_flush_buffer, .type = qcom_geni_serial_get_type, .set_mctrl = qcom_geni_serial_set_mctrl, .get_mctrl = qcom_geni_serial_get_mctrl, diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 8944548c30fa..c532416aec22 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -105,16 +105,15 @@ EXPORT_SYMBOL_GPL(ufshcd_mcq_config_mac); * @hba: per adapter instance * @req: pointer to the request to be issued * - * Return: the hardware queue instance on which the request would - * be queued. + * Return: the hardware queue instance on which the request will be or has + * been queued. %NULL if the request has already been freed. */ struct ufs_hw_queue *ufshcd_mcq_req_to_hwq(struct ufs_hba *hba, struct request *req) { - u32 utag = blk_mq_unique_tag(req); - u32 hwq = blk_mq_unique_tag_to_hwq(utag); + struct blk_mq_hw_ctx *hctx = READ_ONCE(req->mq_hctx); - return &hba->uhq[hwq]; + return hctx ? &hba->uhq[hctx->queue_num] : NULL; } /** @@ -515,6 +514,8 @@ int ufshcd_mcq_sq_cleanup(struct ufs_hba *hba, int task_tag) if (!cmd) return -EINVAL; hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(cmd)); + if (!hwq) + return 0; } else { hwq = hba->dev_cmd_queue; } diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 1b65e6ae4137..702fa1996992 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -5193,17 +5193,19 @@ static int ufshcd_change_queue_depth(struct scsi_device *sdev, int depth) } /** - * ufshcd_slave_configure - adjust SCSI device configurations + * ufshcd_device_configure - adjust SCSI device configurations * @sdev: pointer to SCSI device + * @lim: queue limits * * Return: 0 (success). */ -static int ufshcd_slave_configure(struct scsi_device *sdev) +static int ufshcd_device_configure(struct scsi_device *sdev, + struct queue_limits *lim) { struct ufs_hba *hba = shost_priv(sdev->host); struct request_queue *q = sdev->request_queue; - blk_queue_update_dma_pad(q, PRDT_DATA_BYTE_COUNT_PAD - 1); + lim->dma_pad_mask = PRDT_DATA_BYTE_COUNT_PAD - 1; /* * Block runtime-pm until all consumers are added. 
@@ -6456,6 +6458,8 @@ static bool ufshcd_abort_one(struct request *rq, void *priv) /* Release cmd in MCQ mode if abort succeeds */ if (is_mcq_enabled(hba) && (*ret == 0)) { hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(lrbp->cmd)); + if (!hwq) + return 0; spin_lock_irqsave(&hwq->cq_lock, flags); if (ufshcd_cmd_inflight(lrbp->cmd)) ufshcd_release_scsi_cmd(hba, lrbp); @@ -8908,7 +8912,7 @@ static const struct scsi_host_template ufshcd_driver_template = { .queuecommand = ufshcd_queuecommand, .mq_poll = ufshcd_poll, .slave_alloc = ufshcd_slave_alloc, - .slave_configure = ufshcd_slave_configure, + .device_configure = ufshcd_device_configure, .slave_destroy = ufshcd_slave_destroy, .change_queue_depth = ufshcd_change_queue_depth, .eh_abort_handler = ufshcd_abort, diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 3362af165ef5..880d52c0949d 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -291,6 +291,20 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, if (ifp->desc.bNumEndpoints >= num_ep) goto skip_to_next_endpoint_or_interface_descriptor; + /* Save a copy of the descriptor and use it instead of the original */ + endpoint = &ifp->endpoint[ifp->desc.bNumEndpoints]; + memcpy(&endpoint->desc, d, n); + d = &endpoint->desc; + + /* Clear the reserved bits in bEndpointAddress */ + i = d->bEndpointAddress & + (USB_ENDPOINT_DIR_MASK | USB_ENDPOINT_NUMBER_MASK); + if (i != d->bEndpointAddress) { + dev_notice(ddev, "config %d interface %d altsetting %d has an endpoint descriptor with address 0x%X, changing to 0x%X\n", + cfgno, inum, asnum, d->bEndpointAddress, i); + endpoint->desc.bEndpointAddress = i; + } + /* Check for duplicate endpoint addresses */ if (config_endpoint_is_duplicate(config, inum, asnum, d)) { dev_notice(ddev, "config %d interface %d altsetting %d has a duplicate endpoint with address 0x%X, skipping\n", @@ -308,10 +322,8 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, } } - endpoint = &ifp->endpoint[ifp->desc.bNumEndpoints]; + /* Accept this endpoint */ ++ifp->desc.bNumEndpoints; - - memcpy(&endpoint->desc, d, n); INIT_LIST_HEAD(&endpoint->urb_list); /* diff --git a/drivers/usb/core/of.c b/drivers/usb/core/of.c index f1a499ee482c..763e4122ed5b 100644 --- a/drivers/usb/core/of.c +++ b/drivers/usb/core/of.c @@ -84,9 +84,12 @@ static bool usb_of_has_devices_or_graph(const struct usb_device *hub) if (of_graph_is_present(np)) return true; - for_each_child_of_node(np, child) - if (of_property_present(child, "reg")) + for_each_child_of_node(np, child) { + if (of_property_present(child, "reg")) { + of_node_put(child); return true; + } + } return false; } diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index b4783574b8e6..13171454f959 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -506,6 +506,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1b1c, 0x1b38), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, + /* START BP-850k Printer */ + { USB_DEVICE(0x1bc3, 0x0003), .driver_info = USB_QUIRK_NO_SET_INTF }, + /* MIDI keyboard WORLDE MINI */ { USB_DEVICE(0x1c75, 0x0204), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index 9ef821ca2fc7..052852f80146 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -54,6 +54,10 @@ #define PCI_DEVICE_ID_INTEL_MTL 0x7e7e #define PCI_DEVICE_ID_INTEL_ARLH_PCH 0x777e #define PCI_DEVICE_ID_INTEL_TGL 0x9a15 
+#define PCI_DEVICE_ID_INTEL_PTLH 0xe332 +#define PCI_DEVICE_ID_INTEL_PTLH_PCH 0xe37e +#define PCI_DEVICE_ID_INTEL_PTLU 0xe432 +#define PCI_DEVICE_ID_INTEL_PTLU_PCH 0xe47e #define PCI_DEVICE_ID_AMD_MR 0x163a #define PCI_INTEL_BXT_DSM_GUID "732b85d5-b7a7-4a1b-9ba0-4bbd00ffd511" @@ -430,6 +434,10 @@ static const struct pci_device_id dwc3_pci_id_table[] = { { PCI_DEVICE_DATA(INTEL, MTLS, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, ARLH_PCH, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, TGL, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, PTLH, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, PTLH_PCH, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, PTLU, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, PTLU_PCH, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(AMD, NL_USB, &dwc3_pci_amd_swnode) }, { PCI_DEVICE_DATA(AMD, MR, &dwc3_pci_amd_mr_swnode) }, diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index ce3cfa1f36f5..0e7c1e947c0a 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -115,9 +115,12 @@ static int usb_string_copy(const char *s, char **s_copy) int ret; char *str; char *copy = *s_copy; + ret = strlen(s); if (ret > USB_MAX_STRING_LEN) return -EOVERFLOW; + if (ret < 1) + return -EINVAL; if (copy) { str = copy; diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 37eb37b0affa..0a8cf6c17f82 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -1125,10 +1125,20 @@ int xhci_resume(struct xhci_hcd *xhci, pm_message_t msg) xhci_dbg(xhci, "Start the secondary HCD\n"); retval = xhci_run(xhci->shared_hcd); } - + if (retval) + return retval; + /* + * Resume roothubs unconditionally as PORTSC change bits are not + * immediately visible after xHC reset + */ hcd->state = HC_STATE_SUSPENDED; - if (xhci->shared_hcd) + + if (xhci->shared_hcd) { xhci->shared_hcd->state = HC_STATE_SUSPENDED; + usb_hcd_resume_root_hub(xhci->shared_hcd); + } + usb_hcd_resume_root_hub(hcd); + goto done; } @@ -1152,7 +1162,6 @@ int xhci_resume(struct xhci_hcd *xhci, pm_message_t msg) xhci_dbc_resume(xhci); - done: if (retval == 0) { /* * Resume roothubs only if there are pending events. @@ -1178,6 +1187,7 @@ int xhci_resume(struct xhci_hcd *xhci, pm_message_t msg) usb_hcd_resume_root_hub(hcd); } } +done: /* * If system is subject to the Quirk, Compliance Mode Timer needs to * be re-initialized Always after a system resume. 
Ports are subject diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 8b0308d84270..85697466b147 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -1737,6 +1737,49 @@ static void mos7840_port_remove(struct usb_serial_port *port) kfree(mos7840_port); } +static int mos7840_suspend(struct usb_serial *serial, pm_message_t message) +{ + struct moschip_port *mos7840_port; + struct usb_serial_port *port; + int i; + + for (i = 0; i < serial->num_ports; ++i) { + port = serial->port[i]; + if (!tty_port_initialized(&port->port)) + continue; + + mos7840_port = usb_get_serial_port_data(port); + + usb_kill_urb(mos7840_port->read_urb); + mos7840_port->read_urb_busy = false; + } + + return 0; +} + +static int mos7840_resume(struct usb_serial *serial) +{ + struct moschip_port *mos7840_port; + struct usb_serial_port *port; + int res; + int i; + + for (i = 0; i < serial->num_ports; ++i) { + port = serial->port[i]; + if (!tty_port_initialized(&port->port)) + continue; + + mos7840_port = usb_get_serial_port_data(port); + + mos7840_port->read_urb_busy = true; + res = usb_submit_urb(mos7840_port->read_urb, GFP_NOIO); + if (res) + mos7840_port->read_urb_busy = false; + } + + return 0; +} + static struct usb_serial_driver moschip7840_4port_device = { .driver = { .owner = THIS_MODULE, @@ -1764,6 +1807,8 @@ static struct usb_serial_driver moschip7840_4port_device = { .port_probe = mos7840_port_probe, .port_remove = mos7840_port_remove, .read_bulk_callback = mos7840_bulk_in_callback, + .suspend = mos7840_suspend, + .resume = mos7840_resume, }; static struct usb_serial_driver * const serial_drivers[] = { diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 8a5846d4adf6..311040f9b935 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1425,6 +1425,10 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1901, 0xff), /* Telit LN940 (MBIM) */ .driver_info = NCTRL(0) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x3000, 0xff), /* Telit FN912 */ + .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x3001, 0xff), /* Telit FN912 */ + .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x7010, 0xff), /* Telit LE910-S1 (RNDIS) */ .driver_info = NCTRL(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x7011, 0xff), /* Telit LE910-S1 (ECM) */ @@ -1433,6 +1437,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x701b, 0xff), /* Telit LE910R1 (ECM) */ .driver_info = NCTRL(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x9000, 0xff), /* Telit generic core-dump device */ + .driver_info = NCTRL(0) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x9010), /* Telit SBL FN980 flashing device */ .driver_info = NCTRL(0) | ZLP }, { USB_DEVICE(TELIT_VENDOR_ID, 0x9200), /* Telit LE910S1 flashing device */ @@ -2224,6 +2230,10 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_7106_2COM, 0x02, 0x02, 0x01) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM2, 0xff, 0x02, 0x01) }, { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, MEDIATEK_PRODUCT_DC_4COM2, 0xff, 0x00, 0x00) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7126, 0xff, 0x00, 0x00), + .driver_info = NCTRL(2) }, + { 
USB_DEVICE_AND_INTERFACE_INFO(MEDIATEK_VENDOR_ID, 0x7127, 0xff, 0x00, 0x00), + .driver_info = NCTRL(2) | NCTRL(3) | NCTRL(4) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) }, { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MPL200), .driver_info = RSVD(1) | RSVD(4) }, @@ -2284,6 +2294,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe0f0, 0xff), /* Foxconn T99W373 MBIM */ .driver_info = RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(0x0489, 0xe145, 0xff), /* Foxconn T99W651 RNDIS */ + .driver_info = RSVD(5) | RSVD(6) }, { USB_DEVICE(0x1508, 0x1001), /* Fibocom NL668 (IOT version) */ .driver_info = RSVD(4) | RSVD(5) | RSVD(6) }, { USB_DEVICE(0x1782, 0x4d10) }, /* Fibocom L610 (AT mode) */ @@ -2321,6 +2333,32 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(4) }, { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0115, 0xff), /* Rolling RW135-GL (laptop MBIM) */ .driver_info = RSVD(5) }, + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0802, 0xff), /* Rolling RW350-GL (laptop MBIM) */ + .driver_info = RSVD(5) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0100, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for Global */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0100, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0100, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0101, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for Global SKU */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0101, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0101, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0106, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WRD for China SKU */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0106, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0106, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0111, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for SA */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0111, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0111, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0112, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for EU */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0112, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0112, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0113, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for NA */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0113, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0113, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0115, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for China EDU */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0115, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0115, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for Global EDU */ + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0116, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(OPPO_VENDOR_ID, OPPO_PRODUCT_R11, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) }, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 987c7921affa..ba0ce0075b2f 100644 ---
a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1260,7 +1260,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( struct vfio_pci_hot_reset_info hdr; struct vfio_pci_fill_info fill = {}; bool slot = false; - int ret, count; + int ret, count = 0; if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 15bb7989c387..3acf5e050072 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -512,7 +512,7 @@ static int afs_iget5_set_root(struct inode *inode, void *opaque) struct afs_vnode *vnode = AFS_FS_I(inode); vnode->volume = as->volume; - vnode->fid.vid = as->volume->vid, + vnode->fid.vid = as->volume->vid; vnode->fid.vnode = 1; vnode->fid.unique = 1; inode->i_ino = 1; @@ -545,7 +545,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) BUG_ON(!(inode->i_state & I_NEW)); vnode = AFS_FS_I(inode); - vnode->cb_v_check = atomic_read(&as->volume->cb_v_break), + vnode->cb_v_check = atomic_read(&as->volume->cb_v_break); afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); @@ -1516,7 +1516,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res) iocb_put(iocb); } -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; @@ -1542,7 +1542,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) } else req->ki_ioprio = get_current_ioprio(); - ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; @@ -1594,7 +1594,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; @@ -1621,7 +1621,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; diff --git a/fs/attr.c b/fs/attr.c index 960a310581eb..825007d5cda4 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -17,8 +17,6 @@ #include <linux/filelock.h> #include <linux/security.h> -#include "internal.h" - /** * setattr_should_drop_sgid - determine whether the setgid bit needs to be * removed diff --git a/fs/autofs/init.c b/fs/autofs/init.c index b5e4dfa04ed0..1d644a35ffa0 100644 --- a/fs/autofs/init.c +++ b/fs/autofs/init.c @@ -38,4 +38,5 @@ static void __exit exit_autofs_fs(void) module_init(init_autofs_fs) module_exit(exit_autofs_fs) +MODULE_DESCRIPTION("Kernel automounter support"); MODULE_LICENSE("GPL"); diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 1f5db6863663..cf792d4de4f1 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -126,7 +126,7 @@ enum { const struct fs_parameter_spec autofs_param_specs[] = { fsparam_flag ("direct", Opt_direct), fsparam_fd ("fd", Opt_fd), - fsparam_u32 ("gid", Opt_gid), + fsparam_gid ("gid", Opt_gid), fsparam_flag ("ignore", Opt_ignore), fsparam_flag ("indirect", Opt_indirect), fsparam_u32 ("maxproto", Opt_maxproto), @@ -134,7 +134,7 @@ const struct fs_parameter_spec autofs_param_specs[] = { fsparam_flag ("offset", Opt_offset), fsparam_u32 ("pgrp", Opt_pgrp), fsparam_flag ("strictexpire", Opt_strictexpire), - fsparam_u32 ("uid", Opt_uid), + fsparam_uid ("uid", Opt_uid), {} }; @@ -193,8 +193,6 @@ static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param) struct 
autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; - kuid_t uid; - kgid_t gid; int opt; opt = fs_parse(fc, autofs_param_specs, param, &result); @@ -205,16 +203,10 @@ static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_fd: return autofs_parse_fd(fc, sbi, param, &result); case Opt_uid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) - return invalfc(fc, "Invalid uid"); - ctx->uid = uid; + ctx->uid = result.uid; break; case Opt_gid: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) - return invalfc(fc, "Invalid gid"); - ctx->gid = gid; + ctx->gid = result.gid; break; case Opt_pgrp: ctx->pgrp = result.uint_32; diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1de9fac3bcf4..658f11aebda1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -3,6 +3,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_key_cache.h" @@ -1553,13 +1554,13 @@ err: } static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - struct btree_iter *alloc_iter) + struct btree_iter *alloc_iter, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter lru_iter; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - struct bkey_s_c alloc_k, lru_k; + struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret; @@ -1573,6 +1574,14 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); + if (a->fragmentation_lru) { + ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, + a->fragmentation_lru, + alloc_k, last_flushed); + if (ret) + return ret; + } + if (a->data_type != BCH_DATA_cached) return 0; @@ -1597,41 +1606,30 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = &a_mut->v; } - lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ]), 0); - ret = bkey_err(lru_k); + ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], + alloc_k, last_flushed); if (ret) - return ret; - - if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, - alloc_key_to_missing_lru_entry, - "missing lru entry\n" - " %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = bch2_lru_set(trans, - alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ]); - if (ret) - goto err; - } + goto err; err: fsck_err: - bch2_trans_iter_exit(trans, &lru_iter); printbuf_exit(&buf); return ret; } int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { + struct bkey_buf last_flushed; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter))); + bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 9d3d64746a5b..27d97c22ae27 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1703,6 +1703,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs 
*c) for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; + printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 24); percpu_down_read(&c->mark_lock); @@ -1736,6 +1737,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; + printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 12); printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 4321f9fb73bd..6d8b1bc90be0 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -434,13 +434,6 @@ int bch2_check_btree_backpointers(struct bch_fs *c) return ret; } -static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) -{ - return bpos_eq(l.k->p, r.k->p) && - bkey_bytes(l.k) == bkey_bytes(r.k) && - !memcmp(l.v, r.v, bkey_val_bytes(l.k)); -} - struct extents_to_bp_state { struct bpos bucket_start; struct bpos bucket_end; @@ -536,11 +529,8 @@ static int check_bp_exists(struct btree_trans *trans, struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; - struct bkey_buf tmp; int ret = 0; - bch2_bkey_buf_init(&tmp); - struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); if (!ca) { prt_str(&buf, "extent for nonexistent device:bucket "); @@ -565,22 +555,9 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - bch2_bkey_buf_reassemble(&tmp, c, orig_k); - - if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) { - if (bp.level) { - bch2_trans_unlock(trans); - bch2_btree_interior_updates_flush(c); - } - - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k); - ret = -BCH_ERR_transaction_restart_write_buffer_flush; - goto out; - } + ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); + if (ret) + goto err; goto check_existing_bp; } @@ -589,7 +566,6 @@ err: fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); - bch2_bkey_buf_exit(&tmp, c); bch2_dev_put(ca); printbuf_exit(&buf); return ret; @@ -794,6 +770,8 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, !((1U << btree) & btree_interior_mask)) continue; + bch2_trans_begin(trans); + __for_each_btree_node(trans, iter, btree, btree == start.btree ? start.pos : POS_MIN, 0, depth, BTREE_ITER_prefetch, b, ret) { @@ -905,7 +883,7 @@ static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, struct bkey_s_c_backpointer bp, - struct bpos *last_flushed_pos) + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -925,20 +903,18 @@ static int check_one_backpointer(struct btree_trans *trans, if (ret) return ret; - if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { - *last_flushed_pos = bp.k->p; - ret = bch2_btree_write_buffer_flush_sync(trans) ?: - -BCH_ERR_transaction_restart_write_buffer_flush; - goto out; - } + if (!k.k) { + ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed); + if (ret) + goto out; - if (fsck_err_on(!k.k, c, - backpointer_to_missing_ptr, - "backpointer for missing %s\n %s", - bp.v->level ? 
"btree node" : "extent", - (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); - goto out; + if (fsck_err(c, backpointer_to_missing_ptr, + "backpointer for missing %s\n %s", + bp.v->level ? "btree node" : "extent", + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { + ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + goto out; + } } out: fsck_err: @@ -951,14 +927,20 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { - struct bpos last_flushed_pos = SPOS_MAX; + struct bkey_buf last_flushed; - return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), - &last_flushed_pos)); + &last_flushed)); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + return ret; } int bch2_check_backpointers_to_extents(struct bch_fs *c) diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 94a1d1982fa8..587d7318a2e8 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -660,8 +660,9 @@ int bch2_bkey_format_invalid(struct bch_fs *c, bch2_bkey_format_field_overflows(f, i)) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + unsigned packed_bits = min(64, f->bits_per_field[i]); + u64 packed_max = packed_bits + ? ~((~0ULL << 1) << (packed_bits - 1)) : 0; prt_printf(err, "field %u too large: %llu + %llu > %llu", diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index fcd43915df07..936357149cf0 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -194,6 +194,13 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) return bkey_gt(l, r) ? 
l : r; } +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return bpos_eq(l.k->p, r.k->p) && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0e477a926579..a0deb8266011 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -641,16 +641,30 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in target_depth = 0; /* root */ - mutex_lock(&c->btree_root_lock); - struct btree *b = bch2_btree_id_root(c, btree)->b; - if (!btree_node_fake(b)) { + do { +retry_root: + bch2_trans_begin(trans); + + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, + 0, bch2_btree_id_root(c, btree)->b->c.level, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err_root; + + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } + gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX)); - ret = lockrestart_do(trans, - bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, - NULL, NULL, bkey_i_to_s_c(&b->key), initial)); + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial); level = b->c.level; - } - mutex_unlock(&c->btree_root_lock); +err_root: + bch2_trans_iter_exit(trans, &iter); + } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); if (ret) return ret; @@ -903,6 +917,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true); percpu_up_read(&c->mark_lock); + gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca); + if (fsck_err_on(new.data_type != gc.data_type, c, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" @@ -916,23 +932,19 @@ static int bch2_alloc_write_key(struct btree_trans *trans, #define copy_bucket_field(_errtype, _f) \ if (fsck_err_on(new._f != gc._f, c, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", \ + ": got %llu, should be %llu", \ iter->pos.inode, iter->pos.offset, \ gc.gen, \ bch2_data_type_str(gc.data_type), \ - new._f, gc._f)) \ + (u64) new._f, (u64) gc._f)) \ new._f = gc._f; \ - copy_bucket_field(alloc_key_gen_wrong, - gen); - copy_bucket_field(alloc_key_dirty_sectors_wrong, - dirty_sectors); - copy_bucket_field(alloc_key_cached_sectors_wrong, - cached_sectors); - copy_bucket_field(alloc_key_stripe_wrong, - stripe); - copy_bucket_field(alloc_key_stripe_redundancy_wrong, - stripe_redundancy); + copy_bucket_field(alloc_key_gen_wrong, gen); + copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors); + copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors); + copy_bucket_field(alloc_key_stripe_wrong, stripe); + copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy); + copy_bucket_field(alloc_key_fragmentation_lru_wrong, fragmentation_lru); #undef copy_bucket_field if (!bch2_alloc_v4_cmp(*old, new)) @@ -946,7 +958,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, a->v = new; /* - * The trigger normally makes sure this is set, but we're not running + * The trigger normally makes sure these are set, but we're not running * triggers: */ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) diff --git 
a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0ed9e6574fcd..19352a08ea20 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -996,7 +996,7 @@ retry_all: bch2_trans_unlock(trans); cond_resched(); - trans->locked = true; + trans_set_locked(trans); if (unlikely(trans->memory_allocation_failure)) { struct closure cl; @@ -3089,7 +3089,8 @@ u32 bch2_trans_begin(struct btree_trans *trans) bch2_trans_srcu_unlock(trans); trans->last_begin_ip = _RET_IP_; - trans->locked = true; + + trans_set_locked(trans); if (trans->restarted) { bch2_btree_path_traverse_all(trans); @@ -3159,7 +3160,6 @@ got_trans: trans->last_begin_time = local_clock(); trans->fn_idx = fn_idx; trans->locking_wait.task = current; - trans->locked = true; trans->journal_replay_not_finished = unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) && atomic_inc_not_zero(&c->journal_keys.ref); @@ -3193,6 +3193,7 @@ got_trans: trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; + trans_set_locked(trans); closure_init_stack_release(&trans->ref); return trans; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index d66fff22109a..c51826fd557f 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -231,7 +231,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) prt_newline(&buf); } - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); printbuf_exit(&buf); BUG(); } @@ -792,7 +792,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) return bch2_trans_relock_fail(trans, path, &f, trace); } - trans->locked = true; + trans_set_locked(trans); out: bch2_trans_verify_locks(trans); return 0; @@ -812,16 +812,14 @@ void bch2_trans_unlock_noassert(struct btree_trans *trans) { __bch2_trans_unlock(trans); - trans->locked = false; - trans->last_unlock_ip = _RET_IP_; + trans_set_unlocked(trans); } void bch2_trans_unlock(struct btree_trans *trans) { __bch2_trans_unlock(trans); - trans->locked = false; - trans->last_unlock_ip = _RET_IP_; + trans_set_unlocked(trans); } void bch2_trans_unlock_long(struct btree_trans *trans) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 7f41545b9147..75a6274c7d27 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -193,6 +193,28 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); /* lock: */ +static inline void trans_set_locked(struct btree_trans *trans) +{ + if (!trans->locked) { + trans->locked = true; + trans->last_unlock_ip = 0; + + trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0; + current->flags |= PF_MEMALLOC_NOFS; + } +} + +static inline void trans_set_unlocked(struct btree_trans *trans) +{ + if (trans->locked) { + trans->locked = false; + trans->last_unlock_ip = _RET_IP_; + + if (!trans->pf_memalloc_nofs) + current->flags &= ~PF_MEMALLOC_NOFS; + } +} + static inline int __btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 87f485e9c552..48cb1a7d31c5 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -484,6 +484,7 @@ struct btree_trans { bool lock_may_not_fail:1; bool srcu_held:1; bool locked:1; + bool pf_memalloc_nofs:1; bool write_locked:1; bool used_mempool:1; bool in_traverse_all:1; diff --git 
a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 75c8a196b3f6..d0e92d948002 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_locking.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "error.h" +#include "extents.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -492,6 +494,41 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) return ret; } +/** + * In check and repair code, when checking references to write buffer btrees we + * need to issue a flush before we have a definitive error: this issues a flush + * if this is a key we haven't yet checked. + */ +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, + struct bkey_s_c referring_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bkey_buf tmp; + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + bch2_bkey_buf_reassemble(&tmp, c, referring_k); + + if (bkey_is_btree_ptr(referring_k.k)) { + bch2_trans_unlock(trans); + bch2_btree_interior_updates_flush(c); + } + + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + + bch2_bkey_buf_copy(last_flushed, c, tmp.k); + ret = -BCH_ERR_transaction_restart_write_buffer_flush; + } +err: + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index eebcd2b15249..dd5e64218b50 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -23,6 +23,9 @@ int bch2_btree_write_buffer_flush_sync(struct btree_trans *); int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); +struct bkey_buf; +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *); + struct journal_keys_to_wb { struct btree_write_buffer_keys *wb; size_t room; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 743d57eba760..314ee3e0187f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -805,7 +805,7 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - *bucket_gen(ca, bucket_nr), + bucket_gen_get(ca, bucket_nr), bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 80ee0be9793e..8ad4be73860c 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -116,6 +116,14 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) return gens->b + b; } +static inline u8 bucket_gen_get(struct bch_dev *ca, size_t b) +{ + rcu_read_lock(); + u8 gen = *bucket_gen(ca, b); + rcu_read_unlock(); + return gen; +} + static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, const struct bch_extent_ptr *ptr) { diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 363644451106..0f40b585ce2b 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -132,14 +132,9 @@ static struct io_timer 
*get_expired_timer(struct io_clock *clock, { struct io_timer *ret = NULL; - spin_lock(&clock->timer_lock); - if (clock->timers.used && time_after_eq(now, clock->timers.data[0]->expire)) heap_pop(&clock->timers, ret, io_timer_cmp, NULL); - - spin_unlock(&clock->timer_lock); - return ret; } @@ -148,8 +143,10 @@ void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) struct io_timer *timer; unsigned long now = atomic64_add_return(sectors, &clock->now); + spin_lock(&clock->timer_lock); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); + spin_unlock(&clock->timer_lock); } void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 1a0072eef109..0087b8555ead 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -5,7 +5,9 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "compress.h" #include "data_update.h" +#include "disk_groups.h" #include "ec.h" #include "error.h" #include "extents.h" @@ -454,6 +456,38 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, } } +void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + printbuf_tabstop_push(out, 20); + prt_str(out, "rewrite ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + + prt_str(out, "kill ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + + prt_str(out, "target:\t"); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + + prt_str(out, "compression:\t"); + bch2_compression_opt_to_text(out, background_compression(*io_opts)); + prt_newline(out); + + prt_str(out, "extra replicas:\t"); + prt_u64(out, data_opts->extra_replicas); +} + +void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) +{ + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + prt_newline(out); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); +} + int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -643,6 +677,16 @@ int bch2_data_update_init(struct btree_trans *trans, if (!(durability_have + durability_removing)) m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); + if (!m->op.nr_replicas) { + struct printbuf buf = PRINTBUF; + + bch2_data_update_to_text(&buf, m); + WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_data_update_done; + goto done; + } + m->op.nr_replicas_required = m->op.nr_replicas; if (reserve_sectors) { diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 991095bbd469..8d36365bdea8 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -17,6 +17,9 @@ struct data_update_opts { unsigned write_flags; }; +void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, + struct bch_io_opts *, struct data_update_opts *); + struct data_update { /* extent being updated: */ enum btree_id btree_id; @@ -27,6 +30,8 @@ struct data_update { struct bch_write_op op; }; +void bch2_data_update_to_text(struct printbuf *, struct data_update *); + int bch2_data_update_index_update(struct bch_write_op *); void bch2_data_update_read_done(struct data_update *, diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index f0d4727c4dc2..ebabab171fe5 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -610,7 
+610,7 @@ restart: list_sort(&c->btree_trans_list, list_ptr_order_cmp); list_for_each_entry(trans, &c->btree_trans_list, list) { - if ((ulong) trans < i->iter) + if ((ulong) trans <= i->iter) continue; i->iter = (ulong) trans; @@ -832,16 +832,16 @@ static const struct file_operations btree_transaction_stats_op = { static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans *trans; - pid_t iter = 0; + ulong iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans <= iter) continue; - iter = task->pid; + iter = (ulong) trans; if (!closure_get_not_zero(&trans->ref)) continue; diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 24840aee335c..795f4fc0bab1 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -48,7 +48,7 @@ static inline unsigned eytzinger1_right_child(unsigned i) static inline unsigned eytzinger1_first(unsigned size) { - return rounddown_pow_of_two(size); + return size ? rounddown_pow_of_two(size) : 0; } static inline unsigned eytzinger1_last(unsigned size) @@ -101,7 +101,9 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size) static inline unsigned eytzinger1_extra(unsigned size) { - return (size + 1 - rounddown_pow_of_two(size)) << 1; + return size + ? (size + 1 - rounddown_pow_of_two(size)) << 1 + : 0; } static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f9c9a95d7d4c..0c7d1bc0548a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -194,6 +194,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino * discard_new_inode() expects it to be set... */ inode->v.i_flags |= I_NEW; + /* + * We don't want bch2_evict_inode() to delete the inode on disk, + * we just raced and had another inode in cache. Normally new + * inodes don't have nlink == 0 - except tmpfiles do... 
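The two debugfs hunks above (btree_transactions and btree_deadlock) both switch to a pointer-valued cursor: the trans list is sorted by address and entries at or below the cursor are skipped, so the walk can be resumed after the seqmutex is dropped. A minimal sketch of that resume-by-address idea, with generic names that are illustrative only (not the bcachefs types):

#include <linux/list.h>

struct item {				/* hypothetical list element */
	struct list_head link;
};

/*
 * Return the first entry of @head whose address is greater than *@cursor and
 * advance the cursor past it. @head is assumed to be kept sorted by address
 * (see the list_sort() with a pointer-order compare in the hunk above), so
 * the caller can drop its lock between calls and still make forward progress.
 */
static struct item *walk_resume(struct list_head *head, unsigned long *cursor)
{
	struct item *i;

	list_for_each_entry(i, head, link) {
		if ((unsigned long) i <= *cursor)
			continue;
		*cursor = (unsigned long) i;
		return i;
	}
	return NULL;
}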
+ */ + set_nlink(&inode->v, 1); discard_new_inode(&inode->v); inode = old; } else { @@ -238,7 +244,6 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) inode->ei_flags = 0; mutex_init(&inode->ei_quota_lock); memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - inode->v.i_state = 0; if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { kmem_cache_free(bch2_inode_cache, inode); @@ -2026,6 +2031,8 @@ err_put_super: __bch2_fs_stop(c); deactivate_locked_super(sb); err: + if (ret) + pr_err("error: %s", bch2_err_str(ret)); /* * On an inconsistency error in recovery we might see an -EROFS derived * errorcode (from the journal), but we don't want to return that to diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 4ec979b4b23e..4583c9386e8c 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -125,7 +125,7 @@ err_noprint: bch2_bkey_buf_exit(&old, c); if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock(trans); + bch2_trans_unlock_long(trans); closure_sync(&cl); } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index c97fa7002b06..ebf39ef72fb2 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -389,7 +389,6 @@ retry: bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(trans); if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, @@ -1004,6 +1003,9 @@ get_bio: rbio->promote = promote; INIT_WORK(&rbio->work, NULL); + if (flags & BCH_READ_NODECODE) + orig->pick = pick; + rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 13669dd0e375..10b19791ec98 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1095,7 +1095,7 @@ unlock: return ret; } -int bch2_dev_journal_alloc(struct bch_dev *ca) +int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { unsigned nr; int ret; @@ -1117,7 +1117,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL); err: bch_err_fn(ca, ret); return ret; @@ -1129,7 +1129,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c) if (ca->journal.nr) continue; - int ret = bch2_dev_journal_alloc(ca); + int ret = bch2_dev_journal_alloc(ca, true); if (ret) { percpu_ref_put(&ca->io_ref); return ret; @@ -1184,9 +1184,11 @@ void bch2_fs_journal_stop(struct journal *j) journal_quiesce(j); cancel_delayed_work_sync(&j->write_work); - BUG_ON(!bch2_journal_error(j) && - test_bit(JOURNAL_replay_done, &j->flags) && - j->last_empty_seq != journal_cur_seq(j)); + WARN(!bch2_journal_error(j) && + test_bit(JOURNAL_replay_done, &j->flags) && + j->last_empty_seq != journal_cur_seq(j), + "journal shutdown error: cur seq %llu but last empty seq %llu", + journal_cur_seq(j), j->last_empty_seq); if (!bch2_journal_error(j)) clear_bit(JOURNAL_running, &j->flags); @@ -1418,8 +1420,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) unsigned long now = jiffies; u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 28); + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 28); out->atomic++; rcu_read_lock(); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index fd1f7cdaa8bc..bc6b9c39dcb4 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -433,7 +433,7 @@ bool 
bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); -int bch2_dev_journal_alloc(struct bch_dev *); +int bch2_dev_journal_alloc(struct bch_dev *, bool); int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index db24ce21b2ac..2326e2cb9cd2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -415,6 +415,8 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, flags|BCH_VALIDATE_journal); if (ret == FSCK_DELETED_KEY) continue; + else if (ret) + return ret; k = bkey_next(k); } @@ -1762,11 +1764,13 @@ static CLOSURE_CALLBACK(journal_write_preflush) if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { spin_lock(&j->lock); - closure_wait(&j->async_wait, cl); + if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { + closure_wait(&j->async_wait, cl); + spin_unlock(&j->lock); + continue_at(cl, journal_write_preflush, j->wq); + return; + } spin_unlock(&j->lock); - - continue_at(cl, journal_write_preflush, j->wq); - return; } if (w->separate_flush) { diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index a40d116224ed..b12894ef44f3 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -77,6 +77,45 @@ static const char * const bch2_lru_types[] = { NULL }; +int bch2_lru_check_set(struct btree_trans *trans, + u16 lru_id, u64 time, + struct bkey_s_c referring_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter lru_iter; + struct bkey_s_c lru_k = + bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, + lru_pos(lru_id, + bucket_to_u64(referring_k.k->p), + time), 0); + int ret = bkey_err(lru_k); + if (ret) + return ret; + + if (lru_k.k->type != KEY_TYPE_set) { + ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed); + if (ret) + goto err; + + if (fsck_err(c, alloc_key_to_missing_lru_entry, + "missing %s lru entry\n" + " %s", + bch2_lru_types[lru_type(lru_k)], + (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { + ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); + if (ret) + goto err; + } + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf); + return ret; +} + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index bd71ba77de07..ed75bcf59d47 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -61,6 +61,9 @@ int bch2_lru_del(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); +struct bkey_buf; +int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); + int bch2_check_lrus(struct bch_fs *); #endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 6e477fadaa2a..e714e3bd5bbb 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -36,31 +36,6 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - printbuf_tabstop_push(out, 20); - prt_str(out, "rewrite ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); - prt_newline(out); - - prt_str(out, "kill ptrs:\t"); 
- bch2_prt_u64_base2(out, data_opts->kill_ptrs); - prt_newline(out); - - prt_str(out, "target:\t"); - bch2_target_to_text(out, c, data_opts->target); - prt_newline(out); - - prt_str(out, "compression:\t"); - bch2_compression_opt_to_text(out, background_compression(*io_opts)); - prt_newline(out); - - prt_str(out, "extra replicas:\t"); - prt_u64(out, data_opts->extra_replicas); -} - static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d6f35a99c429..d54121ec093f 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -286,7 +286,8 @@ enum bch_fsck_flags { x(accounting_mismatch, 272, 0) \ x(accounting_replicas_not_marked, 273, 0) \ x(invalid_btree_id, 274, 0) \ - x(alloc_key_io_time_bad, 275, 0) + x(alloc_key_io_time_bad, 275, 0) \ + x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index fb906467201e..da735608d47c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -563,8 +563,11 @@ static void __bch2_fs_free(struct bch_fs *c) BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); - EBUG_ON(c->online_reserved && percpu_u64_get(c->online_reserved)); - free_percpu(c->online_reserved); + if (c->online_reserved) { + u64 v = percpu_u64_get(c->online_reserved); + WARN(v, "online_reserved not 0 at shutdown: %lli", v); + free_percpu(c->online_reserved); + } darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); @@ -1769,7 +1772,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - ret = bch2_dev_journal_alloc(ca); + ret = bch2_dev_journal_alloc(ca, true); bch_err_msg(c, ret, "allocating journal"); if (ret) goto err; @@ -1929,7 +1932,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) } if (!ca->journal.nr) { - ret = bch2_dev_journal_alloc(ca); + ret = bch2_dev_journal_alloc(ca, false); bch_err_msg(ca, ret, "allocating journal"); if (ret) goto err; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index de331dec2a99..4ec7e44d6e36 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -252,8 +252,10 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v) bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); } -void bch2_print_string_as_lines(const char *prefix, const char *lines) +static void __bch2_print_string_as_lines(const char *prefix, const char *lines, + bool nonblocking) { + bool locked = false; const char *p; if (!lines) { @@ -261,7 +263,13 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) return; } - console_lock(); + if (!nonblocking) { + console_lock(); + locked = true; + } else { + locked = console_trylock(); + } + while (1) { p = strchrnul(lines, '\n'); printk("%s%.*s\n", prefix, (int) (p - lines), lines); @@ -269,7 +277,18 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) break; lines = p + 1; } - console_unlock(); + if (locked) + console_unlock(); +} + +void bch2_print_string_as_lines(const char *prefix, const char *lines) +{ + return __bch2_print_string_as_lines(prefix, lines, false); +} + +void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines) +{ + return __bch2_print_string_as_lines(prefix, lines, true); } int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned 
skipnr, diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 5d2c470a49ac..5b0533ec4c7e 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -315,6 +315,7 @@ void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); void bch2_print_string_as_lines(const char *prefix, const char *lines); +void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines); typedef DARRAY(unsigned long) bch_stacktrace; int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index d76f406d3b2e..f92f108840f5 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -475,6 +475,7 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio) befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; char *link = folio_address(folio); + int err = -EIO; if (len == 0 || len > PAGE_SIZE) { befs_error(sb, "Long symlink with illegal length"); @@ -487,13 +488,10 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio) goto fail; } link[len - 1] = '\0'; - folio_mark_uptodate(folio); - folio_unlock(folio); - return 0; + err = 0; fail: - folio_set_error(folio); - folio_unlock(folio); - return -EIO; + folio_end_read(folio, err == 0); + return err; } /* diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index a43897b03ce9..6fdec541f8bf 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1216,7 +1216,6 @@ out_free_interp: } reloc_func_desc = interp_load_addr; - allow_write_access(interpreter); fput(interpreter); kfree(interp_elf_ex); @@ -1308,7 +1307,6 @@ out_free_dentry: kfree(interp_elf_ex); kfree(interp_elf_phdata); out_free_file: - allow_write_access(interpreter); if (interpreter) fput(interpreter); out_free_ph: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index b799701454a9..28a3439f163a 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -394,7 +394,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) goto error; } - allow_write_access(interpreter); fput(interpreter); interpreter = NULL; } @@ -466,10 +465,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) retval = 0; error: - if (interpreter) { - allow_write_access(interpreter); + if (interpreter) fput(interpreter); - } kfree(interpreter_name); kfree(exec_params.phdrs); kfree(exec_params.loadmap); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 68fa225f89e5..31660d8cc2c6 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -247,13 +247,10 @@ static int load_misc_binary(struct linux_binprm *bprm) if (retval < 0) goto ret; - if (fmt->flags & MISC_FMT_OPEN_FILE) { + if (fmt->flags & MISC_FMT_OPEN_FILE) interp_file = file_clone_open(fmt->interp_file); - if (!IS_ERR(interp_file)) - deny_write_access(interp_file); - } else { + else interp_file = open_exec(fmt->interpreter); - } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) goto ret; @@ -1086,4 +1083,5 @@ static void __exit exit_misc_binfmt(void) core_initcall(init_misc_binfmt); module_exit(exit_misc_binfmt); +MODULE_DESCRIPTION("Kernel support for miscellaneous binaries"); MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 1b6625e95958..637daf6e4d45 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -155,4 +155,5 @@ static void __exit exit_script_binfmt(void) core_initcall(init_script_binfmt); module_exit(exit_script_binfmt); +MODULE_DESCRIPTION("Kernel support for scripts starting with 
#!"); MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1a66be33bb04..60066822b532 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1924,8 +1924,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) next: if (ret) { /* Refcount held by the reclaim_bgs list after splice. */ - btrfs_get_block_group(bg); - list_add_tail(&bg->bg_list, &retry_list); + spin_lock(&fs_info->unused_bgs_lock); + /* + * This block group might be added to the unused list + * during the above process. Move it back to the + * reclaim list otherwise. + */ + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, &retry_list); + } + spin_unlock(&fs_info->unused_bgs_lock); } btrfs_put_block_group(bg); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 38cdb8875e8e..cabb558dbdaa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + spin_lock_init(&fs_info->extent_map_shrinker_lock); + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f688fab55251..958155cc43a8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3553,7 +3553,7 @@ err: for (int i = 0; i < num_folios; i++) { if (eb->folios[i]) { detach_extent_buffer_folio(eb, eb->folios[i]); - __folio_put(eb->folios[i]); + folio_put(eb->folios[i]); } } __free_extent_buffer(eb); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 744e8952abb0..b4c9a6aa118c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1028,7 +1028,14 @@ out_free_pre: return ret; } -static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan) +struct btrfs_em_shrink_ctx { + long nr_to_scan; + long scanned; + u64 last_ino; + u64 last_root; +}; + +static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) { const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); struct extent_map_tree *tree = &inode->extent_tree; @@ -1057,14 +1064,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - write_lock(&tree->lock); + /* + * We want to be fast because we can be called from any path trying to + * allocate memory, so if the lock is busy we don't want to spend time + * waiting for it - either some task is about to do IO for the inode or + * we may have another task shrinking extent maps, here in this code, so + * skip this inode. + */ + if (!write_trylock(&tree->lock)) { + up_read(&inode->i_mmap_lock); + return 0; + } + node = rb_first_cached(&tree->map); while (node) { struct extent_map *em; em = rb_entry(node, struct extent_map, rb_node); node = rb_next(node); - (*scanned)++; + ctx->scanned++; if (em->flags & EXTENT_FLAG_PINNED) goto next; @@ -1085,16 +1103,18 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t free_extent_map(em); nr_dropped++; next: - if (*scanned >= nr_to_scan) + if (ctx->scanned >= ctx->nr_to_scan) break; /* - * Restart if we had to reschedule, and any extent maps that were - * pinned before may have become unpinned after we released the - * lock and took it again. + * Stop if we need to reschedule or there's contention on the + * lock. 
This is to avoid slowing other tasks trying to take the + * lock and because the shrinker might be called during a memory + * allocation path and we want to avoid taking a very long time + * and slowing down all sorts of tasks. */ - if (cond_resched_rwlock_write(&tree->lock)) - node = rb_first_cached(&tree->map); + if (need_resched() || rwlock_needbreak(&tree->lock)) + break; } write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); @@ -1102,25 +1122,30 @@ next: return nr_dropped; } -static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan) +static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1; + u64 min_ino = ctx->last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { - nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan); + nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode); - iput(&inode->vfs_inode); + ctx->last_ino = btrfs_ino(inode); + btrfs_add_delayed_iput(inode); - if (*scanned >= nr_to_scan) + if (ctx->scanned >= ctx->nr_to_scan) + break; + + /* + * We may be called from memory allocation paths, so we don't + * want to take too much time and slowdown tasks. + */ + if (need_resched()) break; - cond_resched(); inode = btrfs_find_first_inode(root, min_ino); } @@ -1132,14 +1157,14 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s * inode if there is one or we will find out this was the last * one and move to the next root. */ - fs_info->extent_map_shrinker_last_root = btrfs_root_id(root); + ctx->last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. */ - fs_info->extent_map_shrinker_last_ino = 0; - fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1; + ctx->last_ino = 0; + ctx->last_root = btrfs_root_id(root) + 1; } return nr_dropped; @@ -1147,19 +1172,41 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) { - const u64 start_root_id = fs_info->extent_map_shrinker_last_root; - u64 next_root_id = start_root_id; + struct btrfs_em_shrink_ctx ctx; + u64 start_root_id; + u64 next_root_id; bool cycled = false; long nr_dropped = 0; - long scanned = 0; + + ctx.scanned = 0; + ctx.nr_to_scan = nr_to_scan; + + /* + * In case we have multiple tasks running this shrinker, make the next + * one start from the next inode in case it starts before we finish. 
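The btrfs_scan_inode()/btrfs_scan_root() rework above makes the extent map scan strictly non-blocking: the tree lock is only try-locked, and the loop bails out on need_resched() or lock contention because the shrinker may be invoked from memory allocation paths. A condensed sketch of that shape, with hypothetical tree and counter names rather than the btrfs structures:

#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static long scan_nonblocking(rwlock_t *lock, struct rb_root_cached *tree,
			     long *scanned, long nr_to_scan)
{
	struct rb_node *node;
	long dropped = 0;

	/* If the lock is busy, somebody else is working here: just skip. */
	if (!write_trylock(lock))
		return 0;

	node = rb_first_cached(tree);
	while (node) {
		struct rb_node *next = rb_next(node);	/* @node may go away below */

		(*scanned)++;
		/* ... rb_entry(node, ...), decide whether to drop it, bump @dropped ... */

		node = next;
		if (*scanned >= nr_to_scan)
			break;
		/* Don't hog the CPU or the lock from an allocation path. */
		if (need_resched() || rwlock_needbreak(lock))
			break;
	}
	write_unlock(lock);
	return dropped;
}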
+ */ + spin_lock(&fs_info->extent_map_shrinker_lock); + ctx.last_ino = fs_info->extent_map_shrinker_last_ino; + fs_info->extent_map_shrinker_last_ino++; + ctx.last_root = fs_info->extent_map_shrinker_last_root; + spin_unlock(&fs_info->extent_map_shrinker_lock); + + start_root_id = ctx.last_root; + next_root_id = ctx.last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr); + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, + nr, ctx.last_root, + ctx.last_ino); } - while (scanned < nr_to_scan) { + /* + * We may be called from memory allocation paths, so we don't want to + * take too much time and slow down tasks, so stop if we need to reschedule. + */ + while (ctx.scanned < ctx.nr_to_scan && !need_resched()) { struct btrfs_root *root; unsigned long count; @@ -1171,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - fs_info->extent_map_shrinker_last_root = 0; - fs_info->extent_map_shrinker_last_ino = 0; + ctx.last_root = 0; + ctx.last_ino = 0; cycled = true; continue; } @@ -1186,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) continue; if (is_fstree(btrfs_root_id(root))) - nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan); + nr_dropped += btrfs_scan_root(root, &ctx); btrfs_put_root(root); } + /* + * In case of multiple tasks running this extent map shrinking code, this + * isn't perfect but it's simple and silences things like KCSAN. It's + * not possible to know which task made more progress because we can + * cycle back to the first root and first inode if it's not the first + * time the shrinker ran, see the above logic. Also a task that started + * later may finish earlier than another task and made less progress. So + * make this simple and update to the progress of the last task that + * finished, with the occasional possibility of having two consecutive + * runs of the shrinker process the same inodes.
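As the comment above describes, the shrinker's resume position is now shared state guarded by extent_map_shrinker_lock; each invocation snapshots it into its private ctx and bumps the inode cursor so a concurrent invocation starts one inode further on. A tiny sketch of that snapshot-and-advance step, with generic names rather than the btrfs fields:

#include <linux/spinlock.h>
#include <linux/types.h>

struct scan_cursor {			/* hypothetical shared cursor */
	spinlock_t lock;
	u64 last_root;
	u64 last_ino;
};

/* Copy the shared cursor into per-call state and nudge it forward. */
static void scan_cursor_snapshot(struct scan_cursor *sc, u64 *root, u64 *ino)
{
	spin_lock(&sc->lock);
	*root = sc->last_root;
	*ino = sc->last_ino;
	sc->last_ino++;		/* a concurrent caller starts after us */
	spin_unlock(&sc->lock);
}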
+ */ + spin_lock(&fs_info->extent_map_shrinker_lock); + fs_info->extent_map_shrinker_last_ino = ctx.last_ino; + fs_info->extent_map_shrinker_last_root = ctx.last_root; + spin_unlock(&fs_info->extent_map_shrinker_lock); + if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, + nr, ctx.last_root, + ctx.last_ino); } return nr_dropped; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 89f0650631cd..833dc3fe0a38 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -630,6 +630,7 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; + spinlock_t extent_map_shrinker_lock; u64 extent_map_shrinker_last_root; u64 extent_map_shrinker_last_ino; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 753db965f7c0..d62c96f00ff8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5587,7 +5587,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, args.ino = ino; args.root = root; - inode = iget5_locked(s, hashval, btrfs_find_actor, + inode = iget5_locked_rcu(s, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; @@ -10385,7 +10385,7 @@ out_unlock: out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i]) - __folio_put(folios[i]); + folio_put(folios[i]); } kvfree(folios); out: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index efd5d6e9589e..6ad524b894fc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4627,7 +4627,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_iov; init_sync_kiocb(&kiocb, file); - ret = kiocb_set_rw_flags(&kiocb, 0); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); if (ret) goto out_iov; kiocb.ki_pos = pos; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index bf0f81d59b6b..39a15cca58ca 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3062,8 +3062,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_inherit *inherit, size_t size) { - if (!btrfs_qgroup_enabled(fs_info)) - return 0; if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) return -EOPNOTSUPP; if (size < sizeof(*inherit) || size > PAGE_SIZE) @@ -3085,6 +3083,14 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, return -EINVAL; /* + * Skip the inherit source qgroups check if qgroup is not enabled. + * Qgroup can still be later enabled causing problems, but in that case + * btrfs_qgroup_inherit() would just ignore those invalid ones. 
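The btrfs_iget_locked() hunk above switches from iget5_locked() to iget5_locked_rcu(), the variant that performs the inode-hash lookup under RCU rather than inode_hash_lock on the fast path. A minimal caller sketch of that API with hypothetical filesystem names; the remark about the @test callback is my reading of the variant's contract, not something stated in this diff:

#include <linux/fs.h>

struct myfs_iget_args {			/* hypothetical */
	u64 ino;
};

static int myfs_iget_test(struct inode *inode, void *data)
{
	struct myfs_iget_args *args = data;

	/* May run under RCU on an unstable inode: only compare plain fields. */
	return inode->i_ino == args->ino;
}

static int myfs_iget_set(struct inode *inode, void *data)
{
	struct myfs_iget_args *args = data;

	inode->i_ino = args->ino;
	return 0;
}

static struct inode *myfs_iget(struct super_block *sb, u64 ino)
{
	struct myfs_iget_args args = { .ino = ino };
	struct inode *inode;

	inode = iget5_locked_rcu(sb, ino, myfs_iget_test, myfs_iget_set, &args);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW) {
		/* ... read the on-disk inode here ... */
		unlock_new_inode(inode);
	}
	return inode;
}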
+ */ + if (!btrfs_qgroup_enabled(fs_info)) + return 0; + + /* * Now check all the remaining qgroups, they should all: * * - Exist diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index cf531255ab76..9522a8b79d22 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, u32 item_size = btrfs_item_size(leaf, slot); unsigned long end, ptr; u64 offset, flags, count; - int type, ret; + int type; + int ret = 0; ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); @@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, key->objectid, key->offset); break; case BTRFS_EXTENT_OWNER_REF_KEY: - WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) { + btrfs_err(fs_info, + "found extent owner ref without simple quotas enabled"); + ret = -EINVAL; + } break; default: btrfs_err(fs_info, "invalid key type in iref"); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d620323d08ea..ae8c56442549 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -373,11 +373,18 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, * "optimal" chunk size based on the fs size. However when we actually * allocate the chunk we will strip this down further, making it no more * than 10% of the disk or 1G, whichever is smaller. + * + * On the zoned mode, we need to use zone_size (= + * data_sinfo->chunk_size) as it is. */ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - data_chunk_size = min(data_sinfo->chunk_size, - mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); - data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + if (!btrfs_is_zoned(fs_info)) { + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + } else { + data_chunk_size = data_sinfo->chunk_size; + } /* * Since data allocations immediately use block groups as part of the @@ -405,6 +412,17 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + + /* + * On the zoned mode, we always allocate one zone as one chunk. + * Returning non-zone size alingned bytes here will result in + * less pressure for the async metadata reclaim process, and it + * will over-commit too much leading to ENOSPC. Align down to the + * zone size to avoid that. 
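The space-info.c changes pick the data chunk size differently on zoned filesystems (a chunk is always exactly one zone) and, per the comment just above, the hunk continues below by rounding the availability estimate down with ALIGN_DOWN() so over-commit never counts a partial zone. A condensed sketch of those two pieces of the calculation; the names and the simplified flow are mine, not the btrfs code:

#include <linux/align.h>
#include <linux/math64.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/types.h>

/* @zone_size == 0 means a regular (non-zoned) filesystem. */
static u64 data_chunk_size_for(u64 chunk_size, u64 total_rw_bytes, u64 zone_size)
{
	u64 sz;

	if (zone_size)
		return zone_size;	/* zoned: one chunk == one zone */

	sz = min(chunk_size, div_u64(total_rw_bytes, 10));	/* <= 10% of the disk */
	return min_t(u64, sz, SZ_1G);				/* and <= 1G */
}

static u64 align_avail(u64 avail, u64 zone_size)
{
	/* Zoned: a fraction of a zone can't back an allocation, drop it. */
	return zone_size ? ALIGN_DOWN(avail, zone_size) : avail;
}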
+ */ + if (btrfs_is_zoned(fs_info)) + avail = ALIGN_DOWN(avail, fs_info->zone_size); + return avail; } diff --git a/fs/buffer.c b/fs/buffer.c index 8c19e705b9c3..dbe8f411ce52 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -258,7 +258,6 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } else { clear_buffer_uptodate(bh); buffer_io_error(bh, ", async page read"); - folio_set_error(folio); } /* @@ -391,7 +390,6 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate) buffer_io_error(bh, ", lost async page write"); mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); - folio_set_error(folio); } first = folio_buffers(folio); @@ -1960,7 +1958,6 @@ recover: clear_buffer_dirty(bh); } } while ((bh = bh->b_this_page) != head); - folio_set_error(folio); BUG_ON(folio_test_writeback(folio)); mapping_set_error(folio->mapping, err); folio_start_writeback(folio); @@ -2405,10 +2402,8 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (iblock < lblock) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); - if (err) { - folio_set_error(folio); + if (err) page_error = true; - } } if (!buffer_mapped(bh)) { folio_zero_range(folio, i * blocksize, diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index f449f7340aad..9fb06dc16520 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/statfs.h> #include <linux/namei.h> +#include <trace/events/fscache.h> #include "internal.h" /* @@ -312,19 +313,59 @@ static void cachefiles_withdraw_objects(struct cachefiles_cache *cache) } /* - * Withdraw volumes. + * Withdraw fscache volumes. + */ +static void cachefiles_withdraw_fscache_volumes(struct cachefiles_cache *cache) +{ + struct list_head *cur; + struct cachefiles_volume *volume; + struct fscache_volume *vcookie; + + _enter(""); +retry: + spin_lock(&cache->object_list_lock); + list_for_each(cur, &cache->volumes) { + volume = list_entry(cur, struct cachefiles_volume, cache_link); + + if (atomic_read(&volume->vcookie->n_accesses) == 0) + continue; + + vcookie = fscache_try_get_volume(volume->vcookie, + fscache_volume_get_withdraw); + if (vcookie) { + spin_unlock(&cache->object_list_lock); + fscache_withdraw_volume(vcookie); + fscache_put_volume(vcookie, fscache_volume_put_withdraw); + goto retry; + } + } + spin_unlock(&cache->object_list_lock); + + _leave(""); +} + +/* + * Withdraw cachefiles volumes. 
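cachefiles_withdraw_fscache_volumes() above cannot call fscache_withdraw_volume() while holding object_list_lock, so it pins the volume with fscache_try_get_volume(), drops the lock, does the blocking work, and restarts the list walk from the top. A generic sketch of that pin/unlock/work/retry shape, with entirely hypothetical names (only the shape matches the code above):

#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>

struct entry {				/* hypothetical */
	struct list_head link;
	refcount_t ref;
	bool done;
};

static void process_all(spinlock_t *lock, struct list_head *head)
{
	struct entry *e;

retry:
	spin_lock(lock);
	list_for_each_entry(e, head, link) {
		if (e->done || !refcount_inc_not_zero(&e->ref))
			continue;
		e->done = true;
		/*
		 * The per-entry work may block, so it can't run under the
		 * lock: drop the lock, do it, then restart the walk.
		 */
		spin_unlock(lock);
		/* ... blocking per-entry work goes here ... */
		refcount_dec(&e->ref);	/* pairs with the tryget above */
		goto retry;
	}
	spin_unlock(lock);
}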
*/ static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache) { _enter(""); for (;;) { + struct fscache_volume *vcookie = NULL; struct cachefiles_volume *volume = NULL; spin_lock(&cache->object_list_lock); if (!list_empty(&cache->volumes)) { volume = list_first_entry(&cache->volumes, struct cachefiles_volume, cache_link); + vcookie = fscache_try_get_volume(volume->vcookie, + fscache_volume_get_withdraw); + if (!vcookie) { + spin_unlock(&cache->object_list_lock); + cpu_relax(); + continue; + } list_del_init(&volume->cache_link); } spin_unlock(&cache->object_list_lock); @@ -332,6 +373,7 @@ static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache) break; cachefiles_withdraw_volume(volume); + fscache_put_volume(vcookie, fscache_volume_put_withdraw); } _leave(""); @@ -371,6 +413,7 @@ void cachefiles_withdraw_cache(struct cachefiles_cache *cache) pr_info("File cache on %s unregistering\n", fscache->name); fscache_withdraw_cache(fscache); + cachefiles_withdraw_fscache_volumes(cache); /* we now have to destroy all the active objects pertaining to this * cache - which we do by passing them off to thread pool to be diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 06cdf1a8a16f..89b11336a836 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -366,14 +366,14 @@ static __poll_t cachefiles_daemon_poll(struct file *file, if (cachefiles_in_ondemand_mode(cache)) { if (!xa_empty(&cache->reqs)) { - rcu_read_lock(); + xas_lock(&xas); xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { if (!cachefiles_ondemand_is_reopening_read(req)) { mask |= EPOLLIN; break; } } - rcu_read_unlock(); + xas_unlock(&xas); } } else { if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 6845a90cdfcc..7b99bd98de75 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -48,6 +48,7 @@ enum cachefiles_object_state { CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */ CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */ CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */ + CACHEFILES_ONDEMAND_OBJSTATE_DROPPING, /* Object is being dropped. 
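The daemon poll hunk above swaps rcu_read_lock() for xas_lock() around the xas_for_each_marked() walk; the requests stored in that xarray are plain kmalloc'd objects rather than RCU-freed ones, so they may only be dereferenced while stores are excluded by the xa lock. A minimal sketch of checking for marked entries that way, using a generic mark and xarray:

#include <linux/xarray.h>

#define REQ_NEW		XA_MARK_1	/* assumption: any free mark works */

/* Return true if any entry currently carries the REQ_NEW mark. */
static bool have_new_request(struct xarray *xa)
{
	XA_STATE(xas, xa, 0);
	void *entry;
	bool found = false;

	xas_lock(&xas);
	xas_for_each_marked(&xas, entry, ULONG_MAX, REQ_NEW) {
		/* @entry can be inspected safely here: writers are locked out. */
		found = true;
		break;
	}
	xas_unlock(&xas);

	return found;
}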
*/ }; struct cachefiles_ondemand_info { @@ -128,6 +129,7 @@ struct cachefiles_cache { unsigned long req_id_next; struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ u32 ondemand_id_next; + u32 msg_id_next; }; static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) @@ -335,6 +337,7 @@ cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN); CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE); CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING); +CACHEFILES_OBJECT_STATE_FUNCS(dropping, DROPPING); static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) { diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index bce005f2b456..470c96658385 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -517,7 +517,8 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, */ xas_lock(&xas); - if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + if (test_bit(CACHEFILES_DEAD, &cache->flags) || + cachefiles_ondemand_object_is_dropping(object)) { xas_unlock(&xas); ret = -EIO; goto out; @@ -527,20 +528,32 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, smp_mb(); if (opcode == CACHEFILES_OP_CLOSE && - !cachefiles_ondemand_object_is_open(object)) { + !cachefiles_ondemand_object_is_open(object)) { WARN_ON_ONCE(object->ondemand->ondemand_id == 0); xas_unlock(&xas); ret = -EIO; goto out; } - xas.xa_index = 0; + /* + * Cyclically find a free xas to avoid msg_id reuse that would + * cause the daemon to successfully copen a stale msg_id. + */ + xas.xa_index = cache->msg_id_next; xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) { + xas.xa_index = 0; + xas_find_marked(&xas, cache->msg_id_next - 1, XA_FREE_MARK); + } if (xas.xa_node == XAS_RESTART) xas_set_err(&xas, -EBUSY); + xas_store(&xas, req); - xas_clear_mark(&xas, XA_FREE_MARK); - xas_set_mark(&xas, CACHEFILES_REQ_NEW); + if (xas_valid(&xas)) { + cache->msg_id_next = xas.xa_index + 1; + xas_clear_mark(&xas, XA_FREE_MARK); + xas_set_mark(&xas, CACHEFILES_REQ_NEW); + } xas_unlock(&xas); } while (xas_nomem(&xas, GFP_KERNEL)); @@ -568,7 +581,8 @@ out: * If error occurs after creating the anonymous fd, * cachefiles_ondemand_fd_release() will set object to close. */ - if (opcode == CACHEFILES_OP_OPEN) + if (opcode == CACHEFILES_OP_OPEN && + !cachefiles_ondemand_object_is_dropping(object)) cachefiles_ondemand_set_object_close(object); kfree(req); return ret; @@ -667,8 +681,34 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) void cachefiles_ondemand_clean_object(struct cachefiles_object *object) { + unsigned long index; + struct cachefiles_req *req; + struct cachefiles_cache *cache; + + if (!object->ondemand) + return; + cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0, cachefiles_ondemand_init_close_req, NULL); + + if (!object->ondemand->ondemand_id) + return; + + /* Cancel all requests for the object that is being dropped. */ + cache = object->volume->cache; + xa_lock(&cache->reqs); + cachefiles_ondemand_set_object_dropping(object); + xa_for_each(&cache->reqs, index, req) { + if (req->object == object) { + req->error = -EIO; + complete(&req->done); + __xa_erase(&cache->reqs, index); + } + } + xa_unlock(&cache->reqs); + + /* Wait for ondemand_object_worker() to finish to avoid UAF. 
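The msg_id hunk above hand-rolls cyclic allocation inside an existing xas_lock section: the search starts at msg_id_next and wraps, so a freshly released id is not handed out again and a stale copen from the daemon cannot match a new request. When an xarray is driven through the allocating API instead, the same policy is what xa_alloc_cyclic() provides; a small sketch of that generic helper with demo names, not the cachefiles code:

#include <linux/gfp.h>
#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(req_ids);	/* demo xarray, assumption */
static u32 req_id_next;			/* cyclic cursor */

static int store_request(void *req, u32 *id)
{
	/*
	 * Ids are handed out in increasing order and wrap around, so an id
	 * is not reused until the whole range has been cycled through.
	 * xa_alloc_cyclic() returns 1 after a wrap; treat that like 0 here.
	 */
	int err = xa_alloc_cyclic(&req_ids, id, req, xa_limit_32b,
				  &req_id_next, GFP_KERNEL);

	return err < 0 ? err : 0;
}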
*/ + cancel_work_sync(&object->ondemand->ondemand_work); } int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c index 89df0ba8ba5e..781aac4ef274 100644 --- a/fs/cachefiles/volume.c +++ b/fs/cachefiles/volume.c @@ -133,7 +133,6 @@ void cachefiles_free_volume(struct fscache_volume *vcookie) void cachefiles_withdraw_volume(struct cachefiles_volume *volume) { - fscache_withdraw_volume(volume->vcookie); cachefiles_set_volume_xattr(volume); __cachefiles_free_volume(volume); } diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index bcb6173943ee..4dd8a993c60a 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -110,9 +110,11 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file if (xlen == 0) xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, tlen); if (xlen != tlen) { - if (xlen < 0) + if (xlen < 0) { + ret = xlen; trace_cachefiles_vfs_error(object, file_inode(file), xlen, cachefiles_trace_getxattr_error); + } if (xlen == -EIO) cachefiles_io_error_obj( object, @@ -252,6 +254,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len); if (xlen != len) { if (xlen < 0) { + ret = xlen; trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen, cachefiles_trace_getxattr_error); if (xlen == -EIO) diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index ccdbec388091..40f84d014524 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -31,15 +31,7 @@ static int coda_symlink_filler(struct file *file, struct folio *folio) cii = ITOC(inode); error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); - if (error) - goto fail; - folio_mark_uptodate(folio); - folio_unlock(folio); - return 0; - -fail: - folio_set_error(folio); - folio_unlock(folio); + folio_end_read(folio, error == 0); return error; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 460690ca0174..b84d1747a020 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -811,19 +811,19 @@ out: static int cramfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; u32 maxblock; int bytes_filled; void *pgdata; + bool success = false; maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; bytes_filled = 0; - pgdata = kmap_local_page(page); + pgdata = kmap_local_folio(folio, 0); - if (page->index < maxblock) { + if (folio->index < maxblock) { struct super_block *sb = inode->i_sb; - u32 blkptr_offset = OFFSET(inode) + page->index * 4; + u32 blkptr_offset = OFFSET(inode) + folio->index * 4; u32 block_ptr, block_start, block_len; bool uncompressed, direct; @@ -844,7 +844,7 @@ static int cramfs_read_folio(struct file *file, struct folio *folio) if (uncompressed) { block_len = PAGE_SIZE; /* if last block: cap to file length */ - if (page->index == maxblock - 1) + if (folio->index == maxblock - 1) block_len = offset_in_page(inode->i_size); } else { @@ -861,7 +861,7 @@ static int cramfs_read_folio(struct file *file, struct folio *folio) * from the previous block's pointer. */ block_start = OFFSET(inode) + maxblock * 4; - if (page->index) + if (folio->index) block_start = *(u32 *) cramfs_read(sb, blkptr_offset - 4, 4); /* Beware... 
previous ptr might be a direct ptr */ @@ -906,17 +906,12 @@ static int cramfs_read_folio(struct file *file, struct folio *folio) } memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled); - flush_dcache_page(page); - kunmap_local(pgdata); - SetPageUptodate(page); - unlock_page(page); - return 0; + flush_dcache_folio(folio); + success = true; err: kunmap_local(pgdata); - ClearPageUptodate(page); - SetPageError(page); - unlock_page(page); + folio_end_read(folio, success); return 0; } @@ -1003,4 +998,5 @@ static void __exit exit_cramfs_fs(void) module_init(init_cramfs_fs) module_exit(exit_cramfs_fs) +MODULE_DESCRIPTION("Compressed ROM file system support"); MODULE_LICENSE("GPL"); diff --git a/fs/dcache.c b/fs/dcache.c index 407095188f83..8bdc278a0205 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -35,6 +35,8 @@ #include "internal.h" #include "mount.h" +#include <asm/runtime-const.h> + /* * Usage: * dcache->d_inode->i_lock protects: @@ -100,9 +102,10 @@ static unsigned int d_hash_shift __ro_after_init; static struct hlist_bl_head *dentry_hashtable __ro_after_init; -static inline struct hlist_bl_head *d_hash(unsigned int hash) +static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { - return dentry_hashtable + (hash >> d_hash_shift); + return runtime_const_ptr(dentry_hashtable) + + runtime_const_shift_right_32(hashlen, d_hash_shift); } #define IN_LOOKUP_SHIFT 10 @@ -355,7 +358,11 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; - if (flags & DCACHE_LRU_LIST) + /* + * The negative counter only tracks dentries on the LRU. Don't inc if + * d_lru is on another list. + */ + if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } @@ -1548,7 +1555,7 @@ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; - WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); + rwsem_assert_held_write(&sb->s_umount); dentry = sb->s_root; sb->s_root = NULL; @@ -1844,9 +1851,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) spin_lock(&dentry->d_lock); /* - * Decrement negative dentry count if it was in the LRU list. + * The negative counter only tracks dentries on the LRU. Don't dec if + * d_lru is on another list. 
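The befs, coda, and cramfs conversions in this section all collapse the old end-of-read bookkeeping (mark uptodate or set the error flag, then unlock) into a single folio_end_read() call that takes a success flag. A minimal read_folio shape using it; myfs_fill_data() is a hypothetical stand-in for the filesystem's real decoder:

#include <linux/highmem.h>
#include <linux/pagemap.h>

static int myfs_fill_data(struct inode *inode, void *buf);	/* hypothetical */

static int myfs_read_folio(struct file *file, struct folio *folio)
{
	void *addr = kmap_local_folio(folio, 0);
	int err;

	err = myfs_fill_data(folio->mapping->host, addr);
	if (!err)
		flush_dcache_folio(folio);
	kunmap_local(addr);

	/* Marks the folio uptodate on success and unlocks it either way. */
	folio_end_read(folio, err == 0);
	return err;
}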
*/ - if (dentry->d_flags & DCACHE_LRU_LIST) + if ((dentry->d_flags & + (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); @@ -2104,7 +2113,7 @@ static noinline struct dentry *__d_lookup_rcu_op_compare( unsigned *seqp) { u64 hashlen = name->hash_len; - struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen)); + struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; @@ -2171,7 +2180,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, { u64 hashlen = name->hash_len; const unsigned char *str = name->name; - struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen)); + struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; @@ -3029,28 +3038,25 @@ EXPORT_SYMBOL(d_splice_alias); bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { - bool result; + bool subdir; unsigned seq; if (new_dentry == old_dentry) return true; - do { - /* for restarting inner loop in case of seq retry */ - seq = read_seqbegin(&rename_lock); - /* - * Need rcu_readlock to protect against the d_parent trashing - * due to d_move - */ - rcu_read_lock(); - if (d_ancestor(old_dentry, new_dentry)) - result = true; - else - result = false; - rcu_read_unlock(); - } while (read_seqretry(&rename_lock, seq)); - - return result; + /* Access d_parent under rcu as d_move() may change it. */ + rcu_read_lock(); + seq = read_seqbegin(&rename_lock); + subdir = d_ancestor(old_dentry, new_dentry); + /* Try lockless once... */ + if (read_seqretry(&rename_lock, seq)) { + /* ...else acquire lock for progress even on deep chains. */ + read_seqlock_excl(&rename_lock); + subdir = d_ancestor(old_dentry, new_dentry); + read_sequnlock_excl(&rename_lock); + } + rcu_read_unlock(); + return subdir; } EXPORT_SYMBOL(is_subdir); @@ -3100,6 +3106,34 @@ void d_tmpfile(struct file *file, struct inode *inode) } EXPORT_SYMBOL(d_tmpfile); +/* + * Obtain inode number of the parent dentry. 
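The is_subdir() rewrite above replaces an unbounded seqlock retry loop with the "try once locklessly, then take the lock" idiom: one optimistic pass under read_seqbegin()/read_seqretry(), and a single fallback pass under read_seqlock_excl() that is guaranteed to finish even on a deep, frequently renamed chain. A self-contained sketch of the pattern with a hypothetical example_lock/example_state pair:

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(example_lock);
static int example_state;       /* written under write_seqlock(&example_lock) */

static int example_read_state(void)
{
        unsigned int seq;
        int val;

        /* Optimistic, lock-free attempt. */
        seq = read_seqbegin(&example_lock);
        val = example_state;
        if (read_seqretry(&example_lock, seq)) {
                /* A writer interfered: redo once under the lock. */
                read_seqlock_excl(&example_lock);
                val = example_state;
                read_sequnlock_excl(&example_lock);
        }
        return val;
}

is_subdir() additionally holds rcu_read_lock() across both passes because d_ancestor() follows d_parent pointers that a concurrent d_move() may change.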
+ */ +ino_t d_parent_ino(struct dentry *dentry) +{ + struct dentry *parent; + struct inode *iparent; + unsigned seq; + ino_t ret; + + scoped_guard(rcu) { + seq = raw_seqcount_begin(&dentry->d_seq); + parent = READ_ONCE(dentry->d_parent); + iparent = d_inode_rcu(parent); + if (likely(iparent)) { + ret = iparent->i_ino; + if (!read_seqcount_retry(&dentry->d_seq, seq)) + return ret; + } + } + + spin_lock(&dentry->d_lock); + ret = dentry->d_parent->d_inode->i_ino; + spin_unlock(&dentry->d_lock); + return ret; +} +EXPORT_SYMBOL(d_parent_ino); + static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { @@ -3129,6 +3163,9 @@ static void __init dcache_init_early(void) 0, 0); d_hash_shift = 32 - d_hash_shift; + + runtime_const_init(shift, d_hash_shift); + runtime_const_init(ptr, dentry_hashtable); } static void __init dcache_init(void) @@ -3157,6 +3194,9 @@ static void __init dcache_init(void) 0, 0); d_hash_shift = 32 - d_hash_shift; + + runtime_const_init(shift, d_hash_shift); + runtime_const_init(ptr, dentry_hashtable); } /* SLAB cache for __getname() consumers */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 8fd928899a59..91521576f500 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -92,9 +92,9 @@ enum { }; static const struct fs_parameter_spec debugfs_param_specs[] = { - fsparam_u32 ("gid", Opt_gid), + fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("mode", Opt_mode), - fsparam_u32 ("uid", Opt_uid), + fsparam_uid ("uid", Opt_uid), {} }; @@ -102,8 +102,6 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param { struct debugfs_fs_info *opts = fc->s_fs_info; struct fs_parse_result result; - kuid_t uid; - kgid_t gid; int opt; opt = fs_parse(fc, debugfs_param_specs, param, &result); @@ -120,16 +118,10 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param switch (opt) { case Opt_uid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) - return invalf(fc, "Unknown uid"); - opts->uid = uid; + opts->uid = result.uid; break; case Opt_gid: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) - return invalf(fc, "Unknown gid"); - opts->gid = gid; + opts->gid = result.gid; break; case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index bb14462f6d99..a929f1b613be 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -275,8 +275,8 @@ enum { }; static const struct fs_parameter_spec efivarfs_parameters[] = { - fsparam_u32("uid", Opt_uid), - fsparam_u32("gid", Opt_gid), + fsparam_uid("uid", Opt_uid), + fsparam_gid("gid", Opt_gid), {}, }; @@ -293,14 +293,10 @@ static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *para switch (opt) { case Opt_uid: - opts->uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(opts->uid)) - return -EINVAL; + opts->uid = result.uid; break; case Opt_gid: - opts->gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(opts->gid)) - return -EINVAL; + opts->gid = result.gid; break; default: return -EINVAL; diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 7844ab24b813..462619e59766 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -311,4 +311,5 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { return 0; } +MODULE_DESCRIPTION("Extent File System (efs)"); MODULE_LICENSE("GPL"); diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c index 3b03a573cb1a..7749feded722 100644 --- 
a/fs/efs/symlink.c +++ b/fs/efs/symlink.c @@ -14,10 +14,9 @@ static int efs_symlink_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - char *link = page_address(page); - struct buffer_head * bh; - struct inode * inode = page->mapping->host; + char *link = folio_address(folio); + struct buffer_head *bh; + struct inode *inode = folio->mapping->host; efs_block_t size = inode->i_size; int err; @@ -40,12 +39,9 @@ static int efs_symlink_read_folio(struct file *file, struct folio *folio) brelse(bh); } link[size] = '\0'; - SetPageUptodate(page); - unlock_page(page); - return 0; + err = 0; fail: - SetPageError(page); - unlock_page(page); + folio_end_read(folio, err == 0); return err; } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index c93bd24d2771..1b91d9513013 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -343,7 +343,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->build_time = le64_to_cpu(dsb->build_time); sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); - memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid)); + super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); ret = strscpy(sbi->volume_name, dsb->volume_name, sizeof(dsb->volume_name)); diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 9b248ee5fef2..74d3d7bffcf3 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -711,6 +711,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, err = z_erofs_do_map_blocks(inode, map, flags); out: + if (err) + map->m_llen = 0; trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); return err; } diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 036024bce9f7..b80f612867c2 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -148,7 +148,7 @@ int __init z_erofs_gbuf_init(void) void z_erofs_gbuf_exit(void) { - int i; + int i, j; for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) { struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i]; @@ -161,9 +161,9 @@ void z_erofs_gbuf_exit(void) if (!gbuf->pages) continue; - for (i = 0; i < gbuf->nrpages; ++i) - if (gbuf->pages[i]) - put_page(gbuf->pages[i]); + for (j = 0; j < gbuf->nrpages; ++j) + if (gbuf->pages[j]) + put_page(gbuf->pages[j]); kfree(gbuf->pages); gbuf->pages = NULL; } diff --git a/fs/exec.c b/fs/exec.c index 40073142288f..4dee205452e2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -952,10 +952,6 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) path_noexec(&file->f_path))) goto exit; - err = deny_write_access(file); - if (err) - goto exit; - out: return file; @@ -971,8 +967,7 @@ exit: * * Returns ERR_PTR on failure or allocated struct file on success. * - * As this is a wrapper for the internal do_open_execat(), callers - * must call allow_write_access() before fput() on release. Also see + * As this is a wrapper for the internal do_open_execat(). Also see * do_close_execat(). 
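The erofs_read_superblock() change above stops copying the on-disk UUID straight into sb->s_uuid and goes through super_set_uuid() instead, which (in current kernels) also records the UUID length reported to userspace. A small sketch with a hypothetical on-disk structure:

#include <linux/fs.h>

/* Hypothetical on-disk superblock with a 16-byte UUID field. */
struct example_ondisk_sb {
        u8      uuid[16];
};

static void example_record_uuid(struct super_block *sb,
                                const struct example_ondisk_sb *dsb)
{
        /* Copies the bytes and records the advertised UUID length. */
        super_set_uuid(sb, dsb->uuid, sizeof(dsb->uuid));
}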
*/ struct file *open_exec(const char *name) @@ -1524,10 +1519,8 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) /* Matches do_open_execat() */ static void do_close_execat(struct file *file) { - if (!file) - return; - allow_write_access(file); - fput(file); + if (file) + fput(file); } static void free_bprm(struct linux_binprm *bprm) @@ -1846,7 +1839,6 @@ static int exec_binprm(struct linux_binprm *bprm) bprm->file = bprm->interpreter; bprm->interpreter = NULL; - allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 3d5ea2cfad66..a3c7173ef693 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -225,8 +225,8 @@ static const struct constant_table exfat_param_enums[] = { }; static const struct fs_parameter_spec exfat_parameters[] = { - fsparam_u32("uid", Opt_uid), - fsparam_u32("gid", Opt_gid), + fsparam_uid("uid", Opt_uid), + fsparam_gid("gid", Opt_gid), fsparam_u32oct("umask", Opt_umask), fsparam_u32oct("dmask", Opt_dmask), fsparam_u32oct("fmask", Opt_fmask), @@ -262,10 +262,10 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) switch (opt) { case Opt_uid: - opts->fs_uid = make_kuid(current_user_ns(), result.uint_32); + opts->fs_uid = result.uid; break; case Opt_gid: - opts->fs_gid = make_kgid(current_user_ns(), result.uint_32); + opts->fs_gid = result.gid; break; case Opt_umask: opts->fs_fmask = result.uint_32; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 07ea3d62b298..4f2dd4ab4486 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -427,7 +427,7 @@ EXPORT_SYMBOL_GPL(exportfs_encode_fh); struct dentry * exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, - int fileid_type, + int fileid_type, unsigned int flags, int (*acceptable)(void *, struct dentry *), void *context) { @@ -445,6 +445,11 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, if (IS_ERR_OR_NULL(result)) return result; + if ((flags & EXPORT_FH_DIR_ONLY) && !d_is_dir(result)) { + err = -ENOTDIR; + goto err_result; + } + /* * If no acceptance criteria was specified by caller, a disconnected * dentry is also accepatable. 
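debugfs, efivarfs and exfat above all switch from fsparam_u32 plus hand-rolled make_kuid()/make_kgid() validation to the dedicated fsparam_uid()/fsparam_gid() parameter types, so fs_parse() maps and validates the ID against the caller's user namespace and hands back ready-to-use values in result.uid and result.gid. A minimal sketch for a hypothetical filesystem; the example_* names are invented for illustration:

#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/uidgid.h>

struct example_fs_opts {
        kuid_t  uid;
        kgid_t  gid;
};

enum { Opt_uid, Opt_gid };

static const struct fs_parameter_spec example_param_specs[] = {
        fsparam_uid("uid", Opt_uid),
        fsparam_gid("gid", Opt_gid),
        {}
};

static int example_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct example_fs_opts *opts = fc->fs_private;
        struct fs_parse_result result;
        int opt;

        opt = fs_parse(fc, example_param_specs, param, &result);
        if (opt < 0)
                return opt;

        switch (opt) {
        case Opt_uid:
                opts->uid = result.uid; /* already mapped and validated */
                break;
        case Opt_gid:
                opts->gid = result.gid;
                break;
        }
        return 0;
}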
Callers may use this mode to query if @@ -581,7 +586,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, { struct dentry *ret; - ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, + ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, 0, acceptable, context); if (IS_ERR_OR_NULL(ret)) { if (ret == ERR_PTR(-ENOMEM)) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 7ae0b61258a7..0a056d97e640 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -31,11 +31,10 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, ext4_fname_from_fscrypt_name(fname, &name); -#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); if (err) ext4_fname_free_filename(fname); -#endif + return err; } @@ -51,11 +50,9 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, ext4_fname_from_fscrypt_name(fname, &name); -#if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); if (err) ext4_fname_free_filename(fname); -#endif return err; } @@ -70,10 +67,7 @@ void ext4_fname_free_filename(struct ext4_filename *fname) fname->usr_fname = NULL; fname->disk_name.name = NULL; -#if IS_ENABLED(CONFIG_UNICODE) - kfree(fname->cf_name.name); - fname->cf_name.name = NULL; -#endif + ext4_fname_free_ci_filename(fname); } static bool uuid_is_zero(__u8 u[16]) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 983dad8c07ec..8007abd4972d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2511,7 +2511,7 @@ struct ext4_filename { struct fscrypt_str crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) - struct fscrypt_str cf_name; + struct qstr cf_name; #endif }; @@ -2745,8 +2745,25 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, - const struct qstr *iname, - struct ext4_filename *fname); + const struct qstr *iname, + struct ext4_filename *fname); + +static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) +{ + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +} +#else +static inline int ext4_fname_setup_ci_filename(struct inode *dir, + const struct qstr *iname, + struct ext4_filename *fname) +{ + return 0; +} + +static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) +{ +} #endif /* ext4 encryption related stuff goes here crypto.c */ @@ -2769,16 +2786,11 @@ static inline int ext4_fname_setup_filename(struct inode *dir, int lookup, struct ext4_filename *fname) { - int err = 0; fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; -#if IS_ENABLED(CONFIG_UNICODE) - err = ext4_fname_setup_ci_filename(dir, iname, fname); -#endif - - return err; + return ext4_fname_setup_ci_filename(dir, iname, fname); } static inline int ext4_fname_prepare_lookup(struct inode *dir, @@ -2790,10 +2802,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, static inline void ext4_fname_free_filename(struct ext4_filename *fname) { -#if IS_ENABLED(CONFIG_UNICODE) - kfree(fname->cf_name.name); - fname->cf_name.name = NULL; -#endif + ext4_fname_free_ci_filename(fname); } static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a630b27a4cc6..e6769b97a970 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1390,62 +1390,11 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) } #if 
IS_ENABLED(CONFIG_UNICODE) -/* - * Test whether a case-insensitive directory entry matches the filename - * being searched for. If quick is set, assume the name being looked up - * is already in the casefolded form. - * - * Returns: 0 if the directory entry matches, more than 0 if it - * doesn't match or less than zero on error. - */ -static int ext4_ci_compare(const struct inode *parent, const struct qstr *name, - u8 *de_name, size_t de_name_len, bool quick) -{ - const struct super_block *sb = parent->i_sb; - const struct unicode_map *um = sb->s_encoding; - struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); - struct qstr entry = QSTR_INIT(de_name, de_name_len); - int ret; - - if (IS_ENCRYPTED(parent)) { - const struct fscrypt_str encrypted_name = - FSTR_INIT(de_name, de_name_len); - - decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); - if (!decrypted_name.name) - return -ENOMEM; - ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, - &decrypted_name); - if (ret < 0) - goto out; - entry.name = decrypted_name.name; - entry.len = decrypted_name.len; - } - - if (quick) - ret = utf8_strncasecmp_folded(um, name, &entry); - else - ret = utf8_strncasecmp(um, name, &entry); - if (ret < 0) { - /* Handle invalid character sequence as either an error - * or as an opaque byte sequence. - */ - if (sb_has_strict_encoding(sb)) - ret = -EINVAL; - else if (name->len != entry.len) - ret = 1; - else - ret = !!memcmp(name->name, entry.name, entry.len); - } -out: - kfree(decrypted_name.name); - return ret; -} - int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *name) { - struct fscrypt_str *cf_name = &name->cf_name; + struct qstr *cf_name = &name->cf_name; + unsigned char *buf; struct dx_hash_info *hinfo = &name->hinfo; int len; @@ -1455,18 +1404,18 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, return 0; } - cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS); - if (!cf_name->name) + buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS); + if (!buf) return -ENOMEM; - len = utf8_casefold(dir->i_sb->s_encoding, - iname, cf_name->name, - EXT4_NAME_LEN); + len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN); if (len <= 0) { - kfree(cf_name->name); - cf_name->name = NULL; + kfree(buf); + buf = NULL; } + cf_name->name = buf; cf_name->len = (unsigned) len; + if (!IS_ENCRYPTED(dir)) return 0; @@ -1502,22 +1451,29 @@ static bool ext4_match(struct inode *parent, #if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { - if (fname->cf_name.name) { - struct qstr cf = {.name = fname->cf_name.name, - .len = fname->cf_name.len}; - if (IS_ENCRYPTED(parent)) { - if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || - fname->hinfo.minor_hash != - EXT4_DIRENT_MINOR_HASH(de)) { - - return false; - } - } - return !ext4_ci_compare(parent, &cf, de->name, - de->name_len, true); - } - return !ext4_ci_compare(parent, fname->usr_fname, de->name, - de->name_len, false); + /* + * Just checking IS_ENCRYPTED(parent) below is not + * sufficient to decide whether one can use the hash for + * skipping the string comparison, because the key might + * have been added right after + * ext4_fname_setup_ci_filename(). In this case, a hash + * mismatch will be a false negative. Therefore, make + * sure cf_name was properly initialized before + * considering the calculated hash. 
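The ext4.h hunk a little further up shows the pattern this series uses to drop CONFIG_UNICODE #ifdefs from call sites: declare the real helper when the option is enabled and provide static inline no-op stubs otherwise, so callers can write plain if (IS_ENABLED(CONFIG_UNICODE) && ...) tests and let dead-code elimination do the rest. A header sketch of the idiom with hypothetical names:

#include <linux/fs.h>

/*
 * Real helpers exist only when CONFIG_UNICODE is enabled; the !UNICODE
 * build gets inline no-ops so callers need no #ifdef of their own.
 */
#if IS_ENABLED(CONFIG_UNICODE)
int example_setup_ci_name(struct inode *dir, const struct qstr *iname);
void example_free_ci_name(struct qstr *cf_name);
#else
static inline int example_setup_ci_name(struct inode *dir,
                                        const struct qstr *iname)
{
        return 0;
}

static inline void example_free_ci_name(struct qstr *cf_name)
{
}
#endif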
+ */ + if (IS_ENCRYPTED(parent) && fname->cf_name.name && + (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || + fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) + return false; + /* + * Treat comparison errors as not a match. The + * only case where it happens is on a disk + * corruption or ENOMEM. + */ + + return generic_ci_match(parent, fname->usr_fname, + &fname->cf_name, de->name, + de->name_len) > 0; } #endif @@ -1869,8 +1825,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi } } -#if IS_ENABLED(CONFIG_UNICODE) - if (!inode && IS_CASEFOLDED(dir)) { + if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as * well. For now, prevent the negative dentry @@ -1878,7 +1833,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi */ return NULL; } -#endif + return d_splice_alias(inode, dentry); } @@ -3208,16 +3163,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); -#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ - if (IS_CASEFOLDED(dir)) + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); -#endif end_rmdir: brelse(bh); @@ -3319,16 +3272,15 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) goto out_trace; retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); -#if IS_ENABLED(CONFIG_UNICODE) + /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. 
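ext4_match() above (and f2fs_match_name() further down) now delegate the casefolded, possibly-encrypted name comparison to the shared generic_ci_match() helper instead of keeping per-filesystem copies. Going by the call sites in this diff, it takes the parent inode, the user-supplied name, the pre-folded name and the raw directory-entry bytes, and returns a value greater than zero on a match, with negative errors treated here as "no match". A sketch of a call site; struct example_dirent and example_entry_matches() are invented for illustration:

#include <linux/fs.h>

/* Hypothetical on-disk directory entry layout. */
struct example_dirent {
        u8      name_len;
        u8      name[255];
};

static bool example_entry_matches(struct inode *dir,
                                  const struct qstr *usr_name,
                                  const struct qstr *folded_name,
                                  const struct example_dirent *de)
{
        int ret = generic_ci_match(dir, usr_name, folded_name,
                                   de->name, de->name_len);

        /* Errors (corruption, -ENOMEM) are simply treated as "no match". */
        return ret > 0;
}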
*/ - if (IS_CASEFOLDED(dir)) + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); -#endif out_trace: trace_ext4_unlink_exit(dentry, retval); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c682fb927b64..eb899628e121 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1721,8 +1721,8 @@ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), - fsparam_u32 ("resgid", Opt_resgid), - fsparam_u32 ("resuid", Opt_resuid), + fsparam_gid ("resgid", Opt_resgid), + fsparam_uid ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), @@ -2127,8 +2127,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) struct fs_parse_result result; const struct mount_opts *m; int is_remount; - kuid_t uid; - kgid_t gid; int token; token = fs_parse(fc, ext4_param_specs, param, &result); @@ -2270,23 +2268,11 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) { - ext4_msg(NULL, KERN_ERR, "Invalid uid value %d", - result.uint_32); - return -EINVAL; - } - ctx->s_resuid = uid; + ctx->s_resuid = result.uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) { - ext4_msg(NULL, KERN_ERR, "Invalid gid value %d", - result.uint_32); - return -EINVAL; - } - ctx->s_resgid = gid; + ctx->s_resgid = result.gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: @@ -3586,14 +3572,12 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } -#if !IS_ENABLED(CONFIG_UNICODE) - if (ext4_has_feature_casefold(sb)) { + if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } -#endif if (readonly) return 1; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index ec2aeccb69a3..8bffdeccdbc3 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -219,8 +219,7 @@ static int f2fs_acl_update_mode(struct mnt_idmap *idmap, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && - !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; *mode_p = mode; return 0; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 02c9355176d3..cbd7a5e96a37 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -42,35 +42,49 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } +#if IS_ENABLED(CONFIG_UNICODE) /* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. 
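The f2fs_acl_update_mode() hunk above swaps the open-coded group/CAP_FSETID test for the VFS helper in_group_or_capable(): strip S_ISGID when the caller is neither in the inode's (idmapped) group nor capable of CAP_FSETID over it, which is the same policy the removed two-line check implemented. A sketch, assuming the helper's declaration is visible to filesystems as the f2fs hunk relies on; example_clamp_setgid() is a made-up wrapper:

#include <linux/fs.h>

static umode_t example_clamp_setgid(struct mnt_idmap *idmap,
                                    const struct inode *inode, umode_t mode)
{
        /* Same policy as the open-coded check this hunk removes. */
        if (!in_group_or_capable(idmap, inode,
                                 i_gid_into_vfsgid(idmap, inode)))
                mode &= ~S_ISGID;
        return mode;
}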
*/ int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { -#if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = dir->i_sb; + unsigned char *buf; + int len; if (IS_CASEFOLDED(dir) && !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { - fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, - GFP_NOFS, false, F2FS_SB(sb)); - if (!fname->cf_name.name) + buf = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, + GFP_NOFS, false, F2FS_SB(sb)); + if (!buf) return -ENOMEM; - fname->cf_name.len = utf8_casefold(sb->s_encoding, - fname->usr_fname, - fname->cf_name.name, - F2FS_NAME_LEN); - if ((int)fname->cf_name.len <= 0) { - kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); - fname->cf_name.name = NULL; + + len = utf8_casefold(sb->s_encoding, fname->usr_fname, + buf, F2FS_NAME_LEN); + if (len <= 0) { + kmem_cache_free(f2fs_cf_name_slab, buf); if (sb_has_strict_encoding(sb)) return -EINVAL; /* fall back to treating name as opaque byte sequence */ + return 0; } + fname->cf_name.name = buf; + fname->cf_name.len = len; } -#endif + return 0; } +void f2fs_free_casefolded_name(struct f2fs_filename *fname) +{ + unsigned char *buf = (unsigned char *)fname->cf_name.name; + + if (buf) { + kmem_cache_free(f2fs_cf_name_slab, buf); + fname->cf_name.name = NULL; + } +} +#endif /* CONFIG_UNICODE */ + static int __f2fs_setup_filename(const struct inode *dir, const struct fscrypt_name *crypt_name, struct f2fs_filename *fname) @@ -142,12 +156,7 @@ void f2fs_free_filename(struct f2fs_filename *fname) kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; #endif -#if IS_ENABLED(CONFIG_UNICODE) - if (fname->cf_name.name) { - kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); - fname->cf_name.name = NULL; - } -#endif + f2fs_free_casefolded_name(fname); } static unsigned long dir_block_index(unsigned int level, @@ -176,58 +185,6 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, return f2fs_find_target_dentry(&d, fname, max_slots); } -#if IS_ENABLED(CONFIG_UNICODE) -/* - * Test whether a case-insensitive directory entry matches the filename - * being searched for. - * - * Returns 1 for a match, 0 for no match, and -errno on an error. - */ -static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, - const u8 *de_name, u32 de_name_len) -{ - const struct super_block *sb = dir->i_sb; - const struct unicode_map *um = sb->s_encoding; - struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); - struct qstr entry = QSTR_INIT(de_name, de_name_len); - int res; - - if (IS_ENCRYPTED(dir)) { - const struct fscrypt_str encrypted_name = - FSTR_INIT((u8 *)de_name, de_name_len); - - if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir))) - return -EINVAL; - - decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); - if (!decrypted_name.name) - return -ENOMEM; - res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name, - &decrypted_name); - if (res < 0) - goto out; - entry.name = decrypted_name.name; - entry.len = decrypted_name.len; - } - - res = utf8_strncasecmp_folded(um, name, &entry); - /* - * In strict mode, ignore invalid names. In non-strict mode, - * fall back to treating them as opaque byte sequences. 
- */ - if (res < 0 && !sb_has_strict_encoding(sb)) { - res = name->len == entry.len && - memcmp(name->name, entry.name, name->len) == 0; - } else { - /* utf8_strncasecmp_folded returns 0 on match */ - res = (res == 0); - } -out: - kfree(decrypted_name.name); - return res; -} -#endif /* CONFIG_UNICODE */ - static inline int f2fs_match_name(const struct inode *dir, const struct f2fs_filename *fname, const u8 *de_name, u32 de_name_len) @@ -235,11 +192,11 @@ static inline int f2fs_match_name(const struct inode *dir, struct fscrypt_name f; #if IS_ENABLED(CONFIG_UNICODE) - if (fname->cf_name.name) { - struct qstr cf = FSTR_TO_QSTR(&fname->cf_name); + if (fname->cf_name.name) + return generic_ci_match(dir, fname->usr_fname, + &fname->cf_name, + de_name, de_name_len); - return f2fs_match_ci_name(dir, &cf, de_name, de_name_len); - } #endif f.usr_fname = fname->usr_fname; f.disk_name = fname->disk_name; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1974b6aff397..8a9d910aa552 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -531,7 +531,7 @@ struct f2fs_filename { * internal operation where usr_fname is also NULL. In all these cases * we fall back to treating the name as an opaque byte sequence. */ - struct fscrypt_str cf_name; + struct qstr cf_name; #endif }; @@ -3533,8 +3533,22 @@ int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, /* * dir.c */ +#if IS_ENABLED(CONFIG_UNICODE) int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname); +void f2fs_free_casefolded_name(struct f2fs_filename *fname); +#else +static inline int f2fs_init_casefolded_name(const struct inode *dir, + struct f2fs_filename *fname) +{ + return 0; +} + +static inline void f2fs_free_casefolded_name(struct f2fs_filename *fname) +{ +} +#endif /* CONFIG_UNICODE */ + int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct f2fs_filename *fname); int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c0b281a70f3..c1ad9b278c47 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -185,7 +185,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - *pino = parent_ino(dentry); + *pino = d_parent_ino(dentry); dput(dentry); return 1; } @@ -923,10 +923,8 @@ static void __setattr_copy(struct mnt_idmap *idmap, inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); - if (!vfsgid_in_group_p(vfsgid) && - !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; set_acl_inode(inode, mode); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e54f8c08bda8..1ecde2b45e99 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -576,8 +576,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out_iput; } out_splice: -#if IS_ENABLED(CONFIG_UNICODE) - if (!inode && IS_CASEFOLDED(dir)) { + if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as * well. For now, prevent the negative dentry @@ -586,7 +585,7 @@ out_splice: trace_f2fs_lookup_end(dir, dentry, ino, err); return NULL; } -#endif + new = d_splice_alias(inode, dentry); trace_f2fs_lookup_end(dir, !IS_ERR_OR_NULL(new) ? new : dentry, ino, IS_ERR(new) ? 
PTR_ERR(new) : err); @@ -639,16 +638,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_delete_entry(de, page, dir, inode); f2fs_unlock_op(sbi); -#if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at f2fs_lookup(), when it is better * supported by the VFS for the CI case. */ - if (IS_CASEFOLDED(dir)) + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); -#endif + if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 496aee53c38a..8712e264071f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -46,10 +46,6 @@ static struct kmem_cache *fsync_entry_slab; -#if IS_ENABLED(CONFIG_UNICODE) -extern struct kmem_cache *f2fs_cf_name_slab; -#endif - bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); @@ -153,11 +149,8 @@ static int init_recovered_filename(const struct inode *dir, if (err) return err; f2fs_hash_filename(dir, fname); -#if IS_ENABLED(CONFIG_UNICODE) /* Case-sensitive match is fine for recovery */ - kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); - fname->cf_name.name = NULL; -#endif + f2fs_free_casefolded_name(fname); } else { f2fs_hash_filename(dir, fname); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1f1b3647a998..df4cf31f93df 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -321,7 +321,7 @@ struct kmem_cache *f2fs_cf_name_slab; static int __init f2fs_create_casefold_cache(void) { f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", - F2FS_NAME_LEN); + F2FS_NAME_LEN); return f2fs_cf_name_slab ? 0 : -ENOMEM; } @@ -1326,13 +1326,13 @@ default_check: return -EINVAL; } #endif -#if !IS_ENABLED(CONFIG_UNICODE) - if (f2fs_sb_has_casefold(sbi)) { + + if (!IS_ENABLED(CONFIG_UNICODE) && f2fs_sb_has_casefold(sbi)) { f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); return -EINVAL; } -#endif + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. 
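get_parent_ino() in the f2fs file.c hunk above is a consumer of the d_parent_ino() helper added to dcache.c earlier in this diff: unlike the old parent_ino() pattern, it reads the parent's inode number under RCU with a d_seq retry and only takes d_lock when the lockless read loses a race. A usage sketch mirroring the f2fs caller; example_record_parent() is a hypothetical name:

#include <linux/dcache.h>
#include <linux/fs.h>

/* Remember the parent directory's inode number, as f2fs does for fsync. */
static int example_record_parent(struct inode *inode, ino_t *pino)
{
        struct dentry *dentry = d_find_alias(inode);

        if (!dentry)
                return 0;
        *pino = d_parent_ino(dentry);   /* lockless fast path, d_lock fallback */
        dput(dentry);
        return 1;
}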
This is optional for host-aware diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 66cf4778cf3b..d3e426de5f01 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -7,6 +7,8 @@ #include <linux/hash.h> #include <linux/ratelimit.h> #include <linux/msdos_fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> /* * vfat shortname flags @@ -51,7 +53,8 @@ struct fat_mount_options { tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ discard:1, /* Issue discard requests on deletions */ - dos1xfloppy:1; /* Assume default BPB for DOS 1.x floppies */ + dos1xfloppy:1, /* Assume default BPB for DOS 1.x floppies */ + debug:1; /* Not currently used */ }; #define FAT_HASH_BITS 8 @@ -415,12 +418,21 @@ extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); extern struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos); extern int fat_sync_inode(struct inode *inode); -extern int fat_fill_super(struct super_block *sb, void *data, int silent, - int isvfat, void (*setup)(struct super_block *)); +extern int fat_fill_super(struct super_block *sb, struct fs_context *fc, + void (*setup)(struct super_block *)); extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); + +extern const struct fs_parameter_spec fat_param_spec[]; +int fat_init_fs_context(struct fs_context *fc, bool is_vfat); +void fat_free_fc(struct fs_context *fc); + +int fat_parse_param(struct fs_context *fc, struct fs_parameter *param, + bool is_vfat); +int fat_reconfigure(struct fs_context *fc); + static inline unsigned long fat_dir_hash(int logstart) { return hash_32(logstart, FAT_HASH_BITS); diff --git a/fs/fat/fat_test.c b/fs/fat/fat_test.c index 2dab4ca1d0d8..1f0062659067 100644 --- a/fs/fat/fat_test.c +++ b/fs/fat/fat_test.c @@ -193,4 +193,5 @@ static struct kunit_suite fat_test_suite = { kunit_test_suites(&fat_test_suite); +MODULE_DESCRIPTION("KUnit tests for FAT filesystems"); MODULE_LICENSE("GPL v2"); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index d9e6fbb6f246..19115fd2d2a4 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -16,7 +16,6 @@ #include <linux/mpage.h> #include <linux/vfs.h> #include <linux/seq_file.h> -#include <linux/parser.h> #include <linux/uio.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> @@ -804,16 +803,17 @@ static void __exit fat_destroy_inodecache(void) kmem_cache_destroy(fat_inode_cachep); } -static int fat_remount(struct super_block *sb, int *flags, char *data) +int fat_reconfigure(struct fs_context *fc) { bool new_rdonly; + struct super_block *sb = fc->root->d_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); - *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); + fc->sb_flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); sync_filesystem(sb); /* make sure we update state on remount. 
*/ - new_rdonly = *flags & SB_RDONLY; + new_rdonly = fc->sb_flags & SB_RDONLY; if (new_rdonly != sb_rdonly(sb)) { if (new_rdonly) fat_set_state(sb, 0, 0); @@ -822,6 +822,7 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) } return 0; } +EXPORT_SYMBOL_GPL(fat_reconfigure); static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -939,8 +940,6 @@ static const struct super_operations fat_sops = { .evict_inode = fat_evict_inode, .put_super = fat_put_super, .statfs = fat_statfs, - .remount_fs = fat_remount, - .show_options = fat_show_options, }; @@ -1037,355 +1036,282 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) } enum { - Opt_check_n, Opt_check_r, Opt_check_s, Opt_uid, Opt_gid, - Opt_umask, Opt_dmask, Opt_fmask, Opt_allow_utime, Opt_codepage, - Opt_usefree, Opt_nocase, Opt_quiet, Opt_showexec, Opt_debug, - Opt_immutable, Opt_dots, Opt_nodots, - Opt_charset, Opt_shortname_lower, Opt_shortname_win95, - Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, - Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, - Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, - Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, - Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy, + Opt_check, Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, + Opt_allow_utime, Opt_codepage, Opt_usefree, Opt_nocase, Opt_quiet, + Opt_showexec, Opt_debug, Opt_immutable, Opt_dots, Opt_dotsOK, + Opt_charset, Opt_shortname, Opt_utf8, Opt_utf8_bool, + Opt_uni_xl, Opt_uni_xl_bool, Opt_nonumtail, Opt_nonumtail_bool, + Opt_obsolete, Opt_flush, Opt_tz, Opt_rodir, Opt_errors, Opt_discard, + Opt_nfs, Opt_nfs_enum, Opt_time_offset, Opt_dos1xfloppy, }; -static const match_table_t fat_tokens = { - {Opt_check_r, "check=relaxed"}, - {Opt_check_s, "check=strict"}, - {Opt_check_n, "check=normal"}, - {Opt_check_r, "check=r"}, - {Opt_check_s, "check=s"}, - {Opt_check_n, "check=n"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_umask, "umask=%o"}, - {Opt_dmask, "dmask=%o"}, - {Opt_fmask, "fmask=%o"}, - {Opt_allow_utime, "allow_utime=%o"}, - {Opt_codepage, "codepage=%u"}, - {Opt_usefree, "usefree"}, - {Opt_nocase, "nocase"}, - {Opt_quiet, "quiet"}, - {Opt_showexec, "showexec"}, - {Opt_debug, "debug"}, - {Opt_immutable, "sys_immutable"}, - {Opt_flush, "flush"}, - {Opt_tz_utc, "tz=UTC"}, - {Opt_time_offset, "time_offset=%d"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_discard, "discard"}, - {Opt_nfs_stale_rw, "nfs"}, - {Opt_nfs_stale_rw, "nfs=stale_rw"}, - {Opt_nfs_nostale_ro, "nfs=nostale_ro"}, - {Opt_dos1xfloppy, "dos1xfloppy"}, - {Opt_obsolete, "conv=binary"}, - {Opt_obsolete, "conv=text"}, - {Opt_obsolete, "conv=auto"}, - {Opt_obsolete, "conv=b"}, - {Opt_obsolete, "conv=t"}, - {Opt_obsolete, "conv=a"}, - {Opt_obsolete, "fat=%u"}, - {Opt_obsolete, "blocksize=%u"}, - {Opt_obsolete, "cvf_format=%20s"}, - {Opt_obsolete, "cvf_options=%100s"}, - {Opt_obsolete, "posix"}, - {Opt_err, NULL}, -}; -static const match_table_t msdos_tokens = { - {Opt_nodots, "nodots"}, - {Opt_nodots, "dotsOK=no"}, - {Opt_dots, "dots"}, - {Opt_dots, "dotsOK=yes"}, - {Opt_err, NULL} +static const struct constant_table fat_param_check[] = { + {"relaxed", 'r'}, + {"r", 'r'}, + {"strict", 's'}, + {"s", 's'}, + {"normal", 'n'}, + {"n", 'n'}, + {} }; -static const match_table_t vfat_tokens = { - {Opt_charset, "iocharset=%s"}, - {Opt_shortname_lower, "shortname=lower"}, - 
{Opt_shortname_win95, "shortname=win95"}, - {Opt_shortname_winnt, "shortname=winnt"}, - {Opt_shortname_mixed, "shortname=mixed"}, - {Opt_utf8_no, "utf8=0"}, /* 0 or no or false */ - {Opt_utf8_no, "utf8=no"}, - {Opt_utf8_no, "utf8=false"}, - {Opt_utf8_yes, "utf8=1"}, /* empty or 1 or yes or true */ - {Opt_utf8_yes, "utf8=yes"}, - {Opt_utf8_yes, "utf8=true"}, - {Opt_utf8_yes, "utf8"}, - {Opt_uni_xl_no, "uni_xlate=0"}, /* 0 or no or false */ - {Opt_uni_xl_no, "uni_xlate=no"}, - {Opt_uni_xl_no, "uni_xlate=false"}, - {Opt_uni_xl_yes, "uni_xlate=1"}, /* empty or 1 or yes or true */ - {Opt_uni_xl_yes, "uni_xlate=yes"}, - {Opt_uni_xl_yes, "uni_xlate=true"}, - {Opt_uni_xl_yes, "uni_xlate"}, - {Opt_nonumtail_no, "nonumtail=0"}, /* 0 or no or false */ - {Opt_nonumtail_no, "nonumtail=no"}, - {Opt_nonumtail_no, "nonumtail=false"}, - {Opt_nonumtail_yes, "nonumtail=1"}, /* empty or 1 or yes or true */ - {Opt_nonumtail_yes, "nonumtail=yes"}, - {Opt_nonumtail_yes, "nonumtail=true"}, - {Opt_nonumtail_yes, "nonumtail"}, - {Opt_rodir, "rodir"}, - {Opt_err, NULL} + +static const struct constant_table fat_param_tz[] = { + {"UTC", 0}, + {} }; -static int parse_options(struct super_block *sb, char *options, int is_vfat, - int silent, int *debug, struct fat_mount_options *opts) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char *iocharset; +static const struct constant_table fat_param_errors[] = { + {"continue", FAT_ERRORS_CONT}, + {"panic", FAT_ERRORS_PANIC}, + {"remount-ro", FAT_ERRORS_RO}, + {} +}; - opts->isvfat = is_vfat; - opts->fs_uid = current_uid(); - opts->fs_gid = current_gid(); - opts->fs_fmask = opts->fs_dmask = current_umask(); - opts->allow_utime = -1; - opts->codepage = fat_default_codepage; - fat_reset_iocharset(opts); - if (is_vfat) { - opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; - opts->rodir = 0; - } else { - opts->shortname = 0; - opts->rodir = 1; - } - opts->name_check = 'n'; - opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; - opts->unicode_xlate = 0; - opts->numtail = 1; - opts->usefree = opts->nocase = 0; - opts->tz_set = 0; - opts->nfs = 0; - opts->errors = FAT_ERRORS_RO; - *debug = 0; +static const struct constant_table fat_param_nfs[] = { + {"stale_rw", FAT_NFS_STALE_RW}, + {"nostale_ro", FAT_NFS_NOSTALE_RO}, + {} +}; - opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat; +/* + * These are all obsolete but we still reject invalid options. + * The corresponding values are therefore meaningless. + */ +static const struct constant_table fat_param_conv[] = { + {"binary", 0}, + {"text", 0}, + {"auto", 0}, + {"b", 0}, + {"t", 0}, + {"a", 0}, + {} +}; - if (!options) - goto out; +/* Core options. 
See below for vfat and msdos extras */ +const struct fs_parameter_spec fat_param_spec[] = { + fsparam_enum ("check", Opt_check, fat_param_check), + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("umask", Opt_umask), + fsparam_u32oct ("dmask", Opt_dmask), + fsparam_u32oct ("fmask", Opt_fmask), + fsparam_u32oct ("allow_utime", Opt_allow_utime), + fsparam_u32 ("codepage", Opt_codepage), + fsparam_flag ("usefree", Opt_usefree), + fsparam_flag ("nocase", Opt_nocase), + fsparam_flag ("quiet", Opt_quiet), + fsparam_flag ("showexec", Opt_showexec), + fsparam_flag ("debug", Opt_debug), + fsparam_flag ("sys_immutable", Opt_immutable), + fsparam_flag ("flush", Opt_flush), + fsparam_enum ("tz", Opt_tz, fat_param_tz), + fsparam_s32 ("time_offset", Opt_time_offset), + fsparam_enum ("errors", Opt_errors, fat_param_errors), + fsparam_flag ("discard", Opt_discard), + fsparam_flag ("nfs", Opt_nfs), + fsparam_enum ("nfs", Opt_nfs_enum, fat_param_nfs), + fsparam_flag ("dos1xfloppy", Opt_dos1xfloppy), + __fsparam(fs_param_is_enum, "conv", + Opt_obsolete, fs_param_deprecated, fat_param_conv), + __fsparam(fs_param_is_u32, "fat", + Opt_obsolete, fs_param_deprecated, NULL), + __fsparam(fs_param_is_u32, "blocksize", + Opt_obsolete, fs_param_deprecated, NULL), + __fsparam(fs_param_is_string, "cvf_format", + Opt_obsolete, fs_param_deprecated, NULL), + __fsparam(fs_param_is_string, "cvf_options", + Opt_obsolete, fs_param_deprecated, NULL), + __fsparam(NULL, "posix", + Opt_obsolete, fs_param_deprecated, NULL), + {} +}; +EXPORT_SYMBOL_GPL(fat_param_spec); - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; +static const struct fs_parameter_spec msdos_param_spec[] = { + fsparam_flag_no ("dots", Opt_dots), + fsparam_bool ("dotsOK", Opt_dotsOK), + {} +}; - token = match_token(p, fat_tokens, args); - if (token == Opt_err) { - if (is_vfat) - token = match_token(p, vfat_tokens, args); - else - token = match_token(p, msdos_tokens, args); - } - switch (token) { - case Opt_check_s: - opts->name_check = 's'; - break; - case Opt_check_r: - opts->name_check = 'r'; - break; - case Opt_check_n: - opts->name_check = 'n'; - break; - case Opt_usefree: - opts->usefree = 1; - break; - case Opt_nocase: - if (!is_vfat) - opts->nocase = 1; - else { - /* for backward compatibility */ - opts->shortname = VFAT_SFN_DISPLAY_WIN95 - | VFAT_SFN_CREATE_WIN95; - } - break; - case Opt_quiet: - opts->quiet = 1; - break; - case Opt_showexec: - opts->showexec = 1; - break; - case Opt_debug: - *debug = 1; - break; - case Opt_immutable: - opts->sys_immutable = 1; - break; - case Opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - opts->fs_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(opts->fs_uid)) - return -EINVAL; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - opts->fs_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(opts->fs_gid)) - return -EINVAL; - break; - case Opt_umask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_fmask = opts->fs_dmask = option; - break; - case Opt_dmask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_dmask = option; - break; - case Opt_fmask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_fmask = option; - break; - case Opt_allow_utime: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->allow_utime = option & (S_IWGRP | S_IWOTH); - break; - case Opt_codepage: - if (match_int(&args[0], &option)) - return -EINVAL; - 
opts->codepage = option; - break; - case Opt_flush: - opts->flush = 1; - break; - case Opt_time_offset: - if (match_int(&args[0], &option)) - return -EINVAL; - /* - * GMT+-12 zones may have DST corrections so at least - * 13 hours difference is needed. Make the limit 24 - * just in case someone invents something unusual. - */ - if (option < -24 * 60 || option > 24 * 60) - return -EINVAL; - opts->tz_set = 1; - opts->time_offset = option; - break; - case Opt_tz_utc: - opts->tz_set = 1; - opts->time_offset = 0; - break; - case Opt_err_cont: - opts->errors = FAT_ERRORS_CONT; - break; - case Opt_err_panic: - opts->errors = FAT_ERRORS_PANIC; - break; - case Opt_err_ro: - opts->errors = FAT_ERRORS_RO; - break; - case Opt_nfs_stale_rw: - opts->nfs = FAT_NFS_STALE_RW; - break; - case Opt_nfs_nostale_ro: - opts->nfs = FAT_NFS_NOSTALE_RO; - break; - case Opt_dos1xfloppy: - opts->dos1xfloppy = 1; - break; +static const struct constant_table fat_param_shortname[] = { + {"lower", VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95}, + {"win95", VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95}, + {"winnt", VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT}, + {"mixed", VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95}, + {} +}; - /* msdos specific */ - case Opt_dots: - opts->dotsOK = 1; - break; - case Opt_nodots: - opts->dotsOK = 0; - break; +static const struct fs_parameter_spec vfat_param_spec[] = { + fsparam_string ("iocharset", Opt_charset), + fsparam_enum ("shortname", Opt_shortname, fat_param_shortname), + fsparam_flag ("utf8", Opt_utf8), + fsparam_bool ("utf8", Opt_utf8_bool), + fsparam_flag ("uni_xlate", Opt_uni_xl), + fsparam_bool ("uni_xlate", Opt_uni_xl_bool), + fsparam_flag ("nonumtail", Opt_nonumtail), + fsparam_bool ("nonumtail", Opt_nonumtail_bool), + fsparam_flag ("rodir", Opt_rodir), + {} +}; - /* vfat specific */ - case Opt_charset: - fat_reset_iocharset(opts); - iocharset = match_strdup(&args[0]); - if (!iocharset) - return -ENOMEM; - opts->iocharset = iocharset; - break; - case Opt_shortname_lower: - opts->shortname = VFAT_SFN_DISPLAY_LOWER - | VFAT_SFN_CREATE_WIN95; - break; - case Opt_shortname_win95: - opts->shortname = VFAT_SFN_DISPLAY_WIN95 - | VFAT_SFN_CREATE_WIN95; - break; - case Opt_shortname_winnt: - opts->shortname = VFAT_SFN_DISPLAY_WINNT - | VFAT_SFN_CREATE_WINNT; - break; - case Opt_shortname_mixed: - opts->shortname = VFAT_SFN_DISPLAY_WINNT - | VFAT_SFN_CREATE_WIN95; - break; - case Opt_utf8_no: /* 0 or no or false */ - opts->utf8 = 0; - break; - case Opt_utf8_yes: /* empty or 1 or yes or true */ - opts->utf8 = 1; - break; - case Opt_uni_xl_no: /* 0 or no or false */ - opts->unicode_xlate = 0; - break; - case Opt_uni_xl_yes: /* empty or 1 or yes or true */ - opts->unicode_xlate = 1; - break; - case Opt_nonumtail_no: /* 0 or no or false */ - opts->numtail = 1; /* negated option */ - break; - case Opt_nonumtail_yes: /* empty or 1 or yes or true */ - opts->numtail = 0; /* negated option */ - break; - case Opt_rodir: - opts->rodir = 1; - break; - case Opt_discard: - opts->discard = 1; - break; +int fat_parse_param(struct fs_context *fc, struct fs_parameter *param, + bool is_vfat) +{ + struct fat_mount_options *opts = fc->fs_private; + struct fs_parse_result result; + int opt; - /* obsolete mount options */ - case Opt_obsolete: - fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, " - "not supported now", p); - break; - /* unknown option */ - default: - if (!silent) { - fat_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" " - "or missing value", p); - } - return -EINVAL; - } - } + 
/* remount options have traditionally been ignored */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + return 0; -out: - /* UTF-8 doesn't provide FAT semantics */ - if (!strcmp(opts->iocharset, "utf8")) { - fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset" - " for FAT filesystems, filesystem will be " - "case sensitive!"); + opt = fs_parse(fc, fat_param_spec, param, &result); + /* If option not found in fat_param_spec, try vfat/msdos options */ + if (opt == -ENOPARAM) { + if (is_vfat) + opt = fs_parse(fc, vfat_param_spec, param, &result); + else + opt = fs_parse(fc, msdos_param_spec, param, &result); } - /* If user doesn't specify allow_utime, it's initialized from dmask. */ - if (opts->allow_utime == (unsigned short)-1) - opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); - if (opts->unicode_xlate) - opts->utf8 = 0; - if (opts->nfs == FAT_NFS_NOSTALE_RO) { - sb->s_flags |= SB_RDONLY; - sb->s_export_op = &fat_export_ops_nostale; + if (opt < 0) + return opt; + + switch (opt) { + case Opt_check: + opts->name_check = result.uint_32; + break; + case Opt_usefree: + opts->usefree = 1; + break; + case Opt_nocase: + if (!is_vfat) + opts->nocase = 1; + else { + /* for backward compatibility */ + opts->shortname = VFAT_SFN_DISPLAY_WIN95 + | VFAT_SFN_CREATE_WIN95; + } + break; + case Opt_quiet: + opts->quiet = 1; + break; + case Opt_showexec: + opts->showexec = 1; + break; + case Opt_debug: + opts->debug = 1; + break; + case Opt_immutable: + opts->sys_immutable = 1; + break; + case Opt_uid: + opts->fs_uid = result.uid; + break; + case Opt_gid: + opts->fs_gid = result.gid; + break; + case Opt_umask: + opts->fs_fmask = opts->fs_dmask = result.uint_32; + break; + case Opt_dmask: + opts->fs_dmask = result.uint_32; + break; + case Opt_fmask: + opts->fs_fmask = result.uint_32; + break; + case Opt_allow_utime: + opts->allow_utime = result.uint_32 & (S_IWGRP | S_IWOTH); + break; + case Opt_codepage: + opts->codepage = result.uint_32; + break; + case Opt_flush: + opts->flush = 1; + break; + case Opt_time_offset: + /* + * GMT+-12 zones may have DST corrections so at least + * 13 hours difference is needed. Make the limit 24 + * just in case someone invents something unusual. 
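fat_parse_param() above illustrates two small fs_context conventions: remount requests are recognised via fc->purpose == FS_CONTEXT_FOR_RECONFIGURE (FAT keeps its historical behaviour of ignoring options there), and an option missing from the shared spec makes fs_parse() return -ENOPARAM, which is the cue to retry against the vfat- or msdos-specific table. A trimmed sketch of that dispatch with invented example_* specs:

#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum { Opt_common_flag, Opt_special_flag };

static const struct fs_parameter_spec example_common_spec[] = {
        fsparam_flag("common", Opt_common_flag),
        {}
};

static const struct fs_parameter_spec example_flavour_spec[] = {
        fsparam_flag("special", Opt_special_flag),
        {}
};

static int example_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct fs_parse_result result;
        int opt;

        /* Mirror FAT: options on remount are accepted but ignored. */
        if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
                return 0;

        opt = fs_parse(fc, example_common_spec, param, &result);
        if (opt == -ENOPARAM)   /* not a shared option: try the flavour table */
                opt = fs_parse(fc, example_flavour_spec, param, &result);
        if (opt < 0)
                return opt;

        /* switch (opt) { ... } as fat_parse_param() does */
        return 0;
}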
+ */ + if (result.int_32 < -24 * 60 || result.int_32 > 24 * 60) + return -EINVAL; + opts->tz_set = 1; + opts->time_offset = result.int_32; + break; + case Opt_tz: + opts->tz_set = 1; + opts->time_offset = result.uint_32; + break; + case Opt_errors: + opts->errors = result.uint_32; + break; + case Opt_nfs: + opts->nfs = FAT_NFS_STALE_RW; + break; + case Opt_nfs_enum: + opts->nfs = result.uint_32; + break; + case Opt_dos1xfloppy: + opts->dos1xfloppy = 1; + break; + + /* msdos specific */ + case Opt_dots: /* dots / nodots */ + opts->dotsOK = !result.negated; + break; + case Opt_dotsOK: /* dotsOK = yes/no */ + opts->dotsOK = result.boolean; + break; + + /* vfat specific */ + case Opt_charset: + fat_reset_iocharset(opts); + opts->iocharset = param->string; + param->string = NULL; /* Steal string */ + break; + case Opt_shortname: + opts->shortname = result.uint_32; + break; + case Opt_utf8: + opts->utf8 = 1; + break; + case Opt_utf8_bool: + opts->utf8 = result.boolean; + break; + case Opt_uni_xl: + opts->unicode_xlate = 1; + break; + case Opt_uni_xl_bool: + opts->unicode_xlate = result.boolean; + break; + case Opt_nonumtail: + opts->numtail = 0; /* negated option */ + break; + case Opt_nonumtail_bool: + opts->numtail = !result.boolean; /* negated option */ + break; + case Opt_rodir: + opts->rodir = 1; + break; + case Opt_discard: + opts->discard = 1; + break; + + /* obsolete mount options */ + case Opt_obsolete: + printk(KERN_INFO "FAT-fs: \"%s\" option is obsolete, " + "not supported now", param->key); + break; + default: + return -EINVAL; } return 0; } +EXPORT_SYMBOL_GPL(fat_parse_param); static int fat_read_root(struct inode *inode) { @@ -1604,9 +1530,11 @@ out: /* * Read the super block of an MS-DOS FS. */ -int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, +int fat_fill_super(struct super_block *sb, struct fs_context *fc, void (*setup)(struct super_block *)) { + struct fat_mount_options *opts = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct inode *root_inode = NULL, *fat_inode = NULL; struct inode *fsinfo_inode = NULL; struct buffer_head *bh; @@ -1614,7 +1542,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, struct msdos_sb_info *sbi; u16 logical_sector_size; u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors; - int debug; long error; char buf[50]; struct timespec64 ts; @@ -1643,9 +1570,27 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options); - if (error) - goto out_fail; + /* UTF-8 doesn't provide FAT semantics */ + if (!strcmp(opts->iocharset, "utf8")) { + fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset" + " for FAT filesystems, filesystem will be" + " case sensitive!"); + } + + /* If user doesn't specify allow_utime, it's initialized from dmask. 
*/ + if (opts->allow_utime == (unsigned short)-1) + opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); + if (opts->unicode_xlate) + opts->utf8 = 0; + if (opts->nfs == FAT_NFS_NOSTALE_RO) { + sb->s_flags |= SB_RDONLY; + sb->s_export_op = &fat_export_ops_nostale; + } + + /* Apply parsed options to sbi (structure copy) */ + sbi->options = *opts; + /* Transfer ownership of iocharset to sbi->options */ + opts->iocharset = NULL; setup(sb); /* flavour-specific stuff that needs options */ @@ -1950,6 +1895,57 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) } EXPORT_SYMBOL_GPL(fat_flush_inodes); +int fat_init_fs_context(struct fs_context *fc, bool is_vfat) +{ + struct fat_mount_options *opts; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + opts->isvfat = is_vfat; + opts->fs_uid = current_uid(); + opts->fs_gid = current_gid(); + opts->fs_fmask = opts->fs_dmask = current_umask(); + opts->allow_utime = -1; + opts->codepage = fat_default_codepage; + fat_reset_iocharset(opts); + if (is_vfat) { + opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; + opts->rodir = 0; + } else { + opts->shortname = 0; + opts->rodir = 1; + } + opts->name_check = 'n'; + opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; + opts->unicode_xlate = 0; + opts->numtail = 1; + opts->usefree = opts->nocase = 0; + opts->tz_set = 0; + opts->nfs = 0; + opts->errors = FAT_ERRORS_RO; + opts->debug = 0; + + opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat; + + fc->fs_private = opts; + /* fc->ops assigned by caller */ + + return 0; +} +EXPORT_SYMBOL_GPL(fat_init_fs_context); + +void fat_free_fc(struct fs_context *fc) +{ + struct fat_mount_options *opts = fc->fs_private; + + if (opts->iocharset != fat_default_iocharset) + kfree(opts->iocharset); + kfree(fc->fs_private); +} +EXPORT_SYMBOL_GPL(fat_free_fc); + static int __init init_fat_fs(void) { int err; @@ -1978,4 +1974,5 @@ static void __exit exit_fat_fs(void) module_init(init_fat_fs) module_exit(exit_fat_fs) +MODULE_DESCRIPTION("Core FAT filesystem support"); MODULE_LICENSE("GPL"); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 2116c486843b..f06f6ba643cc 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -650,24 +650,48 @@ static void setup(struct super_block *sb) sb->s_flags |= SB_NOATIME; } -static int msdos_fill_super(struct super_block *sb, void *data, int silent) +static int msdos_fill_super(struct super_block *sb, struct fs_context *fc) { - return fat_fill_super(sb, data, silent, 0, setup); + return fat_fill_super(sb, fc, setup); } -static struct dentry *msdos_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int msdos_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super); + return get_tree_bdev(fc, msdos_fill_super); +} + +static int msdos_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + return fat_parse_param(fc, param, false); +} + +static const struct fs_context_operations msdos_context_ops = { + .parse_param = msdos_parse_param, + .get_tree = msdos_get_tree, + .reconfigure = fat_reconfigure, + .free = fat_free_fc, +}; + +static int msdos_init_fs_context(struct fs_context *fc) +{ + int err; + + /* Initialize with is_vfat == false */ + err = fat_init_fs_context(fc, false); + if (err) + return err; + + fc->ops = &msdos_context_ops; + return 0; } static struct file_system_type msdos_fs_type = { .owner = THIS_MODULE, .name 
= "msdos", - .mount = msdos_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .init_fs_context = msdos_init_fs_context, + .parameters = fat_param_spec, }; MODULE_ALIAS_FS("msdos"); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index c4d00999a433..6423e1dedf14 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1195,24 +1195,48 @@ static void setup(struct super_block *sb) sb->s_d_op = &vfat_dentry_ops; } -static int vfat_fill_super(struct super_block *sb, void *data, int silent) +static int vfat_fill_super(struct super_block *sb, struct fs_context *fc) { - return fat_fill_super(sb, data, silent, 1, setup); + return fat_fill_super(sb, fc, setup); } -static struct dentry *vfat_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int vfat_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super); + return get_tree_bdev(fc, vfat_fill_super); +} + +static int vfat_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + return fat_parse_param(fc, param, true); +} + +static const struct fs_context_operations vfat_context_ops = { + .parse_param = vfat_parse_param, + .get_tree = vfat_get_tree, + .reconfigure = fat_reconfigure, + .free = fat_free_fc, +}; + +static int vfat_init_fs_context(struct fs_context *fc) +{ + int err; + + /* Initialize with is_vfat == true */ + err = fat_init_fs_context(fc, true); + if (err) + return err; + + fc->ops = &vfat_context_ops; + return 0; } static struct file_system_type vfat_fs_type = { .owner = THIS_MODULE, .name = "vfat", - .mount = vfat_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .init_fs_context = vfat_init_fs_context, + .parameters = fat_param_spec, }; MODULE_ALIAS_FS("vfat"); diff --git a/fs/fhandle.c b/fs/fhandle.c index 8a7f86c2139a..6e8cea16790e 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -115,88 +115,188 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, return err; } -static struct vfsmount *get_vfsmount_from_fd(int fd) +static int get_path_from_fd(int fd, struct path *root) { - struct vfsmount *mnt; - if (fd == AT_FDCWD) { struct fs_struct *fs = current->fs; spin_lock(&fs->lock); - mnt = mntget(fs->pwd.mnt); + *root = fs->pwd; + path_get(root); spin_unlock(&fs->lock); } else { struct fd f = fdget(fd); if (!f.file) - return ERR_PTR(-EBADF); - mnt = mntget(f.file->f_path.mnt); + return -EBADF; + *root = f.file->f_path; + path_get(root); fdput(f); } - return mnt; + + return 0; } +enum handle_to_path_flags { + HANDLE_CHECK_PERMS = (1 << 0), + HANDLE_CHECK_SUBTREE = (1 << 1), +}; + +struct handle_to_path_ctx { + struct path root; + enum handle_to_path_flags flags; + unsigned int fh_flags; +}; + static int vfs_dentry_acceptable(void *context, struct dentry *dentry) { - return 1; + struct handle_to_path_ctx *ctx = context; + struct user_namespace *user_ns = current_user_ns(); + struct dentry *d, *root = ctx->root.dentry; + struct mnt_idmap *idmap = mnt_idmap(ctx->root.mnt); + int retval = 0; + + if (!root) + return 1; + + /* Old permission model with global CAP_DAC_READ_SEARCH. */ + if (!ctx->flags) + return 1; + + /* + * It's racy as we're not taking rename_lock but we're able to ignore + * permissions and we just need an approximation whether we were able + * to follow a path to the file. + * + * It's also potentially expensive on some filesystems especially if + * there is a deep path. 
+ */ + d = dget(dentry); + while (d != root && !IS_ROOT(d)) { + struct dentry *parent = dget_parent(d); + + /* + * We know that we have the ability to override DAC permissions + * as we've verified this earlier via CAP_DAC_READ_SEARCH. But + * we also need to make sure that there aren't any unmapped + * inodes in the path that would prevent us from reaching the + * file. + */ + if (!privileged_wrt_inode_uidgid(user_ns, idmap, + d_inode(parent))) { + dput(d); + dput(parent); + return retval; + } + + dput(d); + d = parent; + } + + if (!(ctx->flags & HANDLE_CHECK_SUBTREE) || d == root) + retval = 1; + WARN_ON_ONCE(d != root && d != root->d_sb->s_root); + dput(d); + return retval; } -static int do_handle_to_path(int mountdirfd, struct file_handle *handle, - struct path *path) +static int do_handle_to_path(struct file_handle *handle, struct path *path, + struct handle_to_path_ctx *ctx) { - int retval = 0; int handle_dwords; + struct vfsmount *mnt = ctx->root.mnt; - path->mnt = get_vfsmount_from_fd(mountdirfd); - if (IS_ERR(path->mnt)) { - retval = PTR_ERR(path->mnt); - goto out_err; - } /* change the handle size to multiple of sizeof(u32) */ handle_dwords = handle->handle_bytes >> 2; - path->dentry = exportfs_decode_fh(path->mnt, + path->dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle, handle_dwords, handle->handle_type, - vfs_dentry_acceptable, NULL); - if (IS_ERR(path->dentry)) { - retval = PTR_ERR(path->dentry); - goto out_mnt; + ctx->fh_flags, + vfs_dentry_acceptable, ctx); + if (IS_ERR_OR_NULL(path->dentry)) { + if (path->dentry == ERR_PTR(-ENOMEM)) + return -ENOMEM; + return -ESTALE; } + path->mnt = mntget(mnt); return 0; -out_mnt: - mntput(path->mnt); -out_err: - return retval; +} + +/* + * Allow relaxed permissions of file handles if the caller has the + * ability to mount the filesystem or create a bind-mount of the + * provided @mountdirfd. + * + * In both cases the caller may be able to get an unobstructed way to + * the encoded file handle. If the caller is only able to create a + * bind-mount we need to verify that there are no locked mounts on top + * of it that could prevent us from getting to the encoded file. + * + * In principle, locked mounts can prevent the caller from mounting the + * filesystem but that only applies to procfs and sysfs neither of which + * support decoding file handles. + */ +static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, + unsigned int o_flags) +{ + struct path *root = &ctx->root; + + /* + * Restrict to O_DIRECTORY to provide a deterministic API that avoids a + * confusing api in the face of disconnected non-dir dentries. + * + * There's only one dentry for each directory inode (VFS rule)... + */ + if (!(o_flags & O_DIRECTORY)) + return false; + + if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) + ctx->flags = HANDLE_CHECK_PERMS; + else if (is_mounted(root->mnt) && + ns_capable(real_mount(root->mnt)->mnt_ns->user_ns, + CAP_SYS_ADMIN) && + !has_locked_children(real_mount(root->mnt), root->dentry)) + ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; + else + return false; + + /* Are we able to override DAC permissions? 
*/ + if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH)) + return false; + + ctx->fh_flags = EXPORT_FH_DIR_ONLY; + return true; } static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, - struct path *path) + struct path *path, unsigned int o_flags) { int retval = 0; struct file_handle f_handle; struct file_handle *handle = NULL; + struct handle_to_path_ctx ctx = {}; - /* - * With handle we don't look at the execute bit on the - * directory. Ideally we would like CAP_DAC_SEARCH. - * But we don't have that - */ - if (!capable(CAP_DAC_READ_SEARCH)) { - retval = -EPERM; + retval = get_path_from_fd(mountdirfd, &ctx.root); + if (retval) goto out_err; + + if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) { + retval = -EPERM; + goto out_path; } + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { retval = -EFAULT; - goto out_err; + goto out_path; } if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || (f_handle.handle_bytes == 0)) { retval = -EINVAL; - goto out_err; + goto out_path; } handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { retval = -ENOMEM; - goto out_err; + goto out_path; } /* copy the full handle */ *handle = f_handle; @@ -207,10 +307,12 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, goto out_handle; } - retval = do_handle_to_path(mountdirfd, handle, path); + retval = do_handle_to_path(handle, path, &ctx); out_handle: kfree(handle); +out_path: + path_put(&ctx.root); out_err: return retval; } @@ -223,7 +325,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, struct file *file; int fd; - retval = handle_to_path(mountdirfd, ufh, &path); + retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) return retval; diff --git a/fs/fs_parser.c b/fs/fs_parser.c index a4d6ca0b8971..24727ec34e5a 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -308,6 +308,40 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, } EXPORT_SYMBOL(fs_param_is_fd); +int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p, + struct fs_parameter *param, struct fs_parse_result *result) +{ + kuid_t uid; + + if (fs_param_is_u32(log, p, param, result) != 0) + return fs_param_bad_value(log, param); + + uid = make_kuid(current_user_ns(), result->uint_32); + if (!uid_valid(uid)) + return inval_plog(log, "Invalid uid '%s'", param->string); + + result->uid = uid; + return 0; +} +EXPORT_SYMBOL(fs_param_is_uid); + +int fs_param_is_gid(struct p_log *log, const struct fs_parameter_spec *p, + struct fs_parameter *param, struct fs_parse_result *result) +{ + kgid_t gid; + + if (fs_param_is_u32(log, p, param, result) != 0) + return fs_param_bad_value(log, param); + + gid = make_kgid(current_user_ns(), result->uint_32); + if (!gid_valid(gid)) + return inval_plog(log, "Invalid gid '%s'", param->string); + + result->gid = gid; + return 0; +} +EXPORT_SYMBOL(fs_param_is_gid); + int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { diff --git a/fs/fsopen.c b/fs/fsopen.c index 6593ae518115..ed2dd000622e 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -220,10 +220,6 @@ static int vfs_cmd_create(struct fs_context *fc, bool exclusive) if (!mount_capable(fc)) return -EPERM; - /* require the new mount api */ - if (exclusive && fc->ops == &legacy_fs_context_ops) - return -EOPNOTSUPP; - fc->phase = FS_CONTEXT_CREATING; fc->exclusive = exclusive; @@ -411,6 
+407,7 @@ SYSCALL_DEFINE5(fsconfig, case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: case FSCONFIG_SET_FD: + case FSCONFIG_CMD_CREATE_EXCL: ret = -EOPNOTSUPP; goto out_f; } @@ -451,7 +448,7 @@ SYSCALL_DEFINE5(fsconfig, fallthrough; case FSCONFIG_SET_PATH: param.type = fs_value_is_filename; - param.name = getname_flags(_value, lookup_flags, NULL); + param.name = getname_flags(_value, lookup_flags); if (IS_ERR(param.name)) { ret = PTR_ERR(param.name); goto out_key; diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 3d192b80a561..04cfd8fee992 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -146,8 +146,8 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, * be stripped. */ if (fc->posix_acl && - !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) && - !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) + !in_group_or_capable(&nop_mnt_idmap, inode, + i_gid_into_vfsgid(&nop_mnt_idmap, inode))) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 99e44ea7d875..d8ab4e93916f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -740,8 +740,8 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = { fsparam_string ("source", OPT_SOURCE), fsparam_u32 ("fd", OPT_FD), fsparam_u32oct ("rootmode", OPT_ROOTMODE), - fsparam_u32 ("user_id", OPT_USER_ID), - fsparam_u32 ("group_id", OPT_GROUP_ID), + fsparam_uid ("user_id", OPT_USER_ID), + fsparam_gid ("group_id", OPT_GROUP_ID), fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS), fsparam_flag ("allow_other", OPT_ALLOW_OTHER), fsparam_u32 ("max_read", OPT_MAX_READ), @@ -755,6 +755,8 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) struct fs_parse_result result; struct fuse_fs_context *ctx = fsc->fs_private; int opt; + kuid_t kuid; + kgid_t kgid; if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { /* @@ -799,16 +801,26 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) break; case OPT_USER_ID: - ctx->user_id = make_kuid(fsc->user_ns, result.uint_32); - if (!uid_valid(ctx->user_id)) + kuid = result.uid; + /* + * The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fsc->user_ns, kuid)) return invalfc(fsc, "Invalid user_id"); + ctx->user_id = kuid; ctx->user_id_present = true; break; case OPT_GROUP_ID: - ctx->group_id = make_kgid(fsc->user_ns, result.uint_32); - if (!gid_valid(ctx->group_id)) + kgid = result.gid; + /* + * The requested gid must be representable in the + * filesystem's idmapping. 
+ */ + if (!kgid_has_mapping(fsc->user_ns, kgid)) return invalfc(fsc, "Invalid group_id"); + ctx->group_id = kgid; ctx->group_id_present = true; break; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 8c34798a0715..744e10b46904 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -200,6 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; HFS_I(inode)->fs_blocks = 0; + HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; if (S_ISDIR(mode)) { inode->i_size = 2; HFS_SB(sb)->folder_count++; @@ -275,6 +276,8 @@ void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, for (count = 0, i = 0; i < 3; i++) count += be16_to_cpu(ext[i].count); HFS_I(inode)->first_blocks = count; + HFS_I(inode)->cached_start = 0; + HFS_I(inode)->cached_blocks = 0; inode->i_size = HFS_I(inode)->phys_size = log_size; HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 6764afa98a6f..eeac99765f0d 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -28,6 +28,7 @@ static struct kmem_cache *hfs_inode_cachep; +MODULE_DESCRIPTION("Apple Macintosh file system support"); MODULE_LICENSE("GPL"); static int hfs_sync_fs(struct super_block *sb, int wait) diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index ca2ba8c9f82e..901e83d65d20 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -25,19 +25,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) fd->key = ptr + tree->max_key_len + 2; hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); - switch (tree->cnid) { - case HFSPLUS_CAT_CNID: - mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); - break; - case HFSPLUS_EXT_CNID: - mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); - break; - case HFSPLUS_ATTR_CNID: - mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); - break; - default: - BUG(); - } + mutex_lock_nested(&tree->tree_lock, + hfsplus_btree_lock_class(tree)); return 0; } diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 3c572e44f2ad..9c51867dddc5 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -430,7 +430,8 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, hfsplus_free_extents(sb, ext_entry, total_blocks - start, total_blocks); total_blocks = start; - mutex_lock(&fd.tree->tree_lock); + mutex_lock_nested(&fd.tree->tree_lock, + hfsplus_btree_lock_class(fd.tree)); } while (total_blocks > blocks); hfs_find_exit(&fd); @@ -592,7 +593,8 @@ void hfsplus_file_truncate(struct inode *inode) alloc_cnt, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->first_extents); hip->first_blocks = blk_cnt; - mutex_lock(&fd.tree->tree_lock); + mutex_lock_nested(&fd.tree->tree_lock, + hfsplus_btree_lock_class(fd.tree)); break; } res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); @@ -606,7 +608,8 @@ void hfsplus_file_truncate(struct inode *inode) hfsplus_free_extents(sb, hip->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->cached_extents); - mutex_lock(&fd.tree->tree_lock); + mutex_lock_nested(&fd.tree->tree_lock, + hfsplus_btree_lock_class(fd.tree)); if (blk_cnt > start) { hip->extent_state |= HFSPLUS_EXT_DIRTY; break; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 012a3d003fbe..9e78f181c24f 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -553,6 +553,27 @@ static inline __be32 __hfsp_ut2mt(time64_t ut) return 
cpu_to_be32(lower_32_bits(ut) + HFSPLUS_UTC_OFFSET); } +static inline enum hfsplus_btree_mutex_classes +hfsplus_btree_lock_class(struct hfs_btree *tree) +{ + enum hfsplus_btree_mutex_classes class; + + switch (tree->cnid) { + case HFSPLUS_CAT_CNID: + class = CATALOG_BTREE_MUTEX; + break; + case HFSPLUS_EXT_CNID: + class = EXTENTS_BTREE_MUTEX; + break; + case HFSPLUS_ATTR_CNID: + class = ATTR_BTREE_MUTEX; + break; + default: + BUG(); + } + return class; +} + /* compatibility */ #define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) } #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 5661a2e24d03..40d04dba13ac 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -40,7 +40,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) /* Directory containing the bootable system */ vh->finder_info[0] = bvh->finder_info[0] = - cpu_to_be32(parent_ino(dentry)); + cpu_to_be32(d_parent_ino(dentry)); /* * Bootloader. Just using the inode here breaks in the case of @@ -51,7 +51,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) /* Per spec, the OS X system folder - same as finder_info[0] here */ vh->finder_info[5] = bvh->finder_info[5] = - cpu_to_be32(parent_ino(dentry)); + cpu_to_be32(d_parent_ino(dentry)); mutex_unlock(&sbi->vh_mutex); return 0; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 5a400259ae74..9a1a93e3888b 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -696,7 +696,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) return err; } - strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + + strbuf = kzalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); if (!strbuf) { res = -ENOMEM; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index a73d27c4dd58..3eb747d26924 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -16,11 +16,16 @@ #include <linux/seq_file.h> #include <linux/writeback.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include <linux/namei.h> #include "hostfs.h" #include <init.h> #include <kern.h> +struct hostfs_fs_info { + char *host_root_path; +}; + struct hostfs_inode_info { int fd; fmode_t mode; @@ -90,8 +95,10 @@ static char *__dentry_name(struct dentry *dentry, char *name) char *p = dentry_path_raw(dentry, name, PATH_MAX); char *root; size_t len; + struct hostfs_fs_info *fsi; - root = dentry->d_sb->s_fs_info; + fsi = dentry->d_sb->s_fs_info; + root = fsi->host_root_path; len = strlen(root); if (IS_ERR(p)) { __putname(name); @@ -196,8 +203,10 @@ static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) long long f_bavail; long long f_files; long long f_ffree; + struct hostfs_fs_info *fsi; - err = do_statfs(dentry->d_sb->s_fs_info, + fsi = dentry->d_sb->s_fs_info; + err = do_statfs(fsi->host_root_path, &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), &sf->f_namelen); @@ -245,7 +254,11 @@ static void hostfs_free_inode(struct inode *inode) static int hostfs_show_options(struct seq_file *seq, struct dentry *root) { - const char *root_path = root->d_sb->s_fs_info; + struct hostfs_fs_info *fsi; + const char *root_path; + + fsi = root->d_sb->s_fs_info; + root_path = fsi->host_root_path; size_t offset = strlen(root_ino) + 1; if (strlen(root_path) > offset) @@ -432,31 +445,20 @@ static int hostfs_writepage(struct page *page, struct writeback_control 
*wbc) static int hostfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; char *buffer; - loff_t start = page_offset(page); + loff_t start = folio_pos(folio); int bytes_read, ret = 0; - buffer = kmap_local_page(page); + buffer = kmap_local_folio(folio, 0); bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer, PAGE_SIZE); - if (bytes_read < 0) { - ClearPageUptodate(page); - SetPageError(page); + if (bytes_read < 0) ret = bytes_read; - goto out; - } - - memset(buffer + bytes_read, 0, PAGE_SIZE - bytes_read); - - ClearPageError(page); - SetPageUptodate(page); - - out: - flush_dcache_page(page); + else + buffer = folio_zero_tail(folio, bytes_read, buffer); kunmap_local(buffer); - unlock_page(page); + folio_end_read(folio, ret == 0); return ret; } @@ -922,10 +924,11 @@ static const struct inode_operations hostfs_link_iops = { .get_link = hostfs_get_link, }; -static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) +static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct hostfs_fs_info *fsi = sb->s_fs_info; + const char *host_root = fc->source; struct inode *root_inode; - char *host_root_path, *req_root = d; int err; sb->s_blocksize = 1024; @@ -939,15 +942,15 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) return err; /* NULL is printed as '(null)' by printf(): avoid that. */ - if (req_root == NULL) - req_root = ""; + if (fc->source == NULL) + host_root = ""; - sb->s_fs_info = host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root); - if (host_root_path == NULL) + fsi->host_root_path = + kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); + if (fsi->host_root_path == NULL) return -ENOMEM; - root_inode = hostfs_iget(sb, host_root_path); + root_inode = hostfs_iget(sb, fsi->host_root_path); if (IS_ERR(root_inode)) return PTR_ERR(root_inode); @@ -955,7 +958,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) char *name; iput(root_inode); - name = follow_link(host_root_path); + name = follow_link(fsi->host_root_path); if (IS_ERR(name)) return PTR_ERR(name); @@ -972,11 +975,38 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) return 0; } -static struct dentry *hostfs_read_sb(struct file_system_type *type, - int flags, const char *dev_name, - void *data) +static int hostfs_fc_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, hostfs_fill_super); +} + +static void hostfs_fc_free(struct fs_context *fc) { - return mount_nodev(type, flags, data, hostfs_fill_sb_common); + struct hostfs_fs_info *fsi = fc->s_fs_info; + + if (!fsi) + return; + + kfree(fsi->host_root_path); + kfree(fsi); +} + +static const struct fs_context_operations hostfs_context_ops = { + .get_tree = hostfs_fc_get_tree, + .free = hostfs_fc_free, +}; + +static int hostfs_init_fs_context(struct fs_context *fc) +{ + struct hostfs_fs_info *fsi; + + fsi = kzalloc(sizeof(*fsi), GFP_KERNEL); + if (!fsi) + return -ENOMEM; + + fc->s_fs_info = fsi; + fc->ops = &hostfs_context_ops; + return 0; } static void hostfs_kill_sb(struct super_block *s) @@ -986,11 +1016,11 @@ static void hostfs_kill_sb(struct super_block *s) } static struct file_system_type hostfs_type = { - .owner = THIS_MODULE, - .name = "hostfs", - .mount = hostfs_read_sb, - .kill_sb = hostfs_kill_sb, - .fs_flags = 0, + .owner = THIS_MODULE, + .name = "hostfs", + .init_fs_context = hostfs_init_fs_context, + .kill_sb = hostfs_kill_sb, + .fs_flags = 0, }; 
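/*
 * Illustrative userspace sketch (editor's addition, not part of the patch):
 * mounting one of the filesystems converted above (vfat is used here as the
 * assumed example) through the new mount API that these init_fs_context /
 * fs_context_operations hooks enable.  Assumes a libc that exposes
 * fsopen()/fsconfig()/fsmount()/move_mount() (glibc >= 2.36); on older libcs
 * the raw syscall() equivalents would be needed.  The "uid" key is shown only
 * as an example option; error handling is trimmed for brevity.
 */
#include <sys/mount.h>
#include <fcntl.h>
#include <unistd.h>

static int mount_vfat_newapi(const char *dev, const char *target)
{
	int fsfd, mntfd, ret = -1;

	fsfd = fsopen("vfat", FSOPEN_CLOEXEC);
	if (fsfd < 0)
		return -1;

	/* Keys are routed through ->parse_param() and the parameter spec table. */
	fsconfig(fsfd, FSCONFIG_SET_STRING, "source", dev, 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "uid", "1000", 0);
	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

	mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
	if (mntfd >= 0) {
		/* Attach the detached mount object at @target. */
		ret = move_mount(mntfd, "", AT_FDCWD, target,
				 MOVE_MOUNT_F_EMPTY_PATH);
		close(mntfd);
	}
	close(fsfd);
	return ret;
}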
MODULE_ALIAS_FS("hostfs"); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 9184b4584b01..d0edf9ed33b6 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -472,9 +472,8 @@ out: static int hpfs_symlink_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - char *link = page_address(page); - struct inode *i = page->mapping->host; + char *link = folio_address(folio); + struct inode *i = folio->mapping->host; struct fnode *fnode; struct buffer_head *bh; int err; @@ -485,17 +484,9 @@ static int hpfs_symlink_read_folio(struct file *file, struct folio *folio) goto fail; err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE); brelse(bh); - if (err) - goto fail; - hpfs_unlock(i->i_sb); - SetPageUptodate(page); - unlock_page(page); - return 0; - fail: hpfs_unlock(i->i_sb); - SetPageError(page); - unlock_page(page); + folio_end_read(folio, err == 0); return err; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 314834a078e9..e73717daa5f9 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -793,4 +793,5 @@ static void __exit exit_hpfs_fs(void) module_init(init_hpfs_fs) module_exit(exit_hpfs_fs) +MODULE_DESCRIPTION("OS/2 HPFS file system support"); MODULE_LICENSE("GPL"); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 412f295acebe..81dab95f67ed 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -73,13 +73,13 @@ enum hugetlb_param { }; static const struct fs_parameter_spec hugetlb_fs_parameters[] = { - fsparam_u32 ("gid", Opt_gid), + fsparam_gid ("gid", Opt_gid), fsparam_string("min_size", Opt_min_size), fsparam_u32oct("mode", Opt_mode), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("pagesize", Opt_pagesize), fsparam_string("size", Opt_size), - fsparam_u32 ("uid", Opt_uid), + fsparam_uid ("uid", Opt_uid), {} }; @@ -1376,15 +1376,11 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par switch (opt) { case Opt_uid: - ctx->uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(ctx->uid)) - goto bad_val; + ctx->uid = result.uid; return 0; case Opt_gid: - ctx->gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(ctx->gid)) - goto bad_val; + ctx->gid = result.gid; return 0; case Opt_mode: diff --git a/fs/inode.c b/fs/inode.c index 3a41f83a4ba5..f356fe2ec2b6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -162,6 +162,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; + inode->i_state = 0; atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; @@ -231,6 +232,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) if (unlikely(security_inode_alloc(inode))) return -ENOMEM; + this_cpu_inc(nr_inodes); return 0; @@ -886,36 +888,45 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } -static void __wait_on_freeing_inode(struct inode *inode); +static void __wait_on_freeing_inode(struct inode *inode, bool locked); /* * Called with the inode lock held. 
*/ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), - void *data) + void *data, bool locked) { struct inode *inode = NULL; + if (locked) + lockdep_assert_held(&inode_hash_lock); + else + lockdep_assert_not_held(&inode_hash_lock); + + rcu_read_lock(); repeat: - hlist_for_each_entry(inode, head, i_hash) { + hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { - __wait_on_freeing_inode(inode); + __wait_on_freeing_inode(inode, locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); + rcu_read_unlock(); return inode; } + rcu_read_unlock(); return NULL; } @@ -924,29 +935,39 @@ repeat: * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) + struct hlist_head *head, unsigned long ino, + bool locked) { struct inode *inode = NULL; + if (locked) + lockdep_assert_held(&inode_hash_lock); + else + lockdep_assert_not_held(&inode_hash_lock); + + rcu_read_lock(); repeat: - hlist_for_each_entry(inode, head, i_hash) { + hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { - __wait_on_freeing_inode(inode); + __wait_on_freeing_inode(inode, locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); + rcu_read_unlock(); return inode; } + rcu_read_unlock(); return NULL; } @@ -1004,14 +1025,7 @@ EXPORT_SYMBOL(get_next_ino); */ struct inode *new_inode_pseudo(struct super_block *sb) { - struct inode *inode = alloc_inode(sb); - - if (inode) { - spin_lock(&inode->i_lock); - inode->i_state = 0; - spin_unlock(&inode->i_lock); - } - return inode; + return alloc_inode(sb); } /** @@ -1161,7 +1175,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, again: spin_lock(&inode_hash_lock); - old = find_inode(inode->i_sb, head, test, data); + old = find_inode(inode->i_sb, head, test, data, true); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. @@ -1235,7 +1249,6 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, struct inode *new = alloc_inode(sb); if (new) { - new->i_state = 0; inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); @@ -1246,6 +1259,47 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, EXPORT_SYMBOL(iget5_locked); /** + * iget5_locked_rcu - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * This is equivalent to iget5_locked, except the @test callback must + * tolerate the inode not being stable, including being mid-teardown. 
+ */ +struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode, *new; + +again: + inode = find_inode(sb, head, test, data, false); + if (inode) { + if (IS_ERR(inode)) + return NULL; + wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + return inode; + } + + new = alloc_inode(sb); + if (new) { + inode = inode_insert5(new, hashval, test, set, data); + if (unlikely(inode != new)) + destroy_inode(new); + } + return inode; +} +EXPORT_SYMBOL_GPL(iget5_locked_rcu); + +/** * iget_locked - obtain an inode from a mounted file system * @sb: super block of file system * @ino: inode number to get @@ -1263,9 +1317,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: - spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino); - spin_unlock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) return NULL; @@ -1283,7 +1335,7 @@ again: spin_lock(&inode_hash_lock); /* We released the lock, so.. */ - old = find_inode_fast(sb, head, ino); + old = find_inode_fast(sb, head, ino, true); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); @@ -1419,7 +1471,7 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, struct inode *inode; spin_lock(&inode_hash_lock); - inode = find_inode(sb, head, test, data); + inode = find_inode(sb, head, test, data, true); spin_unlock(&inode_hash_lock); return IS_ERR(inode) ? NULL : inode; @@ -1474,7 +1526,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) struct inode *inode; again: spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino); + inode = find_inode_fast(sb, head, ino, true); spin_unlock(&inode_hash_lock); if (inode) { @@ -2235,17 +2287,21 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. 
*/ -static void __wait_on_freeing_inode(struct inode *inode) +static void __wait_on_freeing_inode(struct inode *inode, bool locked) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); + rcu_read_unlock(); + if (locked) + spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq, &wait.wq_entry); - spin_lock(&inode_hash_lock); + if (locked) + spin_lock(&inode_hash_lock); + rcu_read_lock(); } static __initdata unsigned long ihash_entries; @@ -2538,6 +2594,7 @@ bool in_group_or_capable(struct mnt_idmap *idmap, return true; return false; } +EXPORT_SYMBOL(in_group_or_capable); /** * mode_strip_sgid - handle the sgid bit for non-directories diff --git a/fs/internal.h b/fs/internal.h index ab2225136f60..cdd73209eecb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -17,6 +17,7 @@ struct fs_context; struct pipe_inode_info; struct iov_iter; struct mnt_idmap; +struct ns_common; /* * block/bdev.c @@ -239,6 +240,7 @@ extern void mnt_pin_kill(struct mount *m); * fs/nsfs.c */ extern const struct dentry_operations ns_dentry_operations; +int open_namespace(struct ns_common *ns); /* * fs/stat.c: @@ -247,6 +249,8 @@ extern const struct dentry_operations ns_dentry_operations; int getname_statx_lookup_flags(int flags); int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); +int do_statx_fd(int fd, unsigned int flags, unsigned int mask, + struct statx __user *buffer); /* * fs/splice.c: @@ -321,3 +325,15 @@ struct stashed_operations { int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); +/** + * path_mounted - check whether path is mounted + * @path: path to check + * + * Determine whether @path refers to the root of a mount. + * + * Return: true if @path is the root of a mount, false if not. 
+ */ +static inline bool path_mounted(const struct path *path) +{ + return path->mnt->mnt_root == path->dentry; +} diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d46558990279..f420c53d86ac 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -307,8 +307,6 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off, spin_unlock_irqrestore(&ifs->state_lock, flags); } - if (error) - folio_set_error(folio); if (finished) folio_end_read(folio, uptodate); } @@ -444,6 +442,24 @@ done: return pos - orig_pos + plen; } +static loff_t iomap_read_folio_iter(const struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx) +{ + struct folio *folio = ctx->cur_folio; + size_t offset = offset_in_folio(folio, iter->pos); + loff_t length = min_t(loff_t, folio_size(folio) - offset, + iomap_length(iter)); + loff_t done, ret; + + for (done = 0; done < length; done += ret) { + ret = iomap_readpage_iter(iter, ctx, done); + if (ret <= 0) + return ret; + } + + return done; +} + int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) { struct iomap_iter iter = { @@ -459,10 +475,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_readpage_iter(&iter, &ctx, 0); - - if (ret < 0) - folio_set_error(folio); + iter.processed = iomap_read_folio_iter(&iter, &ctx); if (ctx.bio) { submit_bio(ctx.bio); @@ -698,7 +711,6 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (folio_test_uptodate(folio)) return 0; - folio_clear_error(folio); do { iomap_adjust_read_range(iter->inode, folio, &block_start, @@ -878,37 +890,22 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t old_size = iter->inode->i_size; - size_t written; if (srcmap->type == IOMAP_INLINE) { iomap_write_end_inline(iter, folio, pos, copied); - written = copied; - } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { - written = block_write_end(NULL, iter->inode->i_mapping, pos, - len, copied, &folio->page, NULL); - WARN_ON_ONCE(written != copied && written != 0); - } else { - written = __iomap_write_end(iter->inode, pos, len, copied, - folio) ? copied : 0; + return true; } - /* - * Update the in-memory inode size after copying the data into the page - * cache. It's up to the file system to write the updated size to disk, - * preferably after I/O completion so that no stale data is exposed. - * Only once that's done can we unlock and release the folio. 
- */ - if (pos + written > old_size) { - i_size_write(iter->inode, pos + written); - iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; - } - __iomap_put_folio(iter, pos, written, folio); + if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { + size_t bh_written; - if (old_size < pos) - pagecache_isize_extended(iter->inode, old_size, pos); + bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, + len, copied, &folio->page, NULL); + WARN_ON_ONCE(bh_written != copied && bh_written != 0); + return bh_written == copied; + } - return written == copied; + return __iomap_write_end(iter->inode, pos, len, copied, folio); } static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) @@ -923,6 +920,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) do { struct folio *folio; + loff_t old_size; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ @@ -974,6 +972,23 @@ retry: written = iomap_write_end(iter, pos, bytes, copied, folio) ? copied : 0; + /* + * Update the in-memory inode size after copying the data into + * the page cache. It's up to the file system to write the + * updated size to disk, preferably after I/O completion so that + * no stale data is exposed. Only once that's done can we + * unlock and release the folio. + */ + old_size = iter->inode->i_size; + if (pos + written > old_size) { + i_size_write(iter->inode, pos + written); + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; + } + __iomap_put_folio(iter, pos, written, folio); + + if (old_size < pos) + pagecache_isize_extended(iter->inode, old_size, pos); + cond_resched(); if (unlikely(written == 0)) { /* @@ -1344,6 +1359,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) bytes = folio_size(folio) - offset; ret = iomap_write_end(iter, pos, bytes, bytes, folio); + __iomap_put_folio(iter, pos, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; @@ -1409,6 +1425,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) folio_mark_accessed(folio); ret = iomap_write_end(iter, pos, bytes, bytes, folio); + __iomap_put_folio(iter, pos, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; @@ -1539,8 +1556,6 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) /* walk all folios in bio, ending page IO on them */ bio_for_each_folio_all(fi, bio) { - if (error) - folio_set_error(fi.folio); iomap_finish_folio_write(inode, fi.folio, fi.length); folio_count++; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 93b1077a380a..ed548efdd9bb 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -326,8 +326,8 @@ static const struct fs_parameter_spec isofs_param_spec[] = { fsparam_u32 ("session", Opt_session), fsparam_u32 ("sbsector", Opt_sb), fsparam_enum ("check", Opt_check, isofs_param_check), - fsparam_u32 ("uid", Opt_uid), - fsparam_u32 ("gid", Opt_gid), + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), /* Note: mode/dmode historically accepted %u not strictly %o */ fsparam_u32 ("mode", Opt_mode), fsparam_u32 ("dmode", Opt_dmode), @@ -344,8 +344,6 @@ static int isofs_parse_param(struct fs_context *fc, struct isofs_options *popt = fc->fs_private; struct fs_parse_result result; int opt; - kuid_t uid; - kgid_t gid; unsigned int n; /* There are no remountable options */ @@ -409,17 +407,11 @@ static int isofs_parse_param(struct fs_context *fc, case Opt_ignore: break; case Opt_uid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) - return -EINVAL; - popt->uid = uid; + 
popt->uid = result.uid; popt->uid_set = 1; break; case Opt_gid: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) - return -EINVAL; - popt->gid = gid; + popt->gid = result.gid; popt->gid_set = 1; break; case Opt_mode: diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index d6c17ad69dee..dbf911126e61 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -688,11 +688,10 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode, */ static int rock_ridge_symlink_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct iso_inode_info *ei = ISOFS_I(inode); struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); - char *link = page_address(page); + char *link = folio_address(folio); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); struct buffer_head *bh; char *rpnt = link; @@ -779,9 +778,10 @@ repeat: goto fail; brelse(bh); *rpnt = '\0'; - SetPageUptodate(page); - unlock_page(page); - return 0; + ret = 0; +end: + folio_end_read(folio, ret == 0); + return ret; /* error exit from macro */ out: @@ -795,9 +795,8 @@ out_bad_span: fail: brelse(bh); error: - SetPageError(page); - unlock_page(page); - return -EIO; + ret = -EIO; + goto end; } const struct address_space_operations isofs_symlink_aops = { diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 62ea76da7fdf..e12cb145147e 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -95,13 +95,8 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT, PAGE_SIZE); - if (ret) { - ClearPageUptodate(pg); - SetPageError(pg); - } else { + if (!ret) SetPageUptodate(pg); - ClearPageError(pg); - } flush_dcache_page(pg); kunmap(pg); @@ -304,10 +299,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, kunmap(pg); - if (ret) { - /* There was an error writing. */ - SetPageError(pg); - } + if (ret) + mapping_set_error(mapping, ret); /* Adjust writtenlen for the padding we did, so we don't confuse our caller */ writtenlen -= min(writtenlen, (start - aligned_start)); @@ -330,7 +323,6 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, it gets reread */ jffs2_dbg(1, "%s(): Not all bytes written. Marking page !uptodate\n", __func__); - SetPageError(pg); ClearPageUptodate(pg); } diff --git a/fs/libfs.c b/fs/libfs.c index b635ee5adbcc..8aa34870449f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1854,6 +1854,80 @@ static const struct dentry_operations generic_ci_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, #endif }; + +/** + * generic_ci_match() - Match a name (case-insensitively) with a dirent. + * This is a filesystem helper for comparison with directory entries. + * generic_ci_d_compare should be used in VFS' ->d_compare instead. + * + * @parent: Inode of the parent of the dirent under comparison + * @name: name under lookup. + * @folded_name: Optional pre-folded name under lookup + * @de_name: Dirent name. + * @de_name_len: dirent name length. + * + * Test whether a case-insensitive directory entry matches the filename + * being searched. If @folded_name is provided, it is used instead of + * recalculating the casefold of @name. + * + * Return: > 0 if the directory entry matches, 0 if it doesn't match, or + * < 0 on error. 
+ */ +int generic_ci_match(const struct inode *parent, + const struct qstr *name, + const struct qstr *folded_name, + const u8 *de_name, u32 de_name_len) +{ + const struct super_block *sb = parent->i_sb; + const struct unicode_map *um = sb->s_encoding; + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); + struct qstr dirent = QSTR_INIT(de_name, de_name_len); + int res = 0; + + if (IS_ENCRYPTED(parent)) { + const struct fscrypt_str encrypted_name = + FSTR_INIT((u8 *) de_name, de_name_len); + + if (WARN_ON_ONCE(!fscrypt_has_encryption_key(parent))) + return -EINVAL; + + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); + if (!decrypted_name.name) + return -ENOMEM; + res = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, + &decrypted_name); + if (res < 0) { + kfree(decrypted_name.name); + return res; + } + dirent.name = decrypted_name.name; + dirent.len = decrypted_name.len; + } + + /* + * Attempt a case-sensitive match first. It is cheaper and + * should cover most lookups, including all the sane + * applications that expect a case-sensitive filesystem. + */ + + if (dirent.len == name->len && + !memcmp(name->name, dirent.name, dirent.len)) + goto out; + + if (folded_name->name) + res = utf8_strncasecmp_folded(um, folded_name, &dirent); + else + res = utf8_strncasecmp(um, name, &dirent); + +out: + kfree(decrypted_name.name); + if (res < 0 && sb_has_strict_encoding(sb)) { + pr_err_ratelimited("Directory contains filename that is invalid UTF-8"); + return 0; + } + return !res; +} +EXPORT_SYMBOL(generic_ci_match); #endif #ifdef CONFIG_FS_ENCRYPTION diff --git a/fs/locks.c b/fs/locks.c index 90c8746874de..bdd94c32256f 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1367,9 +1367,9 @@ retry: locks_wake_up_blocks(&left->c); } out: + trace_posix_lock_inode(inode, request, error); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); - trace_posix_lock_inode(inode, request, error); /* * Free any unused locks. */ @@ -2448,8 +2448,9 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, error = do_lock_file_wait(filp, cmd, file_lock); /* - * Attempt to detect a close/fcntl race and recover by releasing the - * lock that was just acquired. There is no need to do that when we're + * Detect close/fcntl races and recover by zapping all POSIX locks + * associated with this file and our files_struct, just like on + * filp_flush(). There is no need to do that when we're * unlocking though, or for OFD locks. 
*/ if (!error && file_lock->c.flc_type != F_UNLCK && @@ -2464,9 +2465,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { - file_lock->c.flc_type = F_UNLCK; - error = do_lock_file_wait(filp, cmd, file_lock); - WARN_ON_ONCE(error); + locks_remove_posix(filp, files); error = -EBADF; } } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 7f9a2d8aa420..1c3df63162ef 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -730,5 +730,6 @@ static void __exit exit_minix_fs(void) module_init(init_minix_fs) module_exit(exit_minix_fs) +MODULE_DESCRIPTION("Minix file system"); MODULE_LICENSE("GPL"); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index d6031acc34f0..a944a0f17b53 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -213,8 +213,7 @@ static int minix_rename(struct mnt_idmap *idmap, if (!new_de) goto out_dir; err = minix_set_link(new_de, new_page, old_inode); - kunmap(new_page); - put_page(new_page); + unmap_and_put_page(new_page, new_de); if (err) goto out_dir; inode_set_ctime_current(new_inode); diff --git a/fs/mount.h b/fs/mount.h index 4a42fc68f4cc..ad4b1ddebb54 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -16,6 +16,8 @@ struct mnt_namespace { u64 event; unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; + struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ + refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; struct mnt_pcp { @@ -152,3 +154,4 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) } extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); +bool has_locked_children(struct mount *mnt, struct dentry *dentry); diff --git a/fs/mpage.c b/fs/mpage.c index fa8b99a199fa..b5b5ddf9d513 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -48,13 +48,8 @@ static void mpage_read_end_io(struct bio *bio) struct folio_iter fi; int err = blk_status_to_errno(bio->bi_status); - bio_for_each_folio_all(fi, bio) { - if (err) - folio_set_error(fi.folio); - else - folio_mark_uptodate(fi.folio); - folio_unlock(fi.folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, err == 0); bio_put(bio); } @@ -65,10 +60,8 @@ static void mpage_write_end_io(struct bio *bio) int err = blk_status_to_errno(bio->bi_status); bio_for_each_folio_all(fi, bio) { - if (err) { - folio_set_error(fi.folio); + if (err) mapping_set_error(fi.folio->mapping, err); - } folio_end_writeback(fi.folio); } diff --git a/fs/namei.c b/fs/namei.c index 37fb0a8aa09a..3a4c40e12f78 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -126,7 +126,7 @@ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) struct filename * -getname_flags(const char __user *filename, int flags, int *empty) +getname_flags(const char __user *filename, int flags) { struct filename *result; char *kname; @@ -148,9 +148,20 @@ getname_flags(const char __user *filename, int flags, int *empty) result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); - if (unlikely(len < 0)) { - __putname(result); - return ERR_PTR(len); + /* + * Handle both empty path and copy failure in one go. + */ + if (unlikely(len <= 0)) { + if (unlikely(len < 0)) { + __putname(result); + return ERR_PTR(len); + } + + /* The empty path is special. 
*/ + if (!(flags & LOOKUP_EMPTY)) { + __putname(result); + return ERR_PTR(-ENOENT); + } } /* @@ -180,6 +191,12 @@ getname_flags(const char __user *filename, int flags, int *empty) kfree(result); return ERR_PTR(len); } + /* The empty path is special. */ + if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { + __putname(kname); + kfree(result); + return ERR_PTR(-ENOENT); + } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); @@ -188,16 +205,6 @@ getname_flags(const char __user *filename, int flags, int *empty) } atomic_set(&result->refcnt, 1); - /* The empty path is special. */ - if (unlikely(!len)) { - if (empty) - *empty = 1; - if (!(flags & LOOKUP_EMPTY)) { - putname(result); - return ERR_PTR(-ENOENT); - } - } - result->uptr = filename; result->aname = NULL; audit_getname(result); @@ -209,13 +216,13 @@ getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; - return getname_flags(filename, flags, NULL); + return getname_flags(filename, flags); } struct filename * getname(const char __user * filename) { - return getname_flags(filename, 0, NULL); + return getname_flags(filename, 0); } struct filename * @@ -1233,29 +1240,48 @@ int may_linkat(struct mnt_idmap *idmap, const struct path *link) * * Returns 0 if the open is allowed, -ve on error. */ -static int may_create_in_sticky(struct mnt_idmap *idmap, - struct nameidata *nd, struct inode *const inode) +static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, + struct inode *const inode) { umode_t dir_mode = nd->dir_mode; - vfsuid_t dir_vfsuid = nd->dir_vfsuid; + vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; - if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || - (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || - likely(!(dir_mode & S_ISVTX)) || - vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) || - vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) + if (likely(!(dir_mode & S_ISVTX))) return 0; - if (likely(dir_mode & 0002) || - (dir_mode & 0020 && - ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || - (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { - const char *operation = S_ISFIFO(inode->i_mode) ? 
- "sticky_create_fifo" : - "sticky_create_regular"; - audit_log_path_denied(AUDIT_ANOM_CREAT, operation); + if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) + return 0; + + if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) + return 0; + + i_vfsuid = i_uid_into_vfsuid(idmap, inode); + + if (vfsuid_eq(i_vfsuid, dir_vfsuid)) + return 0; + + if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) + return 0; + + if (likely(dir_mode & 0002)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); return -EACCES; } + + if (dir_mode & 0020) { + if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, + "sticky_create_fifo"); + return -EACCES; + } + + if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { + audit_log_path_denied(AUDIT_ANOM_CREAT, + "sticky_create_regular"); + return -EACCES; + } + } + return 0; } @@ -1712,17 +1738,26 @@ static struct dentry *lookup_slow(const struct qstr *name, } static inline int may_lookup(struct mnt_idmap *idmap, - struct nameidata *nd) + struct nameidata *restrict nd) { - if (nd->flags & LOOKUP_RCU) { - int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); - if (!err) // success, keep going - return 0; - if (!try_to_unlazy(nd)) - return -ECHILD; // redo it all non-lazy - if (err != -ECHILD) // hard error - return err; - } + int err, mask; + + mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; + err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); + if (likely(!err)) + return 0; + + // If we failed, and we weren't in LOOKUP_RCU, it's final + if (!(nd->flags & LOOKUP_RCU)) + return err; + + // Drop out of RCU mode to make sure it wasn't transient + if (!try_to_unlazy(nd)) + return -ECHILD; // redo it all non-lazy + + if (err != -ECHILD) // hard error + return err; + return inode_permission(idmap, nd->inode, MAY_EXEC); } @@ -2163,21 +2198,39 @@ EXPORT_SYMBOL(hashlen_string); /* * Calculate the length and hash of the path component, and - * return the "hash_len" as the result. + * return the length as the result. */ -static inline u64 hash_name(const void *salt, const char *name) +static inline const char *hash_name(struct nameidata *nd, + const char *name, + unsigned long *lastword) { - unsigned long a = 0, b, x = 0, y = (unsigned long)salt; + unsigned long a, b, x, y = (unsigned long)nd->path.dentry; unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - len = 0; - goto inside; + /* + * The first iteration is special, because it can result in + * '.' and '..' and has no mixing other than the final fold. 
+ */ + a = load_unaligned_zeropad(name); + b = a ^ REPEAT_BYTE('/'); + if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) { + adata = prep_zero_mask(a, adata, &constants); + bdata = prep_zero_mask(b, bdata, &constants); + mask = create_zero_mask(adata | bdata); + a &= zero_bytemask(mask); + *lastword = a; + len = find_zero(mask); + nd->last.hash = fold_hash(a, y); + nd->last.len = len; + return name + len; + } + len = 0; + x = 0; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); -inside: a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); @@ -2185,11 +2238,25 @@ inside: adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); - x ^= a & zero_bytemask(mask); + a &= zero_bytemask(mask); + x ^= a; + len += find_zero(mask); + *lastword = 0; // Multi-word components cannot be DOT or DOTDOT - return hashlen_create(fold_hash(x, y), len + find_zero(mask)); + nd->last.hash = fold_hash(x, y); + nd->last.len = len; + return name + len; } +/* + * Note that the 'last' word is always zero-masked, but + * was loaded as a possibly big-endian word. + */ +#ifdef __BIG_ENDIAN + #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) + #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) +#endif + #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ @@ -2222,22 +2289,35 @@ EXPORT_SYMBOL(hashlen_string); * We know there's a real path component here of at least * one character. */ -static inline u64 hash_name(const void *salt, const char *name) +static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { - unsigned long hash = init_name_hash(salt); - unsigned long len = 0, c; + unsigned long hash = init_name_hash(nd->path.dentry); + unsigned long len = 0, c, last = 0; c = (unsigned char)*name; do { + last = (last << 8) + c; len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); - return hashlen_create(end_name_hash(hash), len); + + // This is reliable for DOT or DOTDOT, since the component + // cannot contain NUL characters - top bits being zero means + // we cannot have had any other pathnames. + *lastword = last; + nd->last.hash = end_name_hash(hash); + nd->last.len = len; + return name + len; } #endif +#ifndef LAST_WORD_IS_DOT + #define LAST_WORD_IS_DOT 0x2e + #define LAST_WORD_IS_DOTDOT 0x2e2e +#endif + /* * Name resolution. 
* This is the basic name resolution function, turning a pathname into @@ -2266,45 +2346,38 @@ static int link_path_walk(const char *name, struct nameidata *nd) for(;;) { struct mnt_idmap *idmap; const char *link; - u64 hash_len; - int type; + unsigned long lastword; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); if (err) return err; - hash_len = hash_name(nd->path.dentry, name); + nd->last.name = name; + name = hash_name(nd, name, &lastword); - type = LAST_NORM; - if (name[0] == '.') switch (hashlen_len(hash_len)) { - case 2: - if (name[1] == '.') { - type = LAST_DOTDOT; - nd->state |= ND_JUMPED; - } - break; - case 1: - type = LAST_DOT; - } - if (likely(type == LAST_NORM)) { - struct dentry *parent = nd->path.dentry; + switch(lastword) { + case LAST_WORD_IS_DOTDOT: + nd->last_type = LAST_DOTDOT; + nd->state |= ND_JUMPED; + break; + + case LAST_WORD_IS_DOT: + nd->last_type = LAST_DOT; + break; + + default: + nd->last_type = LAST_NORM; nd->state &= ~ND_JUMPED; + + struct dentry *parent = nd->path.dentry; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - struct qstr this = { { .hash_len = hash_len }, .name = name }; - err = parent->d_op->d_hash(parent, &this); + err = parent->d_op->d_hash(parent, &nd->last); if (err < 0) return err; - hash_len = this.hash_len; - name = this.name; } } - nd->last.hash_len = hash_len; - nd->last.name = name; - nd->last_type = type; - - name += hashlen_len(hash_len); if (!*name) goto OK; /* @@ -2922,16 +2995,16 @@ int path_pts(struct path *path) } #endif -int user_path_at_empty(int dfd, const char __user *name, unsigned flags, - struct path *path, int *empty) +int user_path_at(int dfd, const char __user *name, unsigned flags, + struct path *path) { - struct filename *filename = getname_flags(name, flags, empty); + struct filename *filename = getname_flags(name, flags); int ret = filename_lookup(dfd, filename, flags, path, NULL); putname(filename); return ret; } -EXPORT_SYMBOL(user_path_at_empty); +EXPORT_SYMBOL(user_path_at); int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) @@ -3572,8 +3645,12 @@ static const char *open_last_lookups(struct nameidata *nd, else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); - if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED)) - fsnotify_create(dir->d_inode, dentry); + if (!IS_ERR(dentry)) { + if (file->f_mode & FMODE_CREATED) + fsnotify_create(dir->d_inode, dentry); + if (file->f_mode & FMODE_OPENED) + fsnotify_open(file); + } if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else @@ -3700,6 +3777,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap, mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); + if (file->f_mode & FMODE_OPENED) + fsnotify_open(file); if (error) return error; /* Don't check for other permissions, the inode was just created */ diff --git a/fs/namespace.c b/fs/namespace.c index 5a51315c6678..221db9de4729 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -70,7 +70,8 @@ static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ -static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32); +#define MNT_UNIQUE_ID_OFFSET (1ULL << 32) +static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET); static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; @@ -78,6 +79,8 @@ static struct kmem_cache *mnt_cache __ro_after_init; static 
DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +static DEFINE_RWLOCK(mnt_ns_tree_lock); +static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ struct mount_kattr { unsigned int attr_set; @@ -103,6 +106,109 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); +static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns) +{ + u64 seq_b = ns->seq; + + if (seq < seq_b) + return -1; + if (seq > seq_b) + return 1; + return 0; +} + +static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); +} + +static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) +{ + struct mnt_namespace *ns_a = node_to_mnt_ns(a); + struct mnt_namespace *ns_b = node_to_mnt_ns(b); + u64 seq_a = ns_a->seq; + + return mnt_ns_cmp(seq_a, ns_b) < 0; +} + +static void mnt_ns_tree_add(struct mnt_namespace *ns) +{ + guard(write_lock)(&mnt_ns_tree_lock); + rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less); +} + +static void mnt_ns_release(struct mnt_namespace *ns) +{ + lockdep_assert_not_held(&mnt_ns_tree_lock); + + /* keep alive for {list,stat}mount() */ + if (refcount_dec_and_test(&ns->passive)) { + put_user_ns(ns->user_ns); + kfree(ns); + } +} +DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) + +static void mnt_ns_tree_remove(struct mnt_namespace *ns) +{ + /* remove from global mount namespace list */ + if (!is_anon_ns(ns)) { + guard(write_lock)(&mnt_ns_tree_lock); + rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); + } + + mnt_ns_release(ns); +} + +/* + * Returns the mount namespace which either has the specified id, or has the + * next smallest id afer the specified one. + */ +static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) +{ + struct rb_node *node = mnt_ns_tree.rb_node; + struct mnt_namespace *ret = NULL; + + lockdep_assert_held(&mnt_ns_tree_lock); + + while (node) { + struct mnt_namespace *n = node_to_mnt_ns(node); + + if (mnt_ns_id <= n->seq) { + ret = node_to_mnt_ns(node); + if (mnt_ns_id == n->seq) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + return ret; +} + +/* + * Lookup a mount namespace by id and take a passive reference count. Taking a + * passive reference means the mount namespace can be emptied if e.g., the last + * task holding an active reference exits. To access the mounts of the + * namespace the @namespace_sem must first be acquired. If the namespace has + * already shut down before acquiring @namespace_sem, {list,stat}mount() will + * see that the mount rbtree of the namespace is empty. + */ +static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) +{ + struct mnt_namespace *ns; + + guard(read_lock)(&mnt_ns_tree_lock); + ns = mnt_ns_find_id_at(mnt_ns_id); + if (!ns || ns->seq != mnt_ns_id) + return NULL; + + refcount_inc(&ns->passive); + return ns; +} + static inline void lock_mount_hash(void) { write_seqlock(&mount_lock); @@ -1448,6 +1554,30 @@ static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) return ret; } +/* + * Returns the mount which either has the specified mnt_id, or has the next + * greater id before the specified one. 
+ */ +static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id) +{ + struct rb_node *node = ns->mounts.rb_node; + struct mount *ret = NULL; + + while (node) { + struct mount *m = node_to_mount(node); + + if (mnt_id >= m->mnt_id_unique) { + ret = node_to_mount(node); + if (mnt_id == m->mnt_id_unique) + break; + node = node->rb_right; + } else { + node = node->rb_left; + } + } + return ret; +} + #ifdef CONFIG_PROC_FS /* iterator; we want it to have access to namespace_sem, thus here... */ @@ -1846,19 +1976,6 @@ bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } -/** - * path_mounted - check whether path is mounted - * @path: path to check - * - * Determine whether @path refers to the root of a mount. - * - * Return: true if @path is the root of a mount, false if not. - */ -static inline bool path_mounted(const struct path *path) -{ - return path->mnt->mnt_root == path->dentry; -} - static void warn_mandlock(void) { pr_warn_once("=======================================================\n" @@ -1966,69 +2083,72 @@ static bool mnt_ns_loop(struct dentry *dentry) return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } -struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, +struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, int flag) { - struct mount *res, *p, *q, *r, *parent; + struct mount *res, *src_parent, *src_root_child, *src_mnt, + *dst_parent, *dst_mnt; - if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) + if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root)) return ERR_PTR(-EINVAL); if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); - res = q = clone_mnt(mnt, dentry, flag); - if (IS_ERR(q)) - return q; + res = dst_mnt = clone_mnt(src_root, dentry, flag); + if (IS_ERR(dst_mnt)) + return dst_mnt; - q->mnt_mountpoint = mnt->mnt_mountpoint; + src_parent = src_root; + dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint; - p = mnt; - list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { - struct mount *s; - if (!is_subdir(r->mnt_mountpoint, dentry)) + list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { + if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) continue; - for (s = r; s; s = next_mnt(s, r)) { + for (src_mnt = src_root_child; src_mnt; + src_mnt = next_mnt(src_mnt, src_root_child)) { if (!(flag & CL_COPY_UNBINDABLE) && - IS_MNT_UNBINDABLE(s)) { - if (s->mnt.mnt_flags & MNT_LOCKED) { + IS_MNT_UNBINDABLE(src_mnt)) { + if (src_mnt->mnt.mnt_flags & MNT_LOCKED) { /* Both unbindable and locked. 
*/ - q = ERR_PTR(-EPERM); + dst_mnt = ERR_PTR(-EPERM); goto out; } else { - s = skip_mnt_tree(s); + src_mnt = skip_mnt_tree(src_mnt); continue; } } if (!(flag & CL_COPY_MNT_NS_FILE) && - is_mnt_ns_file(s->mnt.mnt_root)) { - s = skip_mnt_tree(s); + is_mnt_ns_file(src_mnt->mnt.mnt_root)) { + src_mnt = skip_mnt_tree(src_mnt); continue; } - while (p != s->mnt_parent) { - p = p->mnt_parent; - q = q->mnt_parent; + while (src_parent != src_mnt->mnt_parent) { + src_parent = src_parent->mnt_parent; + dst_mnt = dst_mnt->mnt_parent; } - p = s; - parent = q; - q = clone_mnt(p, p->mnt.mnt_root, flag); - if (IS_ERR(q)) + + src_parent = src_mnt; + dst_parent = dst_mnt; + dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag); + if (IS_ERR(dst_mnt)) goto out; lock_mount_hash(); - list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, parent, p->mnt_mp, false); + list_add_tail(&dst_mnt->mnt_list, &res->mnt_list); + attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false); unlock_mount_hash(); } } return res; + out: if (res) { lock_mount_hash(); umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } - return q; + return dst_mnt; } /* Caller should check returned pointer for errors */ @@ -2078,7 +2198,7 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } -static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +bool has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; @@ -3709,8 +3829,7 @@ static void free_mnt_ns(struct mnt_namespace *ns) if (!is_anon_ns(ns)) ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); - kfree(ns); + mnt_ns_tree_remove(ns); } /* @@ -3749,7 +3868,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!anon) new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); + refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; + RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; @@ -3826,6 +3947,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } + mnt_ns_tree_add(new_ns); namespace_unlock(); if (rootmnt) @@ -4843,6 +4965,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) return 0; } +static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) +{ + s->sm.mask |= STATMOUNT_MNT_NS_ID; + s->sm.mnt_ns_id = ns->seq; +} + +static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + int err; + + if (sb->s_op->show_options) { + size_t start = seq->count; + + err = sb->s_op->show_options(seq, mnt->mnt_root); + if (err) + return err; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (seq->count == start) + return 0; + + /* skip leading comma */ + memmove(seq->buf + start, seq->buf + start + 1, + seq->count - start - 1); + seq->count--; + } + + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { int ret; @@ -4863,6 +5019,10 @@ static int statmount_string(struct kstatmount *s, u64 flag) sm->mnt_point = seq->count; ret = statmount_mnt_point(s, seq); break; + case STATMOUNT_MNT_OPTS: + sm->mnt_opts = seq->count; + ret = statmount_mnt_opts(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; @@ -4903,23 +5063,84 @@ static int 
copy_statmount_to_user(struct kstatmount *s) return 0; } -static int do_statmount(struct kstatmount *s) +static struct mount *listmnt_next(struct mount *curr, bool reverse) { - struct mount *m = real_mount(s->mnt); + struct rb_node *node; + + if (reverse) + node = rb_prev(&curr->mnt_node); + else + node = rb_next(&curr->mnt_node); + + return node_to_mount(node); +} + +static int grab_requested_root(struct mnt_namespace *ns, struct path *root) +{ + struct mount *first, *child; + + rwsem_assert_held(&namespace_sem); + + /* We're looking at our own ns, just use get_fs_root. */ + if (ns == current->nsproxy->mnt_ns) { + get_fs_root(current->fs, root); + return 0; + } + + /* + * We have to find the first mount in our ns and use that, however it + * may not exist, so handle that properly. + */ + if (RB_EMPTY_ROOT(&ns->mounts)) + return -ENOENT; + + first = child = ns->root; + for (;;) { + child = listmnt_next(child, false); + if (!child) + return -ENOENT; + if (child->mnt_parent == first) + break; + } + + root->mnt = mntget(&child->mnt); + root->dentry = dget(root->mnt->mnt_root); + return 0; +} + +static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, + struct mnt_namespace *ns) +{ + struct path root __free(path_put) = {}; + struct mount *m; int err; + /* Has the namespace already been emptied? */ + if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) + return -ENOENT; + + s->mnt = lookup_mnt_in_ns(mnt_id, ns); + if (!s->mnt) + return -ENOENT; + + err = grab_requested_root(ns, &root); + if (err) + return err; + /* * Don't trigger audit denials. We just want to determine what * mounts to show users. */ - if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && - !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + m = real_mount(s->mnt); + if (!is_path_reachable(m, m->mnt.mnt_root, &root) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; err = security_sb_statfs(s->mnt->mnt_root); if (err) return err; + s->root = root; if (s->mask & STATMOUNT_SB_BASIC) statmount_sb_basic(s); @@ -4938,6 +5159,12 @@ static int do_statmount(struct kstatmount *s) if (!err && s->mask & STATMOUNT_MNT_POINT) err = statmount_string(s, STATMOUNT_MNT_POINT); + if (!err && s->mask & STATMOUNT_MNT_OPTS) + err = statmount_string(s, STATMOUNT_MNT_OPTS); + + if (!err && s->mask & STATMOUNT_MNT_NS_ID) + statmount_mnt_ns_id(s, ns); + if (err) return err; @@ -4955,6 +5182,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) return true; } +#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ + STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS) + static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, size_t seq_size) @@ -4966,10 +5196,18 @@ static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, ks->mask = kreq->param; ks->buf = buf; ks->bufsize = bufsize; - ks->seq.size = seq_size; - ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); - if (!ks->seq.buf) - return -ENOMEM; + + if (ks->mask & STATMOUNT_STRING_REQ) { + if (bufsize == sizeof(ks->sm)) + return -EOVERFLOW; + + ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); + if (!ks->seq.buf) + return -ENOMEM; + + ks->seq.size = seq_size; + } + return 0; } @@ -4979,7 +5217,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, int ret; size_t usize; - BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1); ret = get_user(usize, &req->size); 
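
The statmount()/listmount() hunks around this point extend struct mnt_id_req to MNT_ID_REQ_SIZE_VER1 with an mnt_ns_id field, add STATMOUNT_MNT_OPTS and STATMOUNT_MNT_NS_ID, and teach listmount() a LISTMOUNT_REVERSE flag. Separate from the patch itself, a minimal userspace sketch of the extended interface might look like the following; it assumes uapi headers new enough to carry MNT_ID_REQ_SIZE_VER1, LSMT_ROOT, the STATMOUNT_*/LISTMOUNT_* flags and the __NR_statmount/__NR_listmount numbers, and the 4096-byte buffer and 64-entry id array are arbitrary sizes chosen for the example.

#include <linux/mount.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/*
	 * 0 means "the caller's own mount namespace"; a real value would
	 * come from a previous statmount() reply (sm->mnt_ns_id below).
	 */
	uint64_t mnt_ns_id = 0;
	uint64_t ids[64];

	/* List mounts below the namespace root, highest unique id first. */
	struct mnt_id_req lreq = {
		.size      = MNT_ID_REQ_SIZE_VER1,
		.mnt_id    = LSMT_ROOT,	/* start from the namespace root */
		.param     = 0,		/* last_mnt_id: 0 = no cursor yet */
		.mnt_ns_id = mnt_ns_id,
	};
	long n = syscall(__NR_listmount, &lreq, ids, 64, LISTMOUNT_REVERSE);
	if (n <= 0)
		return n < 0 ? 1 : 0;

	/* Fetch the options and namespace id of the first mount returned. */
	char buf[4096] __attribute__((aligned(8)));
	struct statmount *sm = (struct statmount *)buf;
	struct mnt_id_req sreq = {
		.size      = MNT_ID_REQ_SIZE_VER1,
		.mnt_id    = ids[0],
		.param     = STATMOUNT_MNT_OPTS | STATMOUNT_MNT_NS_ID,
		.mnt_ns_id = mnt_ns_id,
	};
	if (syscall(__NR_statmount, &sreq, sm, sizeof(buf), 0) < 0)
		return 1;	/* retry with a larger buffer on EOVERFLOW */

	if (sm->mask & STATMOUNT_MNT_OPTS)
		printf("opts:  %s\n", sm->str + sm->mnt_opts);
	if (sm->mask & STATMOUNT_MNT_NS_ID)
		printf("ns id: %llu\n", (unsigned long long)sm->mnt_ns_id);
	return 0;
}

A caller interested in a foreign namespace would place a previously obtained namespace id into .mnt_ns_id; leaving it 0 keeps the caller's own namespace, and -EOVERFLOW from statmount() signals that the result buffer needs to grow.
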
if (ret) @@ -4994,16 +5232,32 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, return ret; if (kreq->spare != 0) return -EINVAL; + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ + if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) + return -EINVAL; return 0; } +/* + * If the user requested a specific mount namespace id, look that up and return + * that, or if not simply grab a passive reference on our mount namespace and + * return that. + */ +static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id) +{ + if (mnt_ns_id) + return lookup_mnt_ns(mnt_ns_id); + refcount_inc(¤t->nsproxy->mnt_ns->passive); + return current->nsproxy->mnt_ns; +} + SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) { - struct vfsmount *mnt; + struct mnt_namespace *ns __free(mnt_ns_release) = NULL; + struct kstatmount *ks __free(kfree) = NULL; struct mnt_id_req kreq; - struct kstatmount ks; /* We currently support retrieval of 3 strings. */ size_t seq_size = 3 * PATH_MAX; int ret; @@ -5015,64 +5269,88 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, if (ret) return ret; + ns = grab_requested_mnt_ns(kreq.mnt_ns_id); + if (!ns) + return -ENOENT; + + if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + return -ENOENT; + + ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT); + if (!ks) + return -ENOMEM; + retry: - ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size); + ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size); if (ret) return ret; - down_read(&namespace_sem); - mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns); - if (!mnt) { - up_read(&namespace_sem); - kvfree(ks.seq.buf); - return -ENOENT; - } - - ks.mnt = mnt; - get_fs_root(current->fs, &ks.root); - ret = do_statmount(&ks); - path_put(&ks.root); - up_read(&namespace_sem); + scoped_guard(rwsem_read, &namespace_sem) + ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns); if (!ret) - ret = copy_statmount_to_user(&ks); - kvfree(ks.seq.buf); + ret = copy_statmount_to_user(ks); + kvfree(ks->seq.buf); if (retry_statmount(ret, &seq_size)) goto retry; return ret; } -static struct mount *listmnt_next(struct mount *curr) +static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, + u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, + bool reverse) { - return node_to_mount(rb_next(&curr->mnt_node)); -} - -static ssize_t do_listmount(struct mount *first, struct path *orig, - u64 mnt_parent_id, u64 __user *mnt_ids, - size_t nr_mnt_ids, const struct path *root) -{ - struct mount *r; + struct path root __free(path_put) = {}; + struct path orig; + struct mount *r, *first; ssize_t ret; + rwsem_assert_held(&namespace_sem); + + ret = grab_requested_root(ns, &root); + if (ret) + return ret; + + if (mnt_parent_id == LSMT_ROOT) { + orig = root; + } else { + orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); + if (!orig.mnt) + return -ENOENT; + orig.dentry = orig.mnt->mnt_root; + } + /* * Don't trigger audit denials. We just want to determine what * mounts to show users. 
*/ - if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) && - !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; - ret = security_sb_statfs(orig->dentry); + ret = security_sb_statfs(orig.dentry); if (ret) return ret; - for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) { + if (!last_mnt_id) { + if (reverse) + first = node_to_mount(rb_last(&ns->mounts)); + else + first = node_to_mount(rb_first(&ns->mounts)); + } else { + if (reverse) + first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); + else + first = mnt_find_id_at(ns, last_mnt_id + 1); + } + + for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) { if (r->mnt_id_unique == mnt_parent_id) continue; - if (!is_path_reachable(r, r->mnt.mnt_root, orig)) + if (!is_path_reachable(r, r->mnt.mnt_root, &orig)) continue; - if (put_user(r->mnt_id_unique, mnt_ids)) - return -EFAULT; + *mnt_ids = r->mnt_id_unique; mnt_ids++; nr_mnt_ids--; ret++; @@ -5080,22 +5358,26 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, return ret; } -SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, - mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) +SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, + u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { - struct mnt_namespace *ns = current->nsproxy->mnt_ns; + u64 *kmnt_ids __free(kvfree) = NULL; + const size_t maxcount = 1000000; + struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct mnt_id_req kreq; - struct mount *first; - struct path root, orig; - u64 mnt_parent_id, last_mnt_id; - const size_t maxcount = (size_t)-1 >> 3; + u64 last_mnt_id; ssize_t ret; - if (flags) + if (flags & ~LISTMOUNT_REVERSE) return -EINVAL; + /* + * If the mount namespace really has more than 1 million mounts the + * caller must iterate over the mount namespace (and reconsider their + * system design...). + */ if (unlikely(nr_mnt_ids > maxcount)) - return -EFAULT; + return -EOVERFLOW; if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) return -EFAULT; @@ -5103,33 +5385,37 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, ret = copy_mnt_id_req(req, &kreq); if (ret) return ret; - mnt_parent_id = kreq.mnt_id; + last_mnt_id = kreq.param; + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. 
*/ + if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) + return -EINVAL; - down_read(&namespace_sem); - get_fs_root(current->fs, &root); - if (mnt_parent_id == LSMT_ROOT) { - orig = root; - } else { - ret = -ENOENT; - orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); - if (!orig.mnt) - goto err; - orig.dentry = orig.mnt->mnt_root; - } - if (!last_mnt_id) - first = node_to_mount(rb_first(&ns->mounts)); - else - first = mnt_find_id_at(ns, last_mnt_id + 1); + kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids), + GFP_KERNEL_ACCOUNT); + if (!kmnt_ids) + return -ENOMEM; + + ns = grab_requested_mnt_ns(kreq.mnt_ns_id); + if (!ns) + return -ENOENT; + + if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + return -ENOENT; + + scoped_guard(rwsem_read, &namespace_sem) + ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, + nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); + if (ret <= 0) + return ret; + + if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids))) + return -EFAULT; - ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root); -err: - path_put(&root); - up_read(&namespace_sem); return ret; } - static void __init init_mount_tree(void) { struct vfsmount *mnt; @@ -5157,6 +5443,8 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); + + mnt_ns_tree_add(ns); } void __init mnt_init(void) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a6bb03bea920..4c0401dbbfcf 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -117,7 +117,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) - _debug("no unlock"); + kdebug("no unlock"); else folio_unlock(folio); } @@ -204,7 +204,7 @@ void netfs_readahead(struct readahead_control *ractl) struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); int ret; - _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); + kenter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); if (readahead_count(ractl) == 0) return; @@ -268,7 +268,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) struct folio *sink = NULL; int ret; - _enter("%lx", folio->index); + kenter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, folio_file_pos(folio), folio_size(folio), @@ -508,7 +508,7 @@ retry: have_folio: *_folio = folio; - _leave(" = 0"); + kleave(" = 0"); return 0; error_put: @@ -518,7 +518,7 @@ error: folio_unlock(folio); folio_put(folio); } - _leave(" = %d", ret); + kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(netfs_write_begin); @@ -536,7 +536,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t flen = folio_size(folio); int ret; - _enter("%zx @%llx", flen, start); + kenter("%zx @%llx", flen, start); ret = -ENOMEM; @@ -567,7 +567,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); error: - _leave(" = %d", ret); + kleave(" = %d", ret); return ret; } diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 07bc1fd43530..ecbc99ec7d36 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -56,7 +56,7 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, struct netfs_group *group = 
netfs_folio_group(folio); loff_t pos = folio_file_pos(folio); - _enter(""); + kenter(""); if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) return NETFS_FLUSH_CONTENT; @@ -272,12 +272,12 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, */ howto = netfs_how_to_modify(ctx, file, folio, netfs_group, flen, offset, part, maybe_trouble); - _debug("howto %u", howto); + kdebug("howto %u", howto); switch (howto) { case NETFS_JUST_PREFETCH: ret = netfs_prefetch_for_write(file, folio, offset, part); if (ret < 0) { - _debug("prefetch = %zd", ret); + kdebug("prefetch = %zd", ret); goto error_folio_unlock; } break; @@ -418,7 +418,7 @@ out: } iocb->ki_pos += written; - _leave(" = %zd [%zd]", written, ret); + kleave(" = %zd [%zd]", written, ret); return written ? written : ret; error_folio_unlock: @@ -491,7 +491,7 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct netfs_inode *ictx = netfs_inode(inode); ssize_t ret; - _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + kenter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); if (!iov_iter_count(from)) return 0; @@ -523,17 +523,23 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct netfs_group *group; struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file); struct netfs_inode *ictx = netfs_inode(inode); vm_fault_t ret = VM_FAULT_RETRY; int err; - _enter("%lx", folio->index); + kenter("%lx", folio->index); sb_start_pagefault(inode->i_sb); if (folio_lock_killable(folio) < 0) goto out; + if (folio->mapping != mapping) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + goto out; + } if (folio_wait_writeback_killable(folio)) { ret = VM_FAULT_LOCKED; @@ -549,9 +555,9 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr group = netfs_folio_group(folio); if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) { folio_unlock(folio); - err = filemap_fdatawait_range(inode->i_mapping, - folio_pos(folio), - folio_pos(folio) + folio_size(folio)); + err = filemap_fdatawrite_range(mapping, + folio_pos(folio), + folio_pos(folio) + folio_size(folio)); switch (err) { case 0: ret = VM_FAULT_RETRY; diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 10a1e4da6bda..b6debac6205f 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -33,7 +33,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i size_t orig_count = iov_iter_count(iter); bool async = !is_sync_kiocb(iocb); - _enter(""); + kenter(""); if (!orig_count) return 0; /* Don't update atime */ diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index e14cd53ac9fd..792ef17bae21 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -37,7 +37,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * size_t len = iov_iter_count(iter); bool async = !is_sync_kiocb(iocb); - _enter(""); + kenter(""); /* We're going to need a bounce buffer if what we transmit is going to * be different in some way to the source buffer, e.g. 
because it gets @@ -45,7 +45,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * */ // TODO - _debug("uw %llx-%llx", start, end); + kdebug("uw %llx-%llx", start, end); wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start, iocb->ki_flags & IOCB_DIRECT ? @@ -92,10 +92,11 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); if (async) wreq->iocb = iocb; + wreq->len = iov_iter_count(&wreq->io_iter); wreq->cleanup = netfs_cleanup_dio_write; - ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter)); + ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); if (ret < 0) { - _debug("begin = %zd", ret); + kdebug("begin = %zd", ret); goto out; } @@ -142,7 +143,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t pos = iocb->ki_pos; unsigned long long end = pos + iov_iter_count(from) - 1; - _enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode)); + kenter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode)); if (!iov_iter_count(from)) return 0; diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c index 9397ed39b0b4..288a73c3072d 100644 --- a/fs/netfs/fscache_cache.c +++ b/fs/netfs/fscache_cache.c @@ -237,7 +237,7 @@ int fscache_add_cache(struct fscache_cache *cache, { int n_accesses; - _enter("{%s,%s}", ops->name, cache->name); + kenter("{%s,%s}", ops->name, cache->name); BUG_ON(fscache_cache_state(cache) != FSCACHE_CACHE_IS_PREPARING); @@ -257,7 +257,7 @@ int fscache_add_cache(struct fscache_cache *cache, up_write(&fscache_addremove_sem); pr_notice("Cache \"%s\" added (type %s)\n", cache->name, ops->name); - _leave(" = 0 [%s]", cache->name); + kleave(" = 0 [%s]", cache->name); return 0; } EXPORT_SYMBOL(fscache_add_cache); diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index bce2492186d0..4d1e8bf4c615 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -456,7 +456,7 @@ struct fscache_cookie *__fscache_acquire_cookie( { struct fscache_cookie *cookie; - _enter("V=%x", volume->debug_id); + kenter("V=%x", volume->debug_id); if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255) return NULL; @@ -484,7 +484,7 @@ struct fscache_cookie *__fscache_acquire_cookie( trace_fscache_acquire(cookie); fscache_stat(&fscache_n_acquires_ok); - _leave(" = c=%08x", cookie->debug_id); + kleave(" = c=%08x", cookie->debug_id); return cookie; } EXPORT_SYMBOL(__fscache_acquire_cookie); @@ -505,7 +505,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) enum fscache_access_trace trace = fscache_access_lookup_cookie_end_failed; bool need_withdraw = false; - _enter(""); + kenter(""); if (!cookie->volume->cache_priv) { fscache_create_volume(cookie->volume, true); @@ -519,7 +519,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) if (cookie->state != FSCACHE_COOKIE_STATE_FAILED) fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); need_withdraw = true; - _leave(" [fail]"); + kleave(" [fail]"); goto out; } @@ -572,7 +572,7 @@ void __fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify) bool queue = false; int n_active; - _enter("c=%08x", cookie->debug_id); + kenter("c=%08x", cookie->debug_id); if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), "Trying to use relinquished cookie\n")) @@ -636,7 +636,7 @@ again: 
spin_unlock(&cookie->lock); if (queue) fscache_queue_cookie(cookie, fscache_cookie_get_use_work); - _leave(""); + kleave(""); } EXPORT_SYMBOL(__fscache_use_cookie); @@ -702,7 +702,7 @@ static void fscache_cookie_state_machine(struct fscache_cookie *cookie) enum fscache_cookie_state state; bool wake = false; - _enter("c=%x", cookie->debug_id); + kenter("c=%x", cookie->debug_id); again: spin_lock(&cookie->lock); @@ -820,7 +820,7 @@ out: spin_unlock(&cookie->lock); if (wake) wake_up_cookie_state(cookie); - _leave(""); + kleave(""); } static void fscache_cookie_worker(struct work_struct *work) @@ -867,7 +867,7 @@ static void fscache_cookie_lru_do_one(struct fscache_cookie *cookie) set_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags); spin_unlock(&cookie->lock); fscache_stat(&fscache_n_cookies_lru_expired); - _debug("lru c=%x", cookie->debug_id); + kdebug("lru c=%x", cookie->debug_id); __fscache_withdraw_cookie(cookie); } @@ -971,7 +971,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) if (retire) fscache_stat(&fscache_n_relinquishes_retire); - _enter("c=%08x{%d},%d", + kenter("c=%08x{%d},%d", cookie->debug_id, atomic_read(&cookie->n_active), retire); if (WARN(test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), @@ -1050,7 +1050,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, { bool is_caching; - _enter("c=%x", cookie->debug_id); + kenter("c=%x", cookie->debug_id); fscache_stat(&fscache_n_invalidates); @@ -1072,7 +1072,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, case FSCACHE_COOKIE_STATE_INVALIDATING: /* is_still_valid will catch it */ default: spin_unlock(&cookie->lock); - _leave(" [no %u]", cookie->state); + kleave(" [no %u]", cookie->state); return; case FSCACHE_COOKIE_STATE_LOOKING_UP: @@ -1081,7 +1081,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, fallthrough; case FSCACHE_COOKIE_STATE_CREATING: spin_unlock(&cookie->lock); - _leave(" [look %x]", cookie->inval_counter); + kleave(" [look %x]", cookie->inval_counter); return; case FSCACHE_COOKIE_STATE_ACTIVE: @@ -1094,7 +1094,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, if (is_caching) fscache_queue_cookie(cookie, fscache_cookie_get_inval_work); - _leave(" [inv]"); + kleave(" [inv]"); return; } } diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index 38637e5c9b57..bf4eaeec44fb 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -28,12 +28,12 @@ bool fscache_wait_for_operation(struct netfs_cache_resources *cres, again: if (!fscache_cache_is_live(cookie->volume->cache)) { - _leave(" [broken]"); + kleave(" [broken]"); return false; } state = fscache_cookie_state(cookie); - _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); + kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state); switch (state) { case FSCACHE_COOKIE_STATE_CREATING: @@ -52,7 +52,7 @@ again: case FSCACHE_COOKIE_STATE_DROPPED: case FSCACHE_COOKIE_STATE_RELINQUISHING: default: - _leave(" [not live]"); + kleave(" [not live]"); return false; } @@ -92,7 +92,7 @@ again: spin_lock(&cookie->lock); state = fscache_cookie_state(cookie); - _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); + kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state); switch (state) { case FSCACHE_COOKIE_STATE_LOOKING_UP: @@ -140,7 +140,7 @@ failed: cres->cache_priv = NULL; cres->ops = NULL; fscache_end_cookie_access(cookie, fscache_access_io_not_live); - _leave(" = -ENOBUFS"); + kleave(" = -ENOBUFS"); return -ENOBUFS; } @@ -224,7 +224,7 @@ void 
__fscache_write_to_cache(struct fscache_cookie *cookie, if (len == 0) goto abandon; - _enter("%llx,%zx", start, len); + kenter("%llx,%zx", start, len); wreq = kzalloc(sizeof(struct fscache_write_request), GFP_NOFS); if (!wreq) diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c index 42e98bb523e3..bf9b33d26e31 100644 --- a/fs/netfs/fscache_main.c +++ b/fs/netfs/fscache_main.c @@ -99,7 +99,7 @@ error_wq: */ void __exit fscache_exit(void) { - _enter(""); + kenter(""); kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c index cdf991bdd9de..2e2a405ca9b0 100644 --- a/fs/netfs/fscache_volume.c +++ b/fs/netfs/fscache_volume.c @@ -27,6 +27,19 @@ struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, return volume; } +struct fscache_volume *fscache_try_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where) +{ + int ref; + + if (!__refcount_inc_not_zero(&volume->ref, &ref)) + return NULL; + + trace_fscache_volume(volume->debug_id, ref + 1, where); + return volume; +} +EXPORT_SYMBOL(fscache_try_get_volume); + static void fscache_see_volume(struct fscache_volume *volume, enum fscache_volume_trace where) { @@ -251,7 +264,7 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key, fscache_see_volume(volume, fscache_volume_new_acquire); fscache_stat(&fscache_n_volumes); up_write(&fscache_addremove_sem); - _leave(" = v=%x", volume->debug_id); + kleave(" = v=%x", volume->debug_id); return volume; err_vol: @@ -420,6 +433,7 @@ void fscache_put_volume(struct fscache_volume *volume, fscache_free_volume(volume); } } +EXPORT_SYMBOL(fscache_put_volume); /* * Relinquish a volume representation cookie. @@ -452,7 +466,7 @@ void fscache_withdraw_volume(struct fscache_volume *volume) { int n_accesses; - _debug("withdraw V=%x", volume->debug_id); + kdebug("withdraw V=%x", volume->debug_id); /* Allow wakeups on dec-to-0 */ n_accesses = atomic_dec_return(&volume->n_accesses); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 95e281a8af78..21e46bc9aa49 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -34,7 +34,6 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync); /* * main.c */ -extern unsigned int netfs_debug; extern struct list_head netfs_io_requests; extern spinlock_t netfs_proc_lock; extern mempool_t netfs_request_pool; @@ -63,15 +62,6 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} /* * misc.c */ -#define NETFS_FLAG_PUT_MARK BIT(0) -#define NETFS_FLAG_PAGECACHE_MARK BIT(1) -int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, - struct folio *folio, unsigned int flags, - gfp_t gfp_mask); -int netfs_add_folios_to_buffer(struct xarray *buffer, - struct address_space *mapping, - pgoff_t index, pgoff_t to, gfp_t gfp_mask); -void netfs_clear_buffer(struct xarray *buffer); /* * objects.c @@ -353,8 +343,6 @@ extern const struct seq_operations fscache_volumes_seq_ops; struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, enum fscache_volume_trace where); -void fscache_put_volume(struct fscache_volume *volume, - enum fscache_volume_trace where); bool fscache_begin_volume_access(struct fscache_volume *volume, struct fscache_cookie *cookie, enum fscache_access_trace why); @@ -365,42 +353,12 @@ void fscache_create_volume(struct fscache_volume *volume, bool wait); * debug tracing */ #define dbgprintk(FMT, ...) 
\ - printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + pr_debug("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) -#ifdef __KDEBUG -#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) -#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) -#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) - -#elif defined(CONFIG_NETFS_DEBUG) -#define _enter(FMT, ...) \ -do { \ - if (netfs_debug) \ - kenter(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _leave(FMT, ...) \ -do { \ - if (netfs_debug) \ - kleave(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _debug(FMT, ...) \ -do { \ - if (netfs_debug) \ - kdebug(FMT, ##__VA_ARGS__); \ -} while (0) - -#else -#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) -#endif - /* * assertions */ diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c93851b98368..c7576481c321 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -130,7 +130,7 @@ static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, if (count == remaining) return; - _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", + kdebug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", rreq->debug_id, subreq->debug_index, iov_iter_count(&subreq->io_iter), subreq->transferred, subreq->len, rreq->i_size, @@ -326,7 +326,7 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; int u; - _enter("R=%x[%x]{%llx,%lx},%zd", + kenter("R=%x[%x]{%llx,%lx},%zd", rreq->debug_id, subreq->debug_index, subreq->start, subreq->flags, transferred_or_error); @@ -435,7 +435,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, struct netfs_inode *ictx = netfs_inode(rreq->inode); size_t lsize; - _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); + kenter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); if (rreq->origin != NETFS_DIO_READ) { source = netfs_cache_prepare_read(subreq, rreq->i_size); @@ -518,7 +518,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, subreq->start = rreq->start + rreq->submitted; subreq->len = io_iter->count; - _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); + kdebug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); list_add_tail(&subreq->rreq_link, &rreq->subrequests); /* Call out to the cache to find out what it can do with the remaining @@ -570,7 +570,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) struct iov_iter io_iter; int ret; - _enter("R=%x %llx-%llx", + kenter("R=%x %llx-%llx", rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); if (rreq->len == 0) { @@ -593,7 +593,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) atomic_set(&rreq->nr_outstanding, 1); io_iter = rreq->io_iter; do { - _debug("submit %llx + %llx >= %llx", + kdebug("submit %llx + %llx >= %llx", rreq->start, rreq->submitted, rreq->i_size); if (rreq->origin == NETFS_DIO_READ && rreq->start + rreq->submitted >= rreq->i_size) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 5f0f438e5d21..db824c372842 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -20,10 +20,6 @@ MODULE_LICENSE("GPL"); 
EXPORT_TRACEPOINT_SYMBOL(netfs_sreq); -unsigned netfs_debug; -module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); - static struct kmem_cache *netfs_request_slab; static struct kmem_cache *netfs_subrequest_slab; mempool_t netfs_request_pool; diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index bc1fc54fb724..172808e83ca8 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,87 +8,6 @@ #include <linux/swap.h> #include "internal.h" -/* - * Attach a folio to the buffer and maybe set marks on it to say that we need - * to put the folio later and twiddle the pagecache flags. - */ -int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, - struct folio *folio, unsigned int flags, - gfp_t gfp_mask) -{ - XA_STATE_ORDER(xas, xa, index, folio_order(folio)); - -retry: - xas_lock(&xas); - for (;;) { - xas_store(&xas, folio); - if (!xas_error(&xas)) - break; - xas_unlock(&xas); - if (!xas_nomem(&xas, gfp_mask)) - return xas_error(&xas); - goto retry; - } - - if (flags & NETFS_FLAG_PUT_MARK) - xas_set_mark(&xas, NETFS_BUF_PUT_MARK); - if (flags & NETFS_FLAG_PAGECACHE_MARK) - xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK); - xas_unlock(&xas); - return xas_error(&xas); -} - -/* - * Create the specified range of folios in the buffer attached to the read - * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that - * these need freeing later. - */ -int netfs_add_folios_to_buffer(struct xarray *buffer, - struct address_space *mapping, - pgoff_t index, pgoff_t to, gfp_t gfp_mask) -{ - struct folio *folio; - int ret; - - if (to + 1 == index) /* Page range is inclusive */ - return 0; - - do { - /* TODO: Figure out what order folio can be allocated here */ - folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0); - if (!folio) - return -ENOMEM; - folio->index = index; - ret = netfs_xa_store_and_mark(buffer, index, folio, - NETFS_FLAG_PUT_MARK, gfp_mask); - if (ret < 0) { - folio_put(folio); - return ret; - } - - index += folio_nr_pages(folio); - } while (index <= to && index != 0); - - return 0; -} - -/* - * Clear an xarray buffer, putting a ref on the folios that have - * NETFS_BUF_PUT_MARK set. - */ -void netfs_clear_buffer(struct xarray *buffer) -{ - struct folio *folio; - XA_STATE(xas, buffer, 0); - - rcu_read_lock(); - xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) { - folio_put(folio); - } - rcu_read_unlock(); - xa_destroy(buffer); -} - /** * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback * @mapping: The mapping the folio belongs to. 
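
Side note on the netfs hunks in this area: with the fs/netfs/internal.h and fs/netfs/main.c changes above, the conditional _enter()/_leave()/_debug() wrappers and the netfs_debug module parameter are gone, and kenter()/kleave()/kdebug() are now plain pr_debug() calls. On a kernel built with CONFIG_DYNAMIC_DEBUG, the usual way to get this output back at runtime is dynamic debug, for example:

  echo 'module netfs +p' > /sys/kernel/debug/dynamic_debug/control

(assuming debugfs is mounted at /sys/kernel/debug).
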
@@ -107,7 +26,7 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) struct fscache_cookie *cookie = netfs_i_cookie(ictx); bool need_use = false; - _enter(""); + kenter(""); if (!filemap_dirty_folio(mapping, folio)) return false; @@ -180,7 +99,7 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) struct netfs_folio *finfo; size_t flen = folio_size(folio); - _enter("{%lx},%zx,%zx", folio->index, offset, length); + kenter("{%lx},%zx,%zx", folio->index, offset, length); if (!folio_test_private(folio)) return; diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 426cf87aaf2e..488147439fe0 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -161,7 +161,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, { struct list_head *next; - _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); + kenter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); if (list_empty(&stream->subrequests)) return; @@ -374,7 +374,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) unsigned int notes; int s; - _enter("%llx-%llx", wreq->start, wreq->start + wreq->len); + kenter("%llx-%llx", wreq->start, wreq->start + wreq->len); trace_netfs_collect(wreq); trace_netfs_rreq(wreq, netfs_rreq_trace_collect); @@ -409,7 +409,7 @@ reassess_streams: front = stream->front; while (front) { trace_netfs_collect_sreq(wreq, front); - //_debug("sreq [%x] %llx %zx/%zx", + //kdebug("sreq [%x] %llx %zx/%zx", // front->debug_index, front->start, front->transferred, front->len); /* Stall if there may be a discontinuity. */ @@ -598,7 +598,7 @@ reassess_streams: out: netfs_put_group_many(wreq->group, wreq->nr_group_rel); wreq->nr_group_rel = 0; - _leave(" = %x", notes); + kleave(" = %x", notes); return; need_retry: @@ -606,7 +606,7 @@ need_retry: * that any partially completed op will have had any wholly transferred * folios removed from it. 
*/ - _debug("retry"); + kdebug("retry"); netfs_retry_writes(wreq); goto out; } @@ -621,7 +621,7 @@ void netfs_write_collection_worker(struct work_struct *work) size_t transferred; int s; - _enter("R=%x", wreq->debug_id); + kenter("R=%x", wreq->debug_id); netfs_see_request(wreq, netfs_rreq_trace_see_work); if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) { @@ -684,7 +684,7 @@ void netfs_write_collection_worker(struct work_struct *work) if (wreq->origin == NETFS_DIO_WRITE) inode_dio_end(wreq->inode); - _debug("finished"); + kdebug("finished"); trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); @@ -744,7 +744,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, struct netfs_io_request *wreq = subreq->rreq; struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; - _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); + kenter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); switch (subreq->source) { case NETFS_UPLOAD_TO_SERVER: diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 3aa86e268f40..d7c971df8866 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -99,7 +99,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, if (IS_ERR(wreq)) return wreq; - _enter("R=%x", wreq->debug_id); + kenter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) @@ -159,7 +159,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, subreq->max_nr_segs = INT_MAX; subreq->stream_nr = stream->stream_nr; - _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index); + kenter("R=%x[%x]", wreq->debug_id, subreq->debug_index); trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), @@ -215,7 +215,7 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream, { struct netfs_io_request *wreq = subreq->rreq; - _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); + kenter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) return netfs_write_subrequest_terminated(subreq, subreq->error, false); @@ -272,11 +272,11 @@ int netfs_advance_write(struct netfs_io_request *wreq, size_t part; if (!stream->avail) { - _leave("no write"); + kleave("no write"); return len; } - _enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0); + kenter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0); if (subreq && start != subreq->start + subreq->len) { netfs_issue_write(wreq, stream); @@ -288,7 +288,7 @@ int netfs_advance_write(struct netfs_io_request *wreq, subreq = stream->construct; part = min(subreq->max_len - subreq->len, len); - _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len); + kdebug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len); subreq->len += part; subreq->nr_segs++; @@ -319,7 +319,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, bool to_eof = false, streamw = false; bool debug = false; - _enter(""); + kenter(""); /* netfs_perform_write() may shift i_size around the page or from out * of the page to beyond it, but cannot move i_size into or through the @@ -329,7 +329,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, if (fpos >= i_size) { /* mmap beyond eof. 
*/ - _debug("beyond eof"); + kdebug("beyond eof"); folio_start_writeback(folio); folio_unlock(folio); wreq->nr_group_rel += netfs_folio_written_back(folio); @@ -363,7 +363,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, } flen -= foff; - _debug("folio %zx %zx %zx", foff, flen, fsize); + kdebug("folio %zx %zx %zx", foff, flen, fsize); /* Deal with discontinuities in the stream of dirty pages. These can * arise from a number of sources: @@ -483,11 +483,11 @@ static int netfs_write_folio(struct netfs_io_request *wreq, if (!debug) kdebug("R=%x: No submit", wreq->debug_id); - if (flen < fsize) + if (foff + flen < fsize) for (int s = 0; s < NR_IO_STREAMS; s++) netfs_issue_write(wreq, &wreq->io_streams[s]); - _leave(" = 0"); + kleave(" = 0"); return 0; } @@ -522,7 +522,7 @@ int netfs_writepages(struct address_space *mapping, netfs_stat(&netfs_n_wh_writepages); do { - _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); + kdebug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); /* It appears we don't have to handle cyclic writeback wrapping. */ WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted); @@ -546,14 +546,14 @@ int netfs_writepages(struct address_space *mapping, mutex_unlock(&ictx->wb_lock); netfs_put_request(wreq, false, netfs_rreq_trace_put_return); - _leave(" = %d", error); + kleave(" = %d", error); return error; couldnt_start: netfs_kill_dirty_pages(mapping, wbc, folio); out: mutex_unlock(&ictx->wb_lock); - _leave(" = %d", error); + kleave(" = %d", error); return error; } EXPORT_SYMBOL(netfs_writepages); @@ -590,7 +590,7 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c struct folio *folio, size_t copied, bool to_page_end, struct folio **writethrough_cache) { - _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u", + kenter("R=%x ic=%zu ws=%u cp=%zu tp=%u", wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end); if (!*writethrough_cache) { @@ -624,7 +624,7 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr struct netfs_inode *ictx = netfs_inode(wreq->inode); int ret; - _enter("R=%x", wreq->debug_id); + kenter("R=%x", wreq->debug_id); if (writethrough_cache) netfs_write_folio(wreq, wbc, writethrough_cache); @@ -657,7 +657,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t loff_t start = wreq->start; int error = 0; - _enter("%zx", len); + kenter("%zx", len); if (wreq->origin == NETFS_DIO_WRITE) inode_dio_begin(wreq->inode); @@ -665,7 +665,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t while (len) { // TODO: Prepare content encryption - _debug("unbuffered %zx", len); + kdebug("unbuffered %zx", len); part = netfs_advance_write(wreq, upload, start, len, false); start += part; len -= part; @@ -684,6 +684,6 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t if (list_empty(&upload->subrequests)) netfs_wake_write_collector(wreq, false); - _leave(" = %d", error); + kleave(" = %d", error); return error; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a142287d86f6..cca80b5f54e0 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -122,8 +122,6 @@ static void nfs_readpage_release(struct nfs_page *req, int error) { struct folio *folio = nfs_page_to_folio(req); - if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT) - folio_set_error(folio); if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) if (nfs_netfs_folio_unlock(folio)) folio_unlock(folio); diff 
--git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 13818129d268..1c62a5a9f51d 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -32,15 +32,7 @@ static int nfs_symlink_filler(struct file *file, struct folio *folio) int error; error = NFS_PROTO(inode)->readlink(inode, &folio->page, 0, PAGE_SIZE); - if (error < 0) - goto error; - folio_mark_uptodate(folio); - folio_unlock(folio); - return 0; - -error: - folio_set_error(folio); - folio_unlock(folio); + folio_end_read(folio, error == 0); return error; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2329cbb0e446..a91463ab87a0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -311,7 +311,6 @@ static void nfs_mapping_set_error(struct folio *folio, int error) { struct address_space *mapping = folio_file_mapping(folio); - folio_set_error(folio); filemap_set_wb_err(mapping, error); if (mapping->host) errseq_set(&mapping->host->i_sb->s_wb_err, diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 0b75305fb5f5..dd4e11a703aa 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -247,7 +247,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) dentry = dget(exp->ex_path.dentry); else { dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid, - data_left, fileid_type, + data_left, fileid_type, 0, nfsd_acceptable, exp); if (IS_ERR_OR_NULL(dentry)) { trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp, diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 89caef7513db..ba50388ee4bf 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -377,11 +377,12 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, * @target: offset number of an entry in the group (start point) * @bsize: size in bits * @lock: spin lock protecting @bitmap + * @wrap: whether to wrap around */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, unsigned int bsize, - spinlock_t *lock) + spinlock_t *lock, bool wrap) { int pos, end = bsize; @@ -397,6 +398,8 @@ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, end = target; } + if (!wrap) + return -ENOSPC; /* wrap around */ for (pos = 0; pos < end; pos++) { @@ -495,9 +498,10 @@ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation + * @wrap: whether to wrap around */ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, - struct nilfs_palloc_req *req) + struct nilfs_palloc_req *req, bool wrap) { struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; @@ -516,7 +520,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, entries_per_group = nilfs_palloc_entries_per_group(inode); for (i = 0; i < ngroups; i += n) { - if (group >= ngroups) { + if (group >= ngroups && wrap) { /* wrap around */ group = 0; maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, @@ -550,7 +554,14 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); pos = nilfs_palloc_find_available_slot( - bitmap, group_offset, entries_per_group, lock); + bitmap, group_offset, entries_per_group, lock, + wrap); + /* + * Since the search for a free slot in the second and + * subsequent bitmap blocks always starts from the + * beginning, the wrap flag only has an effect on the + * first search. 
+ */ kunmap_local(bitmap_kaddr); if (pos >= 0) goto found; diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index b667e869ac07..d825a9faca6d 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -50,8 +50,8 @@ struct nilfs_palloc_req { struct buffer_head *pr_entry_bh; }; -int nilfs_palloc_prepare_alloc_entry(struct inode *, - struct nilfs_palloc_req *); +int nilfs_palloc_prepare_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req, bool wrap); void nilfs_palloc_commit_alloc_entry(struct inode *, struct nilfs_palloc_req *); void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 180fc8d36213..fc1caf63a42a 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -75,7 +75,7 @@ int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) { int ret; - ret = nilfs_palloc_prepare_alloc_entry(dat, req); + ret = nilfs_palloc_prepare_alloc_entry(dat, req, true); if (ret < 0) return ret; diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 52e50b1b7f22..4a29b0138d75 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -135,6 +135,9 @@ static bool nilfs_check_folio(struct folio *folio, char *kaddr) goto Enamelen; if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) goto Espan; + if (unlikely(p->inode && + NILFS_PRIVATE_INODE(le64_to_cpu(p->inode)))) + goto Einumber; } if (offs != limit) goto Eend; @@ -160,6 +163,9 @@ Enamelen: goto bad_entry; Espan: error = "directory entry across blocks"; + goto bad_entry; +Einumber: + error = "disallowed inode number"; bad_entry: nilfs_error(sb, "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d", @@ -377,11 +383,39 @@ found: struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop) { - struct nilfs_dir_entry *de = nilfs_get_folio(dir, 0, foliop); + struct folio *folio; + struct nilfs_dir_entry *de, *next_de; + size_t limit; + char *msg; + de = nilfs_get_folio(dir, 0, &folio); if (IS_ERR(de)) return NULL; - return nilfs_next_entry(de); + + limit = nilfs_last_byte(dir, 0); /* is a multiple of chunk size */ + if (unlikely(!limit || le64_to_cpu(de->inode) != dir->i_ino || + !nilfs_match(1, ".", de))) { + msg = "missing '.'"; + goto fail; + } + + next_de = nilfs_next_entry(de); + /* + * If "next_de" has not reached the end of the chunk, there is + * at least one more record. Check whether it matches "..". + */ + if (unlikely((char *)next_de == (char *)de + nilfs_chunk_size(dir) || + !nilfs_match(2, "..", next_de))) { + msg = "missing '..'"; + goto fail; + } + *foliop = folio; + return next_de; + +fail: + nilfs_error(dir->i_sb, "directory #%lu %s", dir->i_ino, msg); + folio_release_kmap(folio, de); + return NULL; } ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 612e609158b5..1e86b9303b7c 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -56,13 +56,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, struct nilfs_palloc_req req; int ret; - req.pr_entry_nr = 0; /* - * 0 says find free inode from beginning - * of a group. dull code!! 
- */ + req.pr_entry_nr = NILFS_FIRST_INO(ifile->i_sb); req.pr_entry_bh = NULL; - ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); + ret = nilfs_palloc_prepare_alloc_entry(ifile, &req, false); if (!ret) { ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, &req.pr_entry_bh); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 728e90be3570..4017f7856440 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -116,9 +116,15 @@ enum { #define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino) #define NILFS_MDT_INODE(sb, ino) \ - ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino))) + ((ino) < NILFS_USER_INO && (NILFS_MDT_INO_BITS & BIT(ino))) #define NILFS_VALID_INODE(sb, ino) \ - ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino))) + ((ino) >= NILFS_FIRST_INO(sb) || \ + ((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino)))) + +#define NILFS_PRIVATE_INODE(ino) ({ \ + ino_t __ino = (ino); \ + ((__ino) < NILFS_USER_INO && (__ino) != NILFS_ROOT_INO && \ + (__ino) != NILFS_SKETCH_INO); }) /** * struct nilfs_transaction_info: context information for synchronization diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index f41d7b6d432c..e44dde57ab65 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -452,6 +452,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, } nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); + if (nilfs->ns_first_ino < NILFS_USER_INO) { + nilfs_err(nilfs->ns_sb, + "too small lower limit for non-reserved inode numbers: %u", + nilfs->ns_first_ino); + return -EINVAL; + } nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 85da0629415d..1e829ed7b0ef 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -182,7 +182,7 @@ struct the_nilfs { unsigned long ns_nrsvsegs; unsigned long ns_first_data_block; int ns_inode_size; - int ns_first_ino; + unsigned int ns_first_ino; u32 ns_crc_seed; /* /sys/fs/<nilfs>/<device> */ diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c index 266c2d7d50bd..2963f3299d7e 100644 --- a/fs/nls/mac-celtic.c +++ b/fs/nls/mac-celtic.c @@ -598,4 +598,5 @@ static void __exit exit_nls_macceltic(void) module_init(init_nls_macceltic) module_exit(exit_nls_macceltic) +MODULE_DESCRIPTION("NLS Codepage macceltic"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c index 9789c6057551..43b20f4bdb67 100644 --- a/fs/nls/mac-centeuro.c +++ b/fs/nls/mac-centeuro.c @@ -528,4 +528,5 @@ static void __exit exit_nls_maccenteuro(void) module_init(init_nls_maccenteuro) module_exit(exit_nls_maccenteuro) +MODULE_DESCRIPTION("NLS Codepage maccenteuro"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c index bb19e7a07d43..62730d6a64e5 100644 --- a/fs/nls/mac-croatian.c +++ b/fs/nls/mac-croatian.c @@ -598,4 +598,5 @@ static void __exit exit_nls_maccroatian(void) module_init(init_nls_maccroatian) module_exit(exit_nls_maccroatian) +MODULE_DESCRIPTION("NLS Codepage maccroatian"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c index 2a7dea36acba..7a5c4d16aac8 100644 --- a/fs/nls/mac-cyrillic.c +++ b/fs/nls/mac-cyrillic.c @@ -493,4 +493,5 @@ static void __exit exit_nls_maccyrillic(void) module_init(init_nls_maccyrillic) module_exit(exit_nls_maccyrillic) +MODULE_DESCRIPTION("NLS Codepage maccyrillic"); 
MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c index 77b001653588..3d22f03a90b6 100644 --- a/fs/nls/mac-gaelic.c +++ b/fs/nls/mac-gaelic.c @@ -563,4 +563,5 @@ static void __exit exit_nls_macgaelic(void) module_init(init_nls_macgaelic) module_exit(exit_nls_macgaelic) +MODULE_DESCRIPTION("NLS Codepage macgaelic"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c index 1eccf499e2eb..de3aa9ddb5b1 100644 --- a/fs/nls/mac-greek.c +++ b/fs/nls/mac-greek.c @@ -493,4 +493,5 @@ static void __exit exit_nls_macgreek(void) module_init(init_nls_macgreek) module_exit(exit_nls_macgreek) +MODULE_DESCRIPTION("NLS Codepage macgreek"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c index cbd0875c6d69..0bba83f9d415 100644 --- a/fs/nls/mac-iceland.c +++ b/fs/nls/mac-iceland.c @@ -598,4 +598,5 @@ static void __exit exit_nls_maciceland(void) module_init(init_nls_maciceland) module_exit(exit_nls_maciceland) +MODULE_DESCRIPTION("NLS Codepage maciceland"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c index fba8357aaf03..493386832dfd 100644 --- a/fs/nls/mac-inuit.c +++ b/fs/nls/mac-inuit.c @@ -528,4 +528,5 @@ static void __exit exit_nls_macinuit(void) module_init(init_nls_macinuit) module_exit(exit_nls_macinuit) +MODULE_DESCRIPTION("NLS Codepage macinuit"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c index b6a98a5208cd..d3c082173c20 100644 --- a/fs/nls/mac-roman.c +++ b/fs/nls/mac-roman.c @@ -633,4 +633,5 @@ static void __exit exit_nls_macroman(void) module_init(init_nls_macroman) module_exit(exit_nls_macroman) +MODULE_DESCRIPTION("NLS Codepage macroman"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c index 25547f023638..a7735852f2d5 100644 --- a/fs/nls/mac-romanian.c +++ b/fs/nls/mac-romanian.c @@ -598,4 +598,5 @@ static void __exit exit_nls_macromanian(void) module_init(init_nls_macromanian) module_exit(exit_nls_macromanian) +MODULE_DESCRIPTION("NLS Codepage macromanian"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c index b5454bc7b7fa..d77e9b6b7d7c 100644 --- a/fs/nls/mac-turkish.c +++ b/fs/nls/mac-turkish.c @@ -598,4 +598,5 @@ static void __exit exit_nls_macturkish(void) module_init(init_nls_macturkish) module_exit(exit_nls_macturkish) +MODULE_DESCRIPTION("NLS Codepage macturkish"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c index a2620650d5e4..068143d71284 100644 --- a/fs/nls/nls_ascii.c +++ b/fs/nls/nls_ascii.c @@ -163,4 +163,5 @@ static void __exit exit_nls_ascii(void) module_init(init_nls_ascii) module_exit(exit_nls_ascii) +MODULE_DESCRIPTION("NLS ASCII (United States)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index a026dbd3593f..18d597e49a19 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -545,4 +545,5 @@ EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); EXPORT_SYMBOL(load_nls_default); +MODULE_DESCRIPTION("Base file system native language support"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c index ace3e19d3407..e22a57a4b828 100644 --- a/fs/nls/nls_cp1250.c +++ b/fs/nls/nls_cp1250.c @@ -343,4 +343,5 @@ static void __exit exit_nls_cp1250(void) module_init(init_nls_cp1250) module_exit(exit_nls_cp1250) +MODULE_DESCRIPTION("NLS Windows CP1250 (Slavic/Central European Languages)"); 
MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c index 9273ddfd08a1..6f46d339f23c 100644 --- a/fs/nls/nls_cp1251.c +++ b/fs/nls/nls_cp1251.c @@ -298,4 +298,5 @@ static void __exit exit_nls_cp1251(void) module_init(init_nls_cp1251) module_exit(exit_nls_cp1251) +MODULE_DESCRIPTION("NLS Windows CP1251 (Bulgarian, Belarusian)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c index 1caf5dfed85b..299e089d4301 100644 --- a/fs/nls/nls_cp1255.c +++ b/fs/nls/nls_cp1255.c @@ -380,5 +380,6 @@ static void __exit exit_nls_cp1255(void) module_init(init_nls_cp1255) module_exit(exit_nls_cp1255) +MODULE_DESCRIPTION("NLS Hebrew charsets (ISO-8859-8, CP1255)"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NLS(iso8859-8); diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c index 7ddb830da3fd..ab880499ea32 100644 --- a/fs/nls/nls_cp437.c +++ b/fs/nls/nls_cp437.c @@ -384,4 +384,5 @@ static void __exit exit_nls_cp437(void) module_init(init_nls_cp437) module_exit(exit_nls_cp437) +MODULE_DESCRIPTION("NLS Codepage 437 (United States, Canada)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c index c593f683a0cd..5c37618296e9 100644 --- a/fs/nls/nls_cp737.c +++ b/fs/nls/nls_cp737.c @@ -347,4 +347,5 @@ static void __exit exit_nls_cp737(void) module_init(init_nls_cp737) module_exit(exit_nls_cp737) +MODULE_DESCRIPTION("NLS Codepage 737 (Greek)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c index 554c863745f2..51ccc908901f 100644 --- a/fs/nls/nls_cp775.c +++ b/fs/nls/nls_cp775.c @@ -316,4 +316,5 @@ static void __exit exit_nls_cp775(void) module_init(init_nls_cp775) module_exit(exit_nls_cp775) +MODULE_DESCRIPTION("NLS Codepage 775 (Baltic Rim)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c index 56cccd14b40b..5f9b9507a8b6 100644 --- a/fs/nls/nls_cp850.c +++ b/fs/nls/nls_cp850.c @@ -312,4 +312,5 @@ static void __exit exit_nls_cp850(void) module_init(init_nls_cp850) module_exit(exit_nls_cp850) +MODULE_DESCRIPTION("NLS Codepage 850 (Europe)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c index 7cdc05ac1d40..fc513a5e8358 100644 --- a/fs/nls/nls_cp852.c +++ b/fs/nls/nls_cp852.c @@ -334,4 +334,5 @@ static void __exit exit_nls_cp852(void) module_init(init_nls_cp852) module_exit(exit_nls_cp852) +MODULE_DESCRIPTION("NLS Codepage 852 (Central/Eastern Europe)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c index 7426eea05663..a43be58adb36 100644 --- a/fs/nls/nls_cp855.c +++ b/fs/nls/nls_cp855.c @@ -296,4 +296,5 @@ static void __exit exit_nls_cp855(void) module_init(init_nls_cp855) module_exit(exit_nls_cp855) +MODULE_DESCRIPTION("NLS Codepage 855 (Cyrillic)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c index 098309733ebd..772cd4195bad 100644 --- a/fs/nls/nls_cp857.c +++ b/fs/nls/nls_cp857.c @@ -298,4 +298,5 @@ static void __exit exit_nls_cp857(void) module_init(init_nls_cp857) module_exit(exit_nls_cp857) +MODULE_DESCRIPTION("NLS Codepage 857 (Turkish)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c index 84224478e731..36cf4ca11966 100644 --- a/fs/nls/nls_cp860.c +++ b/fs/nls/nls_cp860.c @@ -361,4 +361,5 @@ static void __exit exit_nls_cp860(void) module_init(init_nls_cp860) module_exit(exit_nls_cp860) +MODULE_DESCRIPTION("NLS Codepage 860 (Portuguese)"); MODULE_LICENSE("Dual 
BSD/GPL"); diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c index dc873e4be092..b7397d079f8f 100644 --- a/fs/nls/nls_cp861.c +++ b/fs/nls/nls_cp861.c @@ -384,4 +384,5 @@ static void __exit exit_nls_cp861(void) module_init(init_nls_cp861) module_exit(exit_nls_cp861) +MODULE_DESCRIPTION("NLS Codepage 861 (Icelandic)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c index d5263e3c5566..fd3b95d1e95d 100644 --- a/fs/nls/nls_cp862.c +++ b/fs/nls/nls_cp862.c @@ -418,4 +418,5 @@ static void __exit exit_nls_cp862(void) module_init(init_nls_cp862) module_exit(exit_nls_cp862) +MODULE_DESCRIPTION("NLS Codepage 862 (Hebrew)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c index 051c9832e36a..813ae7944249 100644 --- a/fs/nls/nls_cp863.c +++ b/fs/nls/nls_cp863.c @@ -378,4 +378,5 @@ static void __exit exit_nls_cp863(void) module_init(init_nls_cp863) module_exit(exit_nls_cp863) +MODULE_DESCRIPTION("NLS Codepage 863 (Canadian French)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c index 97eb1273b2f7..d9eb6d5cd47a 100644 --- a/fs/nls/nls_cp864.c +++ b/fs/nls/nls_cp864.c @@ -404,4 +404,5 @@ static void __exit exit_nls_cp864(void) module_init(init_nls_cp864) module_exit(exit_nls_cp864) +MODULE_DESCRIPTION("NLS Codepage 864 (Arabic)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c index 111214228525..2678ffd98bb6 100644 --- a/fs/nls/nls_cp865.c +++ b/fs/nls/nls_cp865.c @@ -384,4 +384,5 @@ static void __exit exit_nls_cp865(void) module_init(init_nls_cp865) module_exit(exit_nls_cp865) +MODULE_DESCRIPTION("NLS Codepage 865 (Norwegian, Danish)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c index ffdcbc3fc38d..7e93d0a3802a 100644 --- a/fs/nls/nls_cp866.c +++ b/fs/nls/nls_cp866.c @@ -302,4 +302,5 @@ static void __exit exit_nls_cp866(void) module_init(init_nls_cp866) module_exit(exit_nls_cp866) +MODULE_DESCRIPTION("NLS Codepage 866 (Cyrillic/Russian)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c index 3b5a34589354..4491737dd5cb 100644 --- a/fs/nls/nls_cp869.c +++ b/fs/nls/nls_cp869.c @@ -312,4 +312,5 @@ static void __exit exit_nls_cp869(void) module_init(init_nls_cp869) module_exit(exit_nls_cp869) +MODULE_DESCRIPTION("NLS Codepage 869 (Greek)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c index 8dfaa10710fa..4fcfbf8ca72c 100644 --- a/fs/nls/nls_cp874.c +++ b/fs/nls/nls_cp874.c @@ -271,5 +271,6 @@ static void __exit exit_nls_cp874(void) module_init(init_nls_cp874) module_exit(exit_nls_cp874) +MODULE_DESCRIPTION("NLS Thai charset (CP874, TIS-620)"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NLS(tis-620); diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c index 67b7398e8483..e5e6270fcca6 100644 --- a/fs/nls/nls_cp932.c +++ b/fs/nls/nls_cp932.c @@ -7929,5 +7929,6 @@ static void __exit exit_nls_cp932(void) module_init(init_nls_cp932) module_exit(exit_nls_cp932) +MODULE_DESCRIPTION("NLS Japanese charset (Shift-JIS)"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NLS(sjis); diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c index c96546cfec9f..91d0a15fd7f9 100644 --- a/fs/nls/nls_cp936.c +++ b/fs/nls/nls_cp936.c @@ -11107,5 +11107,6 @@ static void __exit exit_nls_cp936(void) module_init(init_nls_cp936) module_exit(exit_nls_cp936) +MODULE_DESCRIPTION("NLS Simplified Chinese charset (CP936, GB2312)"); MODULE_LICENSE("Dual BSD/GPL"); 
MODULE_ALIAS_NLS(gb2312); diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c index 199171e97aa4..3ae03c76d59c 100644 --- a/fs/nls/nls_cp949.c +++ b/fs/nls/nls_cp949.c @@ -13942,5 +13942,6 @@ static void __exit exit_nls_cp949(void) module_init(init_nls_cp949) module_exit(exit_nls_cp949) +MODULE_DESCRIPTION("NLS Korean charset (CP949, EUC-KR)"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NLS(euc-kr); diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c index 8e1418708209..e968aa80198d 100644 --- a/fs/nls/nls_cp950.c +++ b/fs/nls/nls_cp950.c @@ -9478,5 +9478,6 @@ static void __exit exit_nls_cp950(void) module_init(init_nls_cp950) module_exit(exit_nls_cp950) +MODULE_DESCRIPTION("NLS Traditional Chinese charset (Big5)"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NLS(big5); diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index 162b3f160353..0191cc9d955e 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -577,4 +577,5 @@ static void __exit exit_nls_euc_jp(void) module_init(init_nls_euc_jp) module_exit(exit_nls_euc_jp) +MODULE_DESCRIPTION("NLS Japanese charset (EUC-JP)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c index 69ac020d43b1..a181be488f7d 100644 --- a/fs/nls/nls_iso8859-1.c +++ b/fs/nls/nls_iso8859-1.c @@ -254,4 +254,5 @@ static void __exit exit_nls_iso8859_1(void) module_init(init_nls_iso8859_1) module_exit(exit_nls_iso8859_1) +MODULE_DESCRIPTION("NLS ISO 8859-1 (Latin 1; Western European Languages)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c index afb3f8f275f0..8e2be5bfeaf1 100644 --- a/fs/nls/nls_iso8859-13.c +++ b/fs/nls/nls_iso8859-13.c @@ -282,4 +282,5 @@ static void __exit exit_nls_iso8859_13(void) module_init(init_nls_iso8859_13) module_exit(exit_nls_iso8859_13) +MODULE_DESCRIPTION("NLS ISO 8859-13 (Latin 7; Baltic)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c index 046370f0b6f0..c789eccb8a69 100644 --- a/fs/nls/nls_iso8859-14.c +++ b/fs/nls/nls_iso8859-14.c @@ -338,4 +338,5 @@ static void __exit exit_nls_iso8859_14(void) module_init(init_nls_iso8859_14) module_exit(exit_nls_iso8859_14) +MODULE_DESCRIPTION("NLS ISO 8859-14 (Latin 8; Celtic)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c index 7e34a841a056..ffec649176fb 100644 --- a/fs/nls/nls_iso8859-15.c +++ b/fs/nls/nls_iso8859-15.c @@ -304,4 +304,5 @@ static void __exit exit_nls_iso8859_15(void) module_init(init_nls_iso8859_15) module_exit(exit_nls_iso8859_15) +MODULE_DESCRIPTION("NLS ISO 8859-15 (Latin 9; Western European Languages with Euro)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c index 7dd571181741..d352334d0314 100644 --- a/fs/nls/nls_iso8859-2.c +++ b/fs/nls/nls_iso8859-2.c @@ -305,4 +305,5 @@ static void __exit exit_nls_iso8859_2(void) module_init(init_nls_iso8859_2) module_exit(exit_nls_iso8859_2) +MODULE_DESCRIPTION("NLS ISO 8859-2 (Latin 2; Slavic/Central European Languages)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c index 740b75ec4493..09990e6634d2 100644 --- a/fs/nls/nls_iso8859-3.c +++ b/fs/nls/nls_iso8859-3.c @@ -305,4 +305,5 @@ static void __exit exit_nls_iso8859_3(void) module_init(init_nls_iso8859_3) module_exit(exit_nls_iso8859_3) +MODULE_DESCRIPTION("NLS ISO 8859-3 (Latin 3; Esperanto, Galician, Maltese, Turkish)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git 
a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c index 8826021e32f5..92795224912e 100644 --- a/fs/nls/nls_iso8859-4.c +++ b/fs/nls/nls_iso8859-4.c @@ -305,4 +305,5 @@ static void __exit exit_nls_iso8859_4(void) module_init(init_nls_iso8859_4) module_exit(exit_nls_iso8859_4) +MODULE_DESCRIPTION("NLS ISO 8859-4 (Latin 4; old Baltic charset)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c index 7c04057a1ad8..32309315307a 100644 --- a/fs/nls/nls_iso8859-5.c +++ b/fs/nls/nls_iso8859-5.c @@ -269,4 +269,5 @@ static void __exit exit_nls_iso8859_5(void) module_init(init_nls_iso8859_5) module_exit(exit_nls_iso8859_5) +MODULE_DESCRIPTION("NLS ISO 8859-5 (Cyrillic)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c index d4a881400d74..c18183469d2a 100644 --- a/fs/nls/nls_iso8859-6.c +++ b/fs/nls/nls_iso8859-6.c @@ -260,4 +260,5 @@ static void __exit exit_nls_iso8859_6(void) module_init(init_nls_iso8859_6) module_exit(exit_nls_iso8859_6) +MODULE_DESCRIPTION("NLS ISO 8859-6 (Arabic)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c index 37b75d825a75..3652d6832864 100644 --- a/fs/nls/nls_iso8859-7.c +++ b/fs/nls/nls_iso8859-7.c @@ -314,4 +314,5 @@ static void __exit exit_nls_iso8859_7(void) module_init(init_nls_iso8859_7) module_exit(exit_nls_iso8859_7) +MODULE_DESCRIPTION("NLS ISO 8859-7 (Modern Greek)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c index 557b98250d37..11a67834b855 100644 --- a/fs/nls/nls_iso8859-9.c +++ b/fs/nls/nls_iso8859-9.c @@ -269,4 +269,5 @@ static void __exit exit_nls_iso8859_9(void) module_init(init_nls_iso8859_9) module_exit(exit_nls_iso8859_9) +MODULE_DESCRIPTION("NLS ISO 8859-9 (Latin 5; Turkish)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c index 811f232fccfb..e3dca27a3803 100644 --- a/fs/nls/nls_koi8-r.c +++ b/fs/nls/nls_koi8-r.c @@ -320,4 +320,5 @@ static void __exit exit_nls_koi8_r(void) module_init(init_nls_koi8_r) module_exit(exit_nls_koi8_r) +MODULE_DESCRIPTION("NLS KOI8-R (Russian)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c index a80a741a8676..07afcd9e58c0 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -79,4 +79,5 @@ static void __exit exit_nls_koi8_ru(void) module_init(init_nls_koi8_ru) module_exit(exit_nls_koi8_ru) +MODULE_DESCRIPTION("NLS KOI8-RU (Belarusian)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c index 7e029e4c188a..f60645758c1a 100644 --- a/fs/nls/nls_koi8-u.c +++ b/fs/nls/nls_koi8-u.c @@ -327,4 +327,5 @@ static void __exit exit_nls_koi8_u(void) module_init(init_nls_koi8_u) module_exit(exit_nls_koi8_u) +MODULE_DESCRIPTION("NLS KOI8-U (Ukrainian)"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_ucs2_utils.c b/fs/nls/nls_ucs2_utils.c index a69781c54dd8..d4564b79d7bf 100644 --- a/fs/nls/nls_ucs2_utils.c +++ b/fs/nls/nls_ucs2_utils.c @@ -16,6 +16,7 @@ #include <asm/unaligned.h> #include "nls_ucs2_utils.h" +MODULE_DESCRIPTION("NLS UCS-2"); MODULE_LICENSE("GPL"); /* diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index afcfbc4a14db..a0fa0610eaac 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -64,4 +64,5 @@ static void __exit exit_nls_utf8(void) module_init(init_nls_utf8) module_exit(exit_nls_utf8) +MODULE_DESCRIPTION("NLS UTF-8"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nsfs.c b/fs/nsfs.c 
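
As a rough illustration of the pid-translation ioctls added in the fs/nsfs.c hunk that follows (NS_GET_PID_FROM_PIDNS, NS_GET_TGID_FROM_PIDNS, NS_GET_PID_IN_PIDNS, NS_GET_TGID_IN_PIDNS), the sketch below shows how a caller holding a pid-namespace file descriptor might translate a pid from that namespace into its own numbering. It is not part of the patch: it assumes uapi headers that already carry these definitions, and the /proc path is only an example.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/nsfs.h>		/* NS_GET_PID_FROM_PIDNS etc. (assumes recent uapi headers) */

int main(int argc, char **argv)
{
	int fd, ret;

	if (argc < 3) {
		fprintf(stderr, "usage: %s /proc/<pid>/ns/pid <pid-in-that-ns>\n", argv[0]);
		return 1;
	}

	/* A pid-namespace fd, e.g. opened from /proc/<pid>/ns/pid. */
	fd = open(argv[1], O_RDONLY | O_CLOEXEC);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * The ioctl argument is a pid as numbered inside the namespace the fd
	 * refers to; the return value is that task's pid in the caller's own
	 * namespace, or -1 with errno set to ESRCH if no such task exists.
	 */
	ret = ioctl(fd, NS_GET_PID_FROM_PIDNS, (unsigned long)atol(argv[2]));
	if (ret < 0)
		perror("NS_GET_PID_FROM_PIDNS");
	else
		printf("pid %s in that namespace is pid %d here\n", argv[2], ret);
	return 0;
}

NS_GET_PID_IN_PIDNS works in the opposite direction (the argument is a pid in the caller's namespace, the result its number inside the namespace the fd refers to), and the *_TGID_* variants translate thread-group ids instead of thread pids.
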
index 07e22a15ef02..a4a925dce331 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -8,10 +8,12 @@ #include <linux/magic.h> #include <linux/ktime.h> #include <linux/seq_file.h> +#include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/nsfs.h> #include <linux/uaccess.h> +#include "mount.h" #include "internal.h" static struct vfsmount *nsfs_mnt; @@ -82,40 +84,47 @@ int ns_get_path(struct path *path, struct task_struct *task, return ns_get_path_cb(path, ns_get_path_task, &args); } -int open_related_ns(struct ns_common *ns, - struct ns_common *(*get_ns)(struct ns_common *ns)) +/** + * open_namespace - open a namespace + * @ns: the namespace to open + * + * This will consume a reference to @ns indendent of success or failure. + * + * Return: A file descriptor on success or a negative error code on failure. + */ +int open_namespace(struct ns_common *ns) { - struct path path = {}; - struct ns_common *relative; + struct path path __free(path_put) = {}; struct file *f; int err; - int fd; - fd = get_unused_fd_flags(O_CLOEXEC); + /* call first to consume reference */ + err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (err < 0) + return err; + + CLASS(get_unused_fd, fd)(O_CLOEXEC); if (fd < 0) return fd; - relative = get_ns(ns); - if (IS_ERR(relative)) { - put_unused_fd(fd); - return PTR_ERR(relative); - } + f = dentry_open(&path, O_RDONLY, current_cred()); + if (IS_ERR(f)) + return PTR_ERR(f); - err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path); - if (err < 0) { - put_unused_fd(fd); - return err; - } + fd_install(fd, f); + return take_fd(fd); +} - f = dentry_open(&path, O_RDONLY, current_cred()); - path_put(&path); - if (IS_ERR(f)) { - put_unused_fd(fd); - fd = PTR_ERR(f); - } else - fd_install(fd, f); +int open_related_ns(struct ns_common *ns, + struct ns_common *(*get_ns)(struct ns_common *ns)) +{ + struct ns_common *relative; + + relative = get_ns(ns); + if (IS_ERR(relative)) + return PTR_ERR(relative); - return fd; + return open_namespace(relative); } EXPORT_SYMBOL_GPL(open_related_ns); @@ -123,9 +132,12 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct user_namespace *user_ns; + struct pid_namespace *pid_ns; + struct task_struct *tsk; struct ns_common *ns = get_proc_ns(file_inode(filp)); uid_t __user *argp; uid_t uid; + int ret; switch (ioctl) { case NS_GET_USERNS: @@ -143,9 +155,69 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, argp = (uid_t __user *) arg; uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); + case NS_GET_MNTNS_ID: { + struct mnt_namespace *mnt_ns; + __u64 __user *idp; + __u64 id; + + if (ns->ops->type != CLONE_NEWNS) + return -EINVAL; + + mnt_ns = container_of(ns, struct mnt_namespace, ns); + idp = (__u64 __user *)arg; + id = mnt_ns->seq; + return put_user(id, idp); + } + case NS_GET_PID_FROM_PIDNS: + fallthrough; + case NS_GET_TGID_FROM_PIDNS: + fallthrough; + case NS_GET_PID_IN_PIDNS: + fallthrough; + case NS_GET_TGID_IN_PIDNS: + if (ns->ops->type != CLONE_NEWPID) + return -EINVAL; + + ret = -ESRCH; + pid_ns = container_of(ns, struct pid_namespace, ns); + + rcu_read_lock(); + + if (ioctl == NS_GET_PID_IN_PIDNS || + ioctl == NS_GET_TGID_IN_PIDNS) + tsk = find_task_by_vpid(arg); + else + tsk = find_task_by_pid_ns(arg, pid_ns); + if (!tsk) + break; + + switch (ioctl) { + case NS_GET_PID_FROM_PIDNS: + ret = task_pid_vnr(tsk); + break; + case NS_GET_TGID_FROM_PIDNS: + ret = task_tgid_vnr(tsk); + break; + case NS_GET_PID_IN_PIDNS: + ret = 
task_pid_nr_ns(tsk, pid_ns); + break; + case NS_GET_TGID_IN_PIDNS: + ret = task_tgid_nr_ns(tsk, pid_ns); + break; + default: + ret = 0; + break; + } + rcu_read_unlock(); + + if (!ret) + ret = -ESRCH; + break; default: - return -ENOTTY; + ret = -ENOTTY; } + + return ret; } int ns_get_name(char *buf, size_t size, struct task_struct *task, diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 27fbde2701b6..c5b688c5f984 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -259,8 +259,8 @@ enum Opt { // clang-format off static const struct fs_parameter_spec ntfs_fs_parameters[] = { - fsparam_u32("uid", Opt_uid), - fsparam_u32("gid", Opt_gid), + fsparam_uid("uid", Opt_uid), + fsparam_gid("gid", Opt_gid), fsparam_u32oct("umask", Opt_umask), fsparam_u32oct("dmask", Opt_dmask), fsparam_u32oct("fmask", Opt_fmask), @@ -319,14 +319,10 @@ static int ntfs_fs_parse_param(struct fs_context *fc, switch (opt) { case Opt_uid: - opts->fs_uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(opts->fs_uid)) - return invalf(fc, "ntfs3: Invalid value for uid."); + opts->fs_uid = result.uid; break; case Opt_gid: - opts->fs_gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(opts->fs_gid)) - return invalf(fc, "ntfs3: Invalid value for gid."); + opts->fs_gid = result.gid; break; case Opt_umask: if (result.uint_32 & ~07777) diff --git a/fs/open.c b/fs/open.c index 50e45bc7c4d8..22adbef7ecc2 100644 --- a/fs/open.c +++ b/fs/open.c @@ -247,6 +247,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); long ret; + loff_t sum; if (offset < 0 || len <= 0) return -EINVAL; @@ -319,8 +320,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; - /* Check for wrap through zero too */ - if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) + /* Check for wraparound */ + if (check_add_overflow(offset, len, &sum)) + return -EFBIG; + + if (sum > inode->i_sb->s_maxbytes) return -EFBIG; if (!file->f_op->fallocate) @@ -982,12 +986,11 @@ static int do_dentry_open(struct file *f, */ if (f->f_mode & FMODE_WRITE) { /* - * Paired with smp_mb() in collapse_file() to ensure nr_thps - * is up to date and the update to i_writecount by - * get_write_access() is visible. Ensures subsequent insertion - * of THPs into the page cache will fail. + * Depends on full fence from get_write_access() to synchronize + * against collapse_file() regarding i_writecount and nr_thps + * updates. Ensures subsequent insertion of THPs into the page + * cache will fail. */ - smp_mb(); if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; @@ -1004,11 +1007,6 @@ static int do_dentry_open(struct file *f, } } - /* - * Once we return a file with FMODE_OPENED, __fput() will call - * fsnotify_close(), so we need fsnotify_open() here for symmetry. - */ - fsnotify_open(f); return 0; cleanup_all: @@ -1085,8 +1083,19 @@ EXPORT_SYMBOL(file_path); */ int vfs_open(const struct path *path, struct file *file) { + int ret; + file->f_path = *path; - return do_dentry_open(file, NULL); + ret = do_dentry_open(file, NULL); + if (!ret) { + /* + * Once we return a file with FMODE_OPENED, __fput() will call + * fsnotify_close(), so we need fsnotify_open() here for + * symmetry. 
+ */ + fsnotify_open(file); + } + return ret; } struct file *dentry_open(const struct path *path, int flags, @@ -1177,8 +1186,10 @@ struct file *kernel_file_open(const struct path *path, int flags, error = do_dentry_open(f, NULL); if (error) { fput(f); - f = ERR_PTR(error); + return ERR_PTR(error); } + + fsnotify_open(f); return f; } EXPORT_SYMBOL_GPL(kernel_file_open); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index a7b527ea50d3..26ecda0e4d19 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -471,4 +471,5 @@ static void __exit exit_openprom_fs(void) module_init(init_openprom_fs) module_exit(exit_openprom_fs) +MODULE_DESCRIPTION("OpenPROM filesystem support"); MODULE_LICENSE("GPL"); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 085912268442..fdb9b65db1de 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -56,7 +56,6 @@ static int orangefs_writepage_locked(struct page *page, ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, len, wr, NULL, NULL); if (ret < 0) { - SetPageError(page); mapping_set_error(page->mapping, ret); } else { ret = 0; @@ -119,7 +118,6 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, 0, &wr, NULL, NULL); if (ret < 0) { for (i = 0; i < ow->npages; i++) { - SetPageError(ow->pages[i]); mapping_set_error(ow->pages[i]->mapping, ret); if (PagePrivate(ow->pages[i])) { wrp = (struct orangefs_write_range *) @@ -303,15 +301,10 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) iov_iter_zero(~0U, &iter); /* takes care of potential aliasing */ flush_dcache_folio(folio); - if (ret < 0) { - folio_set_error(folio); - } else { - folio_mark_uptodate(folio); + if (ret > 0) ret = 0; - } - /* unlock the folio after the ->read_folio() routine completes */ - folio_unlock(folio); - return ret; + folio_end_read(folio, ret == 0); + return ret; } static int orangefs_write_begin(struct file *file, diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index b501dc07f922..edcca4beb765 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -274,10 +274,8 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap, gossip_err("orangefs error: asked for %d pages, only got %d.\n", bufmap->page_count, ret); - for (i = 0; i < ret; i++) { - SetPageError(bufmap->page_array[i]); + for (i = 0; i < ret; i++) unpin_user_page(bufmap->page_array[i]); - } return -ENOMEM; } diff --git a/fs/pidfs.c b/fs/pidfs.c index dbb9d854d1c5..c9cb14181def 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -11,10 +11,16 @@ #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/pseudo_fs.h> +#include <linux/ptrace.h> #include <linux/seq_file.h> #include <uapi/linux/pidfd.h> +#include <linux/ipc_namespace.h> +#include <linux/time_namespace.h> +#include <linux/utsname.h> +#include <net/net_namespace.h> #include "internal.h" +#include "mount.h" #ifdef CONFIG_PROC_FS /** @@ -108,11 +114,95 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) return poll_flags; } +static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct task_struct *task __free(put_task) = NULL; + struct nsproxy *nsp __free(put_nsproxy) = NULL; + struct pid *pid = pidfd_pid(file); + struct ns_common *ns_common; + + if (arg) + return -EINVAL; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return -ESRCH; + + scoped_guard(task_lock, task) { + nsp = task->nsproxy; + if (nsp) + get_nsproxy(nsp); + } + if (!nsp) + return -ESRCH; /* 
just pretend it didn't exist */ + + /* + * We're trying to open a file descriptor to the namespace so perform a + * filesystem cred ptrace check. Also, we mirror nsfs behavior. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + return -EACCES; + + switch (cmd) { + /* Namespaces that hang of nsproxy. */ + case PIDFD_GET_CGROUP_NAMESPACE: + get_cgroup_ns(nsp->cgroup_ns); + ns_common = to_ns_common(nsp->cgroup_ns); + break; + case PIDFD_GET_IPC_NAMESPACE: + get_ipc_ns(nsp->ipc_ns); + ns_common = to_ns_common(nsp->ipc_ns); + break; + case PIDFD_GET_MNT_NAMESPACE: + get_mnt_ns(nsp->mnt_ns); + ns_common = to_ns_common(nsp->mnt_ns); + break; + case PIDFD_GET_NET_NAMESPACE: + ns_common = to_ns_common(nsp->net_ns); + get_net_ns(ns_common); + break; + case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: + get_pid_ns(nsp->pid_ns_for_children); + ns_common = to_ns_common(nsp->pid_ns_for_children); + break; + case PIDFD_GET_TIME_NAMESPACE: + get_time_ns(nsp->time_ns); + ns_common = to_ns_common(nsp->time_ns); + break; + case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: + get_time_ns(nsp->time_ns_for_children); + ns_common = to_ns_common(nsp->time_ns_for_children); + break; + case PIDFD_GET_UTS_NAMESPACE: + get_uts_ns(nsp->uts_ns); + ns_common = to_ns_common(nsp->uts_ns); + break; + /* Namespaces that don't hang of nsproxy. */ + case PIDFD_GET_USER_NAMESPACE: + rcu_read_lock(); + ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); + rcu_read_unlock(); + break; + case PIDFD_GET_PID_NAMESPACE: + rcu_read_lock(); + ns_common = to_ns_common(get_pid_ns(task_active_pid_ns(task))); + rcu_read_unlock(); + break; + default: + return -ENOIOCTLCMD; + } + + /* open_namespace() unconditionally consumes the reference */ + return open_namespace(ns_common); +} + static const struct file_operations pidfs_file_operations = { .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif + .unlocked_ioctl = pidfd_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; struct pid *pidfd_pid(const struct file *file) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 775ce0bcf08c..c02f1e63f82d 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -202,8 +202,8 @@ int proc_alloc_inum(unsigned int *inum) { int i; - i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1, - GFP_KERNEL); + i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST, + GFP_KERNEL); if (i < 0) return i; @@ -213,7 +213,7 @@ int proc_alloc_inum(unsigned int *inum) void proc_free_inum(unsigned int inum) { - ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); + ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags) diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 0a808951b7d3..e133b507ddf3 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -61,7 +61,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb) return security_sb_show_options(m, sb); } -static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) +static void show_vfsmnt_opts(struct seq_file *m, struct vfsmount *mnt) { static const struct proc_fs_opts mnt_opts[] = { { MNT_NOSUID, ",nosuid" }, @@ -124,7 +124,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) err = show_sb_opts(m, sb); if (err) goto out; - show_mnt_opts(m, mnt); + show_vfsmnt_opts(m, mnt); if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt_path.dentry); seq_puts(m, " 0 0\n"); @@ -153,7 +153,7 @@ static int 
show_mountinfo(struct seq_file *m, struct vfsmount *mnt) goto out; seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); - show_mnt_opts(m, mnt); + show_vfsmnt_opts(m, mnt); /* Tagged fields ("foo:X" or "bar") */ if (IS_MNT_SHARED(r)) diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index d79841e94428..e399e2dd3a12 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -430,5 +430,6 @@ static void __exit exit_qnx4_fs(void) module_init(init_qnx4_fs) module_exit(exit_qnx4_fs) +MODULE_DESCRIPTION("QNX4 file system"); MODULE_LICENSE("GPL"); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index d62fbef838b6..4f1735b882b1 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -694,4 +694,5 @@ static void __exit exit_qnx6_fs(void) module_init(init_qnx6_fs) module_exit(exit_qnx6_fs) +MODULE_DESCRIPTION("QNX6 file system"); MODULE_LICENSE("GPL"); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 627eb2f72ef3..a2b256dac36e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2246,9 +2246,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) int cnt; struct quota_info *dqopt = sb_dqopt(sb); - /* s_umount should be held in exclusive mode */ - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) - up_read(&sb->s_umount); + rwsem_assert_held_write(&sb->s_umount); /* Cannot turn off usage accounting without turning off limits, or * suspend quotas and simultaneously turn quotas off. */ @@ -2510,9 +2508,7 @@ int dquot_resume(struct super_block *sb, int type) int ret = 0, cnt; unsigned int flags; - /* s_umount should be held in exclusive mode */ - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) - up_read(&sb->s_umount); + rwsem_assert_held_write(&sb->s_umount); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) diff --git a/fs/read_write.c b/fs/read_write.c index ef6339391351..90e283b31ca1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -730,7 +730,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, ssize_t ret; init_sync_kiocb(&kiocb, filp); - ret = kiocb_set_rw_flags(&kiocb, flags); + ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); @@ -1736,3 +1736,19 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } + +bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) +{ + size_t len = iov_iter_count(iter); + + if (!iter_is_ubuf(iter)) + return false; + + if (!is_power_of_2(len)) + return false; + + if (!IS_ALIGNED(pos, len)) + return false; + + return true; +} diff --git a/fs/readdir.c b/fs/readdir.c index 278bc0254732..d6c82421902a 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -22,8 +22,6 @@ #include <linux/compat.h> #include <linux/uaccess.h> -#include <asm/unaligned.h> - /* * Some filesystems were never converted to '->iterate_shared()' * and their directory iterators want the inode lock held for @@ -72,7 +70,7 @@ int wrap_directory_iterator(struct file *file, EXPORT_SYMBOL(wrap_directory_iterator); /* - * Note the "unsafe_put_user() semantics: we goto a + * Note the "unsafe_put_user()" semantics: we goto a * label for errors. 
*/ #define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index c1daedc50f4c..9b43a81a6488 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2699,7 +2699,6 @@ fail: } bh = bh->b_this_page; } while (bh != head); - folio_set_error(folio); BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); folio_unlock(folio); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 2cbb92462074..68758b6fed94 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -101,19 +101,15 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); */ static int romfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; loff_t offset, size; unsigned long fillsize, pos; void *buf; int ret; - buf = kmap(page); - if (!buf) - return -ENOMEM; + buf = kmap_local_folio(folio, 0); - /* 32 bit warning -- but not for us :) */ - offset = page_offset(page); + offset = folio_pos(folio); size = i_size_read(inode); fillsize = 0; ret = 0; @@ -125,20 +121,14 @@ static int romfs_read_folio(struct file *file, struct folio *folio) ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); if (ret < 0) { - SetPageError(page); fillsize = 0; ret = -EIO; } } - if (fillsize < PAGE_SIZE) - memset(buf + fillsize, 0, PAGE_SIZE - fillsize); - if (ret == 0) - SetPageUptodate(page); - - flush_dcache_page(page); - kunmap(page); - unlock_page(page); + buf = folio_zero_tail(folio, fillsize, buf); + kunmap_local(buf); + folio_end_read(folio, ret == 0); return ret; } diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 557b68e99d0a..a865941724c0 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1918,8 +1918,8 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ -#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) -#define CIFSSEC_MAX (CIFSSEC_MUST_NTLMV2) +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP | CIFSSEC_MAY_SEAL) +#define CIFSSEC_MAX (CIFSSEC_MAY_SIGN | CIFSSEC_MUST_KRB5 | CIFSSEC_MAY_SEAL) #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) /* ***************************************************************** diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index f1f2573bb18d..1374635e89fa 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -246,35 +246,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) } /* - * Expand the size of a readahead to the size of the rsize, if at least as - * large as a page, allowing for the possibility that rsize is not pow-2 - * aligned. - */ -static void cifs_expand_readahead(struct netfs_io_request *rreq) -{ - unsigned int rsize = rreq->rsize; - loff_t misalignment, i_size = i_size_read(rreq->inode); - - if (rsize < PAGE_SIZE) - return; - - if (rsize < INT_MAX) - rsize = roundup_pow_of_two(rsize); - else - rsize = ((unsigned int)INT_MAX + 1) / 2; - - misalignment = rreq->start & (rsize - 1); - if (misalignment) { - rreq->start -= misalignment; - rreq->len += misalignment; - } - - rreq->len = round_up(rreq->len, rsize); - if (rreq->start < i_size && rreq->len > i_size - rreq->start) - rreq->len = i_size - rreq->start; -} - -/* * Completion of a request operation. 
*/ static void cifs_rreq_done(struct netfs_io_request *rreq) @@ -329,7 +300,6 @@ const struct netfs_request_ops cifs_req_ops = { .init_request = cifs_init_request, .free_request = cifs_free_request, .free_subrequest = cifs_free_subrequest, - .expand_readahead = cifs_expand_readahead, .clamp_length = cifs_clamp_length, .issue_read = cifs_req_issue_read, .done = cifs_rreq_done, diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 3bbac925d076..bc926ab2555b 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -128,12 +128,14 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag("compress", Opt_compress), fsparam_flag("witness", Opt_witness), + /* Mount options which take uid or gid */ + fsparam_uid("backupuid", Opt_backupuid), + fsparam_gid("backupgid", Opt_backupgid), + fsparam_uid("uid", Opt_uid), + fsparam_uid("cruid", Opt_cruid), + fsparam_gid("gid", Opt_gid), + /* Mount options which take numeric value */ - fsparam_u32("backupuid", Opt_backupuid), - fsparam_u32("backupgid", Opt_backupgid), - fsparam_u32("uid", Opt_uid), - fsparam_u32("cruid", Opt_cruid), - fsparam_u32("gid", Opt_gid), fsparam_u32("file_mode", Opt_file_mode), fsparam_u32("dirmode", Opt_dirmode), fsparam_u32("dir_mode", Opt_dirmode), @@ -951,8 +953,6 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, int i, opt; bool is_smb3 = !strcmp(fc->fs_type->name, "smb3"); bool skip_parsing = false; - kuid_t uid; - kgid_t gid; cifs_dbg(FYI, "CIFS: parsing cifs mount option '%s'\n", param->key); @@ -1083,38 +1083,23 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } break; case Opt_uid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) - goto cifs_parse_mount_err; - ctx->linux_uid = uid; + ctx->linux_uid = result.uid; ctx->uid_specified = true; break; case Opt_cruid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) - goto cifs_parse_mount_err; - ctx->cred_uid = uid; + ctx->cred_uid = result.uid; ctx->cruid_specified = true; break; case Opt_backupuid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) - goto cifs_parse_mount_err; - ctx->backupuid = uid; + ctx->backupuid = result.uid; ctx->backupuid_specified = true; break; case Opt_backupgid: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) - goto cifs_parse_mount_err; - ctx->backupgid = gid; + ctx->backupgid = result.gid; ctx->backupgid_specified = true; break; case Opt_gid: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) - goto cifs_parse_mount_err; - ctx->linux_gid = gid; + ctx->linux_gid = result.gid; ctx->gid_specified = true; break; case Opt_port: diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 8d10be1fe18a..c3ee42188d25 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -917,6 +917,40 @@ struct smb2_query_directory_rsp { __u8 Buffer[]; } __packed; +/* DeviceType Flags */ +#define FILE_DEVICE_CD_ROM 0x00000002 +#define FILE_DEVICE_CD_ROM_FILE_SYSTEM 0x00000003 +#define FILE_DEVICE_DFS 0x00000006 +#define FILE_DEVICE_DISK 0x00000007 +#define FILE_DEVICE_DISK_FILE_SYSTEM 0x00000008 +#define FILE_DEVICE_FILE_SYSTEM 0x00000009 +#define FILE_DEVICE_NAMED_PIPE 0x00000011 +#define FILE_DEVICE_NETWORK 0x00000012 +#define FILE_DEVICE_NETWORK_FILE_SYSTEM 0x00000014 +#define FILE_DEVICE_NULL 0x00000015 +#define FILE_DEVICE_PARALLEL_PORT 0x00000016 +#define FILE_DEVICE_PRINTER 0x00000018 +#define FILE_DEVICE_SERIAL_PORT 
0x0000001b +#define FILE_DEVICE_STREAMS 0x0000001e +#define FILE_DEVICE_TAPE 0x0000001f +#define FILE_DEVICE_TAPE_FILE_SYSTEM 0x00000020 +#define FILE_DEVICE_VIRTUAL_DISK 0x00000024 +#define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028 + +/* Device Characteristics */ +#define FILE_REMOVABLE_MEDIA 0x00000001 +#define FILE_READ_ONLY_DEVICE 0x00000002 +#define FILE_FLOPPY_DISKETTE 0x00000004 +#define FILE_WRITE_ONCE_MEDIA 0x00000008 +#define FILE_REMOTE_DEVICE 0x00000010 +#define FILE_DEVICE_IS_MOUNTED 0x00000020 +#define FILE_VIRTUAL_VOLUME 0x00000040 +#define FILE_DEVICE_SECURE_OPEN 0x00000100 +#define FILE_CHARACTERISTIC_TS_DEVICE 0x00001000 +#define FILE_CHARACTERISTIC_WEBDAV_DEVICE 0x00002000 +#define FILE_PORTABLE_DEVICE 0x00004000 +#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000 + /* * Maximum number of iovs we need for a set-info request. * The largest one is rename/hardlink diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index e7e07891781b..840c71c66b30 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2051,15 +2051,22 @@ out_err1: * @access: file access flags * @disposition: file disposition flags * @may_flags: set with MAY_ flags + * @is_dir: is creating open flags for directory * * Return: file open flags */ static int smb2_create_open_flags(bool file_present, __le32 access, __le32 disposition, - int *may_flags) + int *may_flags, + bool is_dir) { int oflags = O_NONBLOCK | O_LARGEFILE; + if (is_dir) { + access &= ~FILE_WRITE_DESIRE_ACCESS_LE; + ksmbd_debug(SMB, "Discard write access to a directory\n"); + } + if (access & FILE_READ_DESIRED_ACCESS_LE && access & FILE_WRITE_DESIRE_ACCESS_LE) { oflags |= O_RDWR; @@ -3167,7 +3174,9 @@ int smb2_open(struct ksmbd_work *work) open_flags = smb2_create_open_flags(file_present, daccess, req->CreateDisposition, - &may_flags); + &may_flags, + req->CreateOptions & FILE_DIRECTORY_FILE_LE || + (file_present && S_ISDIR(d_inode(path.dentry)->i_mode))); if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { if (open_flags & (O_CREAT | O_TRUNC)) { @@ -5314,8 +5323,13 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info = (struct filesystem_device_info *)rsp->Buffer; - info->DeviceType = cpu_to_le32(stfs.f_type); - info->DeviceCharacteristics = cpu_to_le32(0x00000020); + info->DeviceType = cpu_to_le32(FILE_DEVICE_DISK); + info->DeviceCharacteristics = + cpu_to_le32(FILE_DEVICE_IS_MOUNTED); + if (!test_tree_conn_flag(work->tcon, + KSMBD_TREE_CONN_FLAG_WRITABLE)) + info->DeviceCharacteristics |= + cpu_to_le32(FILE_READ_ONLY_DEVICE); rsp->OutputBufferLength = cpu_to_le32(8); break; } diff --git a/fs/stat.c b/fs/stat.c index 70bd3e888cfa..89ce1be56310 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -90,6 +90,37 @@ void generic_fill_statx_attr(struct inode *inode, struct kstat *stat) EXPORT_SYMBOL(generic_fill_statx_attr); /** + * generic_fill_statx_atomic_writes - Fill in atomic writes statx attributes + * @stat: Where to fill in the attribute flags + * @unit_min: Minimum supported atomic write length in bytes + * @unit_max: Maximum supported atomic write length in bytes + * + * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from + * atomic write unit_min and unit_max values. 
+ */ +void generic_fill_statx_atomic_writes(struct kstat *stat, + unsigned int unit_min, + unsigned int unit_max) +{ + /* Confirm that the request type is known */ + stat->result_mask |= STATX_WRITE_ATOMIC; + + /* Confirm that the file attribute type is known */ + stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC; + + if (unit_min) { + stat->atomic_write_unit_min = unit_min; + stat->atomic_write_unit_max = unit_max; + /* Initially only allow 1x segment */ + stat->atomic_write_segments_max = 1; + + /* Confirm atomic writes are actually supported */ + stat->attributes |= STATX_ATTR_WRITE_ATOMIC; + } +} +EXPORT_SYMBOL_GPL(generic_fill_statx_atomic_writes); + +/** * vfs_getattr_nosec - getattr without security checks * @path: file to get attributes from * @stat: structure to return attributes in @@ -214,6 +245,43 @@ int getname_statx_lookup_flags(int flags) return lookup_flags; } +static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, + u32 request_mask) +{ + int error = vfs_getattr(path, stat, request_mask, flags); + + if (request_mask & STATX_MNT_ID_UNIQUE) { + stat->mnt_id = real_mount(path->mnt)->mnt_id_unique; + stat->result_mask |= STATX_MNT_ID_UNIQUE; + } else { + stat->mnt_id = real_mount(path->mnt)->mnt_id; + stat->result_mask |= STATX_MNT_ID; + } + + if (path_mounted(path)) + stat->attributes |= STATX_ATTR_MOUNT_ROOT; + stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; + + /* + * If this is a block device inode, override the filesystem + * attributes with the block device specific parameters that need to be + * obtained from the bdev backing inode. + */ + if (S_ISBLK(stat->mode)) + bdev_statx(path, stat, request_mask); + + return error; +} + +static int vfs_statx_fd(int fd, int flags, struct kstat *stat, + u32 request_mask) +{ + CLASS(fd_raw, f)(fd); + if (!f.file) + return -EBADF; + return vfs_statx_path(&f.file->f_path, flags, stat, request_mask); +} + /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename @@ -243,36 +311,13 @@ static int vfs_statx(int dfd, struct filename *filename, int flags, retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) - goto out; - - error = vfs_getattr(&path, stat, request_mask, flags); - - if (request_mask & STATX_MNT_ID_UNIQUE) { - stat->mnt_id = real_mount(path.mnt)->mnt_id_unique; - stat->result_mask |= STATX_MNT_ID_UNIQUE; - } else { - stat->mnt_id = real_mount(path.mnt)->mnt_id; - stat->result_mask |= STATX_MNT_ID; - } - - if (path.mnt->mnt_root == path.dentry) - stat->attributes |= STATX_ATTR_MOUNT_ROOT; - stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; - - /* Handle STATX_DIOALIGN for block devices. */ - if (request_mask & STATX_DIOALIGN) { - struct inode *inode = d_backing_inode(path.dentry); - - if (S_ISBLK(inode->i_mode)) - bdev_statx_dioalign(inode, stat); - } - + return error; + error = vfs_statx_path(&path, flags, stat, request_mask); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } -out: return error; } @@ -289,18 +334,10 @@ int vfs_fstatat(int dfd, const char __user *filename, * If AT_EMPTY_PATH is set, we expect the common case to be that * empty path, and avoid doing all the extra pathname work. 
*/ - if (dfd >= 0 && flags == AT_EMPTY_PATH) { - char c; - - ret = get_user(c, filename); - if (unlikely(ret)) - return ret; + if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename)) + return vfs_fstat(dfd, stat); - if (likely(!c)) - return vfs_fstat(dfd, stat); - } - - name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL); + name = getname_flags(filename, getname_statx_lookup_flags(statx_flags)); ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); putname(name); @@ -488,34 +525,39 @@ static int do_readlinkat(int dfd, const char __user *pathname, char __user *buf, int bufsiz) { struct path path; + struct filename *name; int error; - int empty = 0; unsigned int lookup_flags = LOOKUP_EMPTY; if (bufsiz <= 0) return -EINVAL; retry: - error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty); - if (!error) { - struct inode *inode = d_backing_inode(path.dentry); - - error = empty ? -ENOENT : -EINVAL; - /* - * AFS mountpoints allow readlink(2) but are not symlinks - */ - if (d_is_symlink(path.dentry) || inode->i_op->readlink) { - error = security_inode_readlink(path.dentry); - if (!error) { - touch_atime(&path); - error = vfs_readlink(path.dentry, buf, bufsiz); - } - } - path_put(&path); - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; + name = getname_flags(pathname, lookup_flags); + error = filename_lookup(dfd, name, lookup_flags, &path, NULL); + if (unlikely(error)) { + putname(name); + return error; + } + + /* + * AFS mountpoints allow readlink(2) but are not symlinks + */ + if (d_is_symlink(path.dentry) || + d_backing_inode(path.dentry)->i_op->readlink) { + error = security_inode_readlink(path.dentry); + if (!error) { + touch_atime(&path); + error = vfs_readlink(path.dentry, buf, bufsiz); } + } else { + error = (name->name[0] == '\0') ? -ENOENT : -EINVAL; + } + path_put(&path); + putname(name); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; } return error; } @@ -659,6 +701,9 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_dio_mem_align = stat->dio_mem_align; tmp.stx_dio_offset_align = stat->dio_offset_align; tmp.stx_subvol = stat->subvol; + tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min; + tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max; + tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } @@ -674,7 +719,8 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; - /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests + /* + * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests * from userland. */ mask &= ~STATX_CHANGE_COOKIE; @@ -686,16 +732,41 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, return cp_statx(&stat, buffer); } +int do_statx_fd(int fd, unsigned int flags, unsigned int mask, + struct statx __user *buffer) +{ + struct kstat stat; + int error; + + if (mask & STATX__RESERVED) + return -EINVAL; + if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) + return -EINVAL; + + /* + * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests + * from userland. 
+ */ + mask &= ~STATX_CHANGE_COOKIE; + + error = vfs_statx_fd(fd, flags, &stat, mask); + if (error) + return error; + + return cp_statx(&stat, buffer); +} + /** * sys_statx - System call to get enhanced stats * @dfd: Base directory to pathwalk from *or* fd to stat. - * @filename: File to stat or "" with AT_EMPTY_PATH + * @filename: File to stat or either NULL or "" with AT_EMPTY_PATH * @flags: AT_* flags to control pathwalk. * @mask: Parts of statx struct actually required. * @buffer: Result buffer. * * Note that fstat() can be emulated by setting dfd to the fd of interest, - * supplying "" as the filename and setting AT_EMPTY_PATH in the flags. + * supplying "" (or preferably NULL) as the filename and setting AT_EMPTY_PATH + * in the flags. */ SYSCALL_DEFINE5(statx, int, dfd, const char __user *, filename, unsigned, flags, @@ -703,9 +774,24 @@ SYSCALL_DEFINE5(statx, struct statx __user *, buffer) { int ret; + unsigned lflags; struct filename *name; - name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL); + /* + * Short-circuit handling of NULL and "" paths. + * + * For a NULL path we require and accept only the AT_EMPTY_PATH flag + * (possibly |'d with AT_STATX flags). + * + * However, glibc on 32-bit architectures implements fstatat as statx + * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags. + * Supporting this results in the uglification below. + */ + lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE); + if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename)) + return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer); + + name = getname_flags(filename, getname_statx_lookup_flags(flags)); ret = do_statx(dfd, name, flags, mask, buffer); putname(name); diff --git a/fs/super.c b/fs/super.c index b72f1d288e95..095ba793e10c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1502,8 +1502,17 @@ static int fs_bdev_thaw(struct block_device *bdev) lockdep_assert_held(&bdev->bd_fsfreeze_mutex); + /* + * The block device may have been frozen before it was claimed by a + * filesystem. Concurrently another process might try to mount that + * frozen block device and has temporarily claimed the block device for + * that purpose causing a concurrent fs_bdev_thaw() to end up here. The + * mounter is already about to abort mounting because they still saw an + * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return + * NULL in that case. 
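/*
 * Userspace sketch (not part of the patch): an fstat-style statx() call
 * through the NULL-path fast path added above, reading the new atomic
 * write fields. Assumes uapi/libc headers new enough to declare statx(),
 * STATX_WRITE_ATOMIC and the stx_atomic_write_* members that cp_statx()
 * copies out.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

static void example_print_atomic_write_limits(int fd)
{
        struct statx stx;

        if (statx(fd, NULL, AT_EMPTY_PATH, STATX_WRITE_ATOMIC, &stx) != 0)
                return;

        if (stx.stx_attributes & STATX_ATTR_WRITE_ATOMIC)
                printf("atomic write unit: min %u max %u, max segments %u\n",
                       stx.stx_atomic_write_unit_min,
                       stx.stx_atomic_write_unit_max,
                       stx.stx_atomic_write_segments_max);
}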
+ */ sb = get_bdev_super(bdev); - if (WARN_ON_ONCE(!sb)) + if (!sb) return -EINVAL; if (sb->s_op->thaw_super) diff --git a/fs/sysv/super.c b/fs/sysv/super.c index 3365a30dc1e0..5c0d07ddbda2 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -591,4 +591,5 @@ static void __exit exit_sysv_fs(void) module_init(init_sysv_fs) module_exit(exit_sysv_fs) +MODULE_DESCRIPTION("SystemV Filesystem"); MODULE_LICENSE("GPL"); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 7c29f4afc23d..1028ab6d9a74 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -296,9 +296,9 @@ enum { }; static const struct fs_parameter_spec tracefs_param_specs[] = { - fsparam_u32 ("gid", Opt_gid), + fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("mode", Opt_mode), - fsparam_u32 ("uid", Opt_uid), + fsparam_uid ("uid", Opt_uid), {} }; @@ -306,8 +306,6 @@ static int tracefs_parse_param(struct fs_context *fc, struct fs_parameter *param { struct tracefs_fs_info *opts = fc->s_fs_info; struct fs_parse_result result; - kuid_t uid; - kgid_t gid; int opt; opt = fs_parse(fc, tracefs_param_specs, param, &result); @@ -316,16 +314,10 @@ static int tracefs_parse_param(struct fs_context *fc, struct fs_parameter *param switch (opt) { case Opt_uid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) - return invalf(fc, "Unknown uid"); - opts->uid = uid; + opts->uid = result.uid; break; case Opt_gid: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) - return invalf(fc, "Unknown gid"); - opts->gid = gid; + opts->gid = result.gid; break; case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 27c85d92d1dc..61f25d3cf3f7 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -188,7 +188,6 @@ Eend: "offset=%lu", dir->i_ino, (page->index<<PAGE_SHIFT)+offs); fail: - SetPageError(page); return false; } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index eee7320ab0b0..17e409ceaa33 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2057,7 +2057,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, goto out; features = uffdio_api.features; ret = -EINVAL; - if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) + if (uffdio_api.api != UFFD_API) goto err_out; ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) @@ -2081,6 +2081,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; #endif + + ret = -EINVAL; + if (features & ~uffdio_api.features) + goto err_out; + uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index 118dedef8ebe..fdb4da24d662 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -228,26 +228,19 @@ const struct inode_operations vboxsf_reg_iops = { static int vboxsf_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct vboxsf_handle *sf_handle = file->private_data; - loff_t off = page_offset(page); + loff_t off = folio_pos(folio); u32 nread = PAGE_SIZE; u8 *buf; int err; - buf = kmap(page); + buf = kmap_local_folio(folio, 0); err = vboxsf_read(sf_handle->root, sf_handle->handle, off, &nread, buf); - if (err == 0) { - memset(&buf[nread], 0, PAGE_SIZE - nread); - flush_dcache_page(page); - SetPageUptodate(page); - } else { - SetPageError(page); - } + buf = folio_zero_tail(folio, nread, buf + nread); - kunmap(page); - 
unlock_page(page); + kunmap_local(buf); + folio_end_read(folio, err == 0); return err; } @@ -295,7 +288,6 @@ static int vboxsf_writepage(struct page *page, struct writeback_control *wbc) kref_put(&sf_handle->refcount, vboxsf_handle_release); if (err == 0) { - ClearPageError(page); /* mtime changed */ sf_i->force_restat = 1; } else { diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index ffb1d565da39..e95b8a48d8a0 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -41,8 +41,8 @@ enum { opt_nls, opt_uid, opt_gid, opt_ttl, opt_dmode, opt_fmode, static const struct fs_parameter_spec vboxsf_fs_parameters[] = { fsparam_string ("nls", opt_nls), - fsparam_u32 ("uid", opt_uid), - fsparam_u32 ("gid", opt_gid), + fsparam_uid ("uid", opt_uid), + fsparam_gid ("gid", opt_gid), fsparam_u32 ("ttl", opt_ttl), fsparam_u32oct ("dmode", opt_dmode), fsparam_u32oct ("fmode", opt_fmode), @@ -55,8 +55,6 @@ static int vboxsf_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct vboxsf_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - kuid_t uid; - kgid_t gid; int opt; opt = fs_parse(fc, vboxsf_fs_parameters, param, &result); @@ -73,16 +71,10 @@ static int vboxsf_parse_param(struct fs_context *fc, struct fs_parameter *param) param->string = NULL; break; case opt_uid: - uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(uid)) - return -EINVAL; - ctx->o.uid = uid; + ctx->o.uid = result.uid; break; case opt_gid: - gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(gid)) - return -EINVAL; - ctx->o.gid = gid; + ctx->o.gid = result.gid; break; case opt_ttl: ctx->o.ttl = msecs_to_jiffies(result.uint_32); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9967334ea99f..cf629302d48e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -86,9 +86,8 @@ xfs_inode_alloc( return NULL; } - /* VFS doesn't initialise i_mode or i_state! */ + /* VFS doesn't initialise i_mode! */ VFS_I(ip)->i_mode = 0; - VFS_I(ip)->i_state = 0; mapping_set_large_folios(VFS_I(ip)->i_mapping); XFS_STATS_INC(mp, vn_active); @@ -314,6 +313,7 @@ xfs_reinit_inode( dev_t dev = inode->i_rdev; kuid_t uid = inode->i_uid; kgid_t gid = inode->i_gid; + unsigned long state = inode->i_state; error = inode_init_always(mp->m_super, inode); @@ -324,6 +324,7 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; + inode->i_state = state; mapping_set_large_folios(inode->i_mapping); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ff222827e550..a00dcbc77e12 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -17,6 +17,8 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_bmap_btree.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" @@ -811,6 +813,7 @@ xfs_setattr_size( struct xfs_trans *tp; int error; uint lock_flags = 0; + uint resblks = 0; bool did_zeroing = false; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); @@ -917,7 +920,17 @@ xfs_setattr_size( return error; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + /* + * For realtime inode with more than one block rtextsize, we need the + * block reservation for bmap btree block allocations/splits that can + * happen since it could split the tail written extent and convert the + * right beyond EOF one to unwritten. 
+ */ + if (xfs_inode_has_bigrtalloc(ip)) + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, + 0, 0, &tp); if (error) return error; diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index b20fa25a7e8d..052e5c98c105 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -46,6 +46,7 @@ mandatory-y += pci.h mandatory-y += percpu.h mandatory-y += pgalloc.h mandatory-y += preempt.h +mandatory-y += runtime-const.h mandatory-y += rwonce.h mandatory-y += sections.h mandatory-y += serial.h diff --git a/include/asm-generic/runtime-const.h b/include/asm-generic/runtime-const.h new file mode 100644 index 000000000000..670499459514 --- /dev/null +++ b/include/asm-generic/runtime-const.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RUNTIME_CONST_H +#define _ASM_RUNTIME_CONST_H + +/* + * This is the fallback for when the architecture doesn't + * support the runtime const operations. + * + * We just use the actual symbols as-is. + */ +#define runtime_const_ptr(sym) (sym) +#define runtime_const_shift_right_32(val, sym) ((u32)(val)>>(sym)) +#define runtime_const_init(type,sym) do { } while (0) + +#endif diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5703526d6ebf..389a78415b9b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -944,6 +944,14 @@ #define CON_INITCALL \ BOUNDED_SECTION_POST_LABEL(.con_initcall.init, __con_initcall, _start, _end) +#define RUNTIME_NAME(t,x) runtime_##t##_##x + +#define RUNTIME_CONST(t,x) \ + . = ALIGN(8); \ + RUNTIME_NAME(t,x) : AT(ADDR(RUNTIME_NAME(t,x)) - LOAD_OFFSET) { \ + *(RUNTIME_NAME(t,x)); \ + } + /* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */ #define KUNIT_TABLE() \ . 
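/*
 * Illustrative sketch (not part of the patch) of what the generic
 * fallback above amounts to: a hypothetical hash-table lookup whose base
 * pointer and shift are "runtime constants" (loosely modelled on the kind
 * of hash lookup this infrastructure targets, e.g. the dentry hash). With
 * asm-generic these macros are plain loads; an architecture providing its
 * own asm/runtime-const.h patches the values into the instruction stream
 * once runtime_const_init() has run.
 */
static unsigned long *example_table;    /* hypothetical, set during init */
static unsigned int example_shift;      /* hypothetical, set during init */

static inline unsigned long *example_slot(unsigned int hash)
{
        return runtime_const_ptr(example_table) +
               runtime_const_shift_right_32(hash, example_shift);
}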
= ALIGN(8); \ diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 7428cb43952d..804f856ed3e5 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -7,51 +7,38 @@ struct request; enum blk_integrity_flags { - BLK_INTEGRITY_VERIFY = 1 << 0, - BLK_INTEGRITY_GENERATE = 1 << 1, + BLK_INTEGRITY_NOVERIFY = 1 << 0, + BLK_INTEGRITY_NOGENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, - BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, + BLK_INTEGRITY_REF_TAG = 1 << 3, + BLK_INTEGRITY_STACKED = 1 << 4, }; -struct blk_integrity_iter { - void *prot_buf; - void *data_buf; - sector_t seed; - unsigned int data_size; - unsigned short interval; - unsigned char tuple_size; - unsigned char pi_offset; - const char *disk_name; -}; - -typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); -typedef void (integrity_prepare_fn) (struct request *); -typedef void (integrity_complete_fn) (struct request *, unsigned int); - -struct blk_integrity_profile { - integrity_processing_fn *generate_fn; - integrity_processing_fn *verify_fn; - integrity_prepare_fn *prepare_fn; - integrity_complete_fn *complete_fn; - const char *name; -}; +const char *blk_integrity_profile_name(struct blk_integrity *bi); +bool queue_limits_stack_integrity(struct queue_limits *t, + struct queue_limits *b); +static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, + struct block_device *bdev) +{ + return queue_limits_stack_integrity(t, &bdev->bd_disk->queue->limits); +} #ifdef CONFIG_BLK_DEV_INTEGRITY -void blk_integrity_register(struct gendisk *, struct blk_integrity *); -void blk_integrity_unregister(struct gendisk *); -int blk_integrity_compare(struct gendisk *, struct gendisk *); int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) { - struct blk_integrity *bi = &disk->queue->integrity; + return q->limits.integrity.tuple_size; +} - if (!bi->profile) +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +{ + if (!blk_integrity_queue_supports_integrity(disk->queue)) return NULL; - - return bi; + return &disk->queue->limits.integrity; } static inline struct blk_integrity * @@ -60,12 +47,6 @@ bdev_get_integrity(struct block_device *bdev) return blk_get_integrity(bdev->bd_disk); } -static inline bool -blk_integrity_queue_supports_integrity(struct request_queue *q) -{ - return q->integrity.profile; -} - static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { @@ -100,14 +81,13 @@ static inline bool blk_integrity_rq(struct request *rq) } /* - * Return the first bvec that contains integrity data. Only drivers that are - * limited to a single integrity segment should use this helper. + * Return the current bvec that contains the integrity data. bip_iter may be + * advanced to iterate over the integrity data. 
*/ -static inline struct bio_vec *rq_integrity_vec(struct request *rq) +static inline struct bio_vec rq_integrity_vec(struct request *rq) { - if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1)) - return NULL; - return rq->bio->bi_integrity->bip_vec; + return mp_bvec_iter_bvec(rq->bio->bi_integrity->bip_vec, + rq->bio->bi_integrity->bip_iter); } #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline int blk_rq_count_integrity_sg(struct request_queue *q, @@ -134,17 +114,6 @@ blk_integrity_queue_supports_integrity(struct request_queue *q) { return false; } -static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) -{ - return 0; -} -static inline void blk_integrity_register(struct gendisk *d, - struct blk_integrity *b) -{ -} -static inline void blk_integrity_unregister(struct gendisk *d) -{ -} static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { @@ -167,9 +136,11 @@ static inline int blk_integrity_rq(struct request *rq) return 0; } -static inline struct bio_vec *rq_integrity_vec(struct request *rq) +static inline struct bio_vec rq_integrity_vec(struct request *rq) { - return NULL; + /* the optimizer will remove all calls to this function */ + return (struct bio_vec){ }; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ + #endif /* _LINUX_BLK_INTEGRITY_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 781c4500491b..632edd71f8c6 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -162,6 +162,11 @@ typedef u16 blk_short_t; */ #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17) +/* + * Invalid size or alignment. + */ +#define BLK_STS_INVAL ((__force blk_status_t)19) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with @@ -370,7 +375,7 @@ enum req_flag_bits { __REQ_SWAP, /* swap I/O */ __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ - + __REQ_ATOMIC, /* for atomic write operations */ /* * Command specific flags, keep last: */ @@ -402,6 +407,7 @@ enum req_flag_bits { #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) +#define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 24c36929920b..b8196e219ac2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -105,9 +105,16 @@ enum { struct disk_events; struct badblocks; +enum blk_integrity_checksum { + BLK_INTEGRITY_CSUM_NONE = 0, + BLK_INTEGRITY_CSUM_IP = 1, + BLK_INTEGRITY_CSUM_CRC = 2, + BLK_INTEGRITY_CSUM_CRC64 = 3, +} __packed ; + struct blk_integrity { - const struct blk_integrity_profile *profile; unsigned char flags; + enum blk_integrity_checksum csum_type; unsigned char tuple_size; unsigned char pi_offset; unsigned char interval_exp; @@ -261,6 +268,7 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +/* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) @@ -275,17 +283,75 @@ static inline bool blk_op_is_passthrough(blk_opf_t op) return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } +/* flags set by the driver in queue_limits.features */ +typedef unsigned int 
__bitwise blk_features_t; + +/* supports a volatile write cache */ +#define BLK_FEAT_WRITE_CACHE ((__force blk_features_t)(1u << 0)) + +/* supports passing on the FUA bit */ +#define BLK_FEAT_FUA ((__force blk_features_t)(1u << 1)) + +/* rotational device (hard drive or floppy) */ +#define BLK_FEAT_ROTATIONAL ((__force blk_features_t)(1u << 2)) + +/* contributes to the random number pool */ +#define BLK_FEAT_ADD_RANDOM ((__force blk_features_t)(1u << 3)) + +/* do disk/partitions IO accounting */ +#define BLK_FEAT_IO_STAT ((__force blk_features_t)(1u << 4)) + +/* don't modify data until writeback is done */ +#define BLK_FEAT_STABLE_WRITES ((__force blk_features_t)(1u << 5)) + +/* always completes in submit context */ +#define BLK_FEAT_SYNCHRONOUS ((__force blk_features_t)(1u << 6)) + +/* supports REQ_NOWAIT */ +#define BLK_FEAT_NOWAIT ((__force blk_features_t)(1u << 7)) + +/* supports DAX */ +#define BLK_FEAT_DAX ((__force blk_features_t)(1u << 8)) + +/* supports I/O polling */ +#define BLK_FEAT_POLL ((__force blk_features_t)(1u << 9)) + +/* is a zoned device */ +#define BLK_FEAT_ZONED ((__force blk_features_t)(1u << 10)) + +/* supports PCI(e) p2p requests */ +#define BLK_FEAT_PCI_P2PDMA ((__force blk_features_t)(1u << 12)) + +/* skip this queue in blk_mq_(un)quiesce_tagset */ +#define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) + +/* bounce all highmem pages */ +#define BLK_FEAT_BOUNCE_HIGH ((__force blk_features_t)(1u << 14)) + +/* undocumented magic for bcache */ +#define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ + ((__force blk_features_t)(1u << 15)) + /* - * BLK_BOUNCE_NONE: never bounce (default) - * BLK_BOUNCE_HIGH: bounce all highmem pages + * Flags automatically inherited when stacking limits. */ -enum blk_bounce { - BLK_BOUNCE_NONE, - BLK_BOUNCE_HIGH, -}; +#define BLK_FEAT_INHERIT_MASK \ + (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ + BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH | \ + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) + +/* internal flags in queue_limits.flags */ +typedef unsigned int __bitwise blk_flags_t; + +/* do not send FLUSH/FUA commands despite advertising a write cache */ +#define BLK_FLAG_WRITE_CACHE_DISABLED ((__force blk_flags_t)(1u << 0)) + +/* I/O topology is misaligned */ +#define BLK_FLAG_MISALIGNED ((__force blk_flags_t)(1u << 1)) struct queue_limits { - enum blk_bounce bounce; + blk_features_t features; + blk_flags_t flags; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; @@ -310,14 +376,20 @@ struct queue_limits { unsigned int discard_alignment; unsigned int zone_write_granularity; + /* atomic write limits */ + unsigned int atomic_write_hw_max; + unsigned int atomic_write_max_sectors; + unsigned int atomic_write_hw_boundary; + unsigned int atomic_write_boundary_sectors; + unsigned int atomic_write_hw_unit_min; + unsigned int atomic_write_unit_min; + unsigned int atomic_write_hw_unit_max; + unsigned int atomic_write_unit_max; + unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; - unsigned char misaligned; - unsigned char discard_misaligned; - unsigned char raid_partial_stripes_expensive; - bool zoned; unsigned int max_open_zones; unsigned int max_active_zones; @@ -327,13 +399,14 @@ struct queue_limits { * due to possible offsets. 
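/*
 * Illustrative sketch (not part of the patch): with the feature flags
 * above, a driver can describe its queue declaratively in struct
 * queue_limits rather than poking QUEUE_FLAG_* bits, e.g. for a
 * write-back-cached, FUA-capable rotational device. The limits would then
 * be handed to the disk/queue allocation helpers (e.g. blk_mq_alloc_disk()),
 * which validate them via blk_validate_limits().
 */
static struct queue_limits example_lim = {
        .features               = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
                                  BLK_FEAT_ROTATIONAL,
        .logical_block_size     = 512,
        .physical_block_size    = 4096,
};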
*/ unsigned int dma_alignment; + unsigned int dma_pad_mask; + + struct blk_integrity integrity; }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); -void disk_set_zoned(struct gendisk *disk); - #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -414,10 +487,6 @@ struct request_queue { struct queue_limits limits; -#ifdef CONFIG_BLK_DEV_INTEGRITY - struct blk_integrity integrity; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - #ifdef CONFIG_PM struct device *dev; enum rpm_status rpm_status; @@ -439,8 +508,6 @@ struct request_queue { */ int id; - unsigned int dma_pad_mask; - /* * queue settings */ @@ -526,38 +593,20 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ -#define QUEUE_FLAG_NONROT 6 /* non-rotational device (SSD) */ -#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ -#define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ -#define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ -#define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ -#define QUEUE_FLAG_HW_WC 13 /* Write back caching supported */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ -#define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ -#define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ -#define QUEUE_FLAG_WC 17 /* Write back caching */ -#define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ -#define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ -#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ -#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ -#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ -#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ -#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ - (1UL << QUEUE_FLAG_SAME_COMP) | \ - (1UL << QUEUE_FLAG_NOWAIT)) +#define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) @@ -565,16 +614,10 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) -#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) -#define blk_queue_stable_writes(q) \ - test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) -#define 
blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) -#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_zone_resetall(q) \ - test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) -#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) -#define blk_queue_pci_p2pdma(q) \ - test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) +#define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) +#define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) +#define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) +#define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA) #ifdef CONFIG_BLK_RQ_ALLOC_TIME #define blk_queue_rq_alloc_time(q) \ test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags) @@ -590,7 +633,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) #define blk_queue_skip_tagset_quiesce(q) \ - test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags) + ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); @@ -620,16 +663,26 @@ static inline enum rpm_status queue_rpm_status(struct request_queue *q) static inline bool blk_queue_is_zoned(struct request_queue *q) { - return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && q->limits.zoned; + return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (q->limits.features & BLK_FEAT_ZONED); } #ifdef CONFIG_BLK_DEV_ZONED -unsigned int bdev_nr_zones(struct block_device *bdev); - static inline unsigned int disk_nr_zones(struct gendisk *disk) { - return blk_queue_is_zoned(disk->queue) ? 
disk->nr_zones : 0; + return disk->nr_zones; +} +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); +#else /* CONFIG_BLK_DEV_ZONED */ +static inline unsigned int disk_nr_zones(struct gendisk *disk) +{ + return 0; } +static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + return false; +} +#endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { @@ -638,16 +691,9 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } -static inline void disk_set_max_open_zones(struct gendisk *disk, - unsigned int max_open_zones) -{ - disk->queue->limits.max_open_zones = max_open_zones; -} - -static inline void disk_set_max_active_zones(struct gendisk *disk, - unsigned int max_active_zones) +static inline unsigned int bdev_nr_zones(struct block_device *bdev) { - disk->queue->limits.max_active_zones = max_active_zones; + return disk_nr_zones(bdev->bd_disk); } static inline unsigned int bdev_max_open_zones(struct block_device *bdev) @@ -660,36 +706,6 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) return bdev->bd_disk->queue->limits.max_active_zones; } -bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); -#else /* CONFIG_BLK_DEV_ZONED */ -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - return 0; -} - -static inline unsigned int disk_nr_zones(struct gendisk *disk) -{ - return 0; -} -static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) -{ - return 0; -} -static inline unsigned int bdev_max_open_zones(struct block_device *bdev) -{ - return 0; -} - -static inline unsigned int bdev_max_active_zones(struct block_device *bdev) -{ - return 0; -} -static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) -{ - return false; -} -#endif /* CONFIG_BLK_DEV_ZONED */ - static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) @@ -880,14 +896,15 @@ static inline bool bio_straddles_zones(struct bio *bio) } /* - * Return how much of the chunk is left to be used for I/O at a given offset. + * Return how much within the boundary is left to be used for I/O at a given + * offset. */ -static inline unsigned int blk_chunk_sectors_left(sector_t offset, - unsigned int chunk_sectors) +static inline unsigned int blk_boundary_sectors_left(sector_t offset, + unsigned int boundary_sectors) { - if (unlikely(!is_power_of_2(chunk_sectors))) - return chunk_sectors - sector_div(offset, chunk_sectors); - return chunk_sectors - (offset & (chunk_sectors - 1)); + if (unlikely(!is_power_of_2(boundary_sectors))) + return boundary_sectors - sector_div(offset, boundary_sectors); + return boundary_sectors - (offset & (boundary_sectors - 1)); } /** @@ -904,7 +921,6 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset, */ static inline struct queue_limits queue_limits_start_update(struct request_queue *q) - __acquires(q->limits_lock) { mutex_lock(&q->limits_lock); return q->limits; @@ -927,26 +943,31 @@ static inline void queue_limits_cancel_update(struct request_queue *q) } /* + * These helpers are for drivers that have sloppy feature negotiation and might + * have to disable DISCARD, WRITE_ZEROES or SECURE_DISCARD from the I/O + * completion handler when the device returned an indicator that the respective + * feature is not actually supported. They are racy and the driver needs to + * cope with that. 
Try to avoid this scheme if you can. + */ +static inline void blk_queue_disable_discard(struct request_queue *q) +{ + q->limits.max_discard_sectors = 0; +} + +static inline void blk_queue_disable_secure_erase(struct request_queue *q) +{ + q->limits.max_secure_erase_sectors = 0; +} + +static inline void blk_queue_disable_write_zeroes(struct request_queue *q) +{ + q->limits.max_write_zeroes_sectors = 0; +} + +/* * Access functions for manipulating queue properties */ -extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); -void blk_queue_max_secure_erase_sectors(struct request_queue *q, - unsigned int max_sectors); -extern void blk_queue_max_discard_sectors(struct request_queue *q, - unsigned int max_discard_sectors); -extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, - unsigned int max_write_same_sectors); -extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); -extern void blk_queue_max_zone_append_sectors(struct request_queue *q, - unsigned int max_zone_append_sectors); -extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size); -extern void blk_queue_alignment_offset(struct request_queue *q, - unsigned int alignment); -void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); -extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); @@ -954,9 +975,7 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); -extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); -extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); struct blk_independent_access_ranges * disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); @@ -1077,6 +1096,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ +#define BLKDEV_ZERO_KILLABLE (1 << 2) /* interruptible by fatal signals */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, @@ -1204,12 +1224,7 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev) static inline unsigned queue_logical_block_size(const struct request_queue *q) { - int retval = 512; - - if (q && q->limits.logical_block_size) - retval = q->limits.logical_block_size; - - return retval; + return q->limits.logical_block_size; } static inline unsigned int bdev_logical_block_size(struct block_device *bdev) @@ -1295,29 +1310,38 @@ static inline bool bdev_nonrot(struct block_device *bdev) static inline bool bdev_synchronous(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_SYNCHRONOUS, - &bdev_get_queue(bdev)->queue_flags); + return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; } static inline bool bdev_stable_writes(struct block_device *bdev) { - return 
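/*
 * Illustrative sketch (not part of the patch) of the scheme described in
 * the comment above: a driver (names hypothetical) that only learns from
 * a failed command that the device cannot do WRITE ZEROES, and drops the
 * capability from its completion handler.
 */
static void example_complete_rq(struct request *rq, blk_status_t status)
{
        if (status == BLK_STS_NOTSUPP && req_op(rq) == REQ_OP_WRITE_ZEROES)
                blk_queue_disable_write_zeroes(rq->q);

        blk_mq_end_request(rq, status);
}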
test_bit(QUEUE_FLAG_STABLE_WRITES, - &bdev_get_queue(bdev)->queue_flags); + struct request_queue *q = bdev_get_queue(bdev); + + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) + return true; + return q->limits.features & BLK_FEAT_STABLE_WRITES; +} + +static inline bool blk_queue_write_cache(struct request_queue *q) +{ + return (q->limits.features & BLK_FEAT_WRITE_CACHE) && + !(q->limits.flags & BLK_FLAG_WRITE_CACHE_DISABLED); } static inline bool bdev_write_cache(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags); + return blk_queue_write_cache(bdev_get_queue(bdev)); } static inline bool bdev_fua(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); + return bdev_get_queue(bdev)->limits.features & BLK_FEAT_FUA; } static inline bool bdev_nowait(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); + return bdev->bd_disk->queue->limits.features & BLK_FEAT_NOWAIT; } static inline bool bdev_is_zoned(struct block_device *bdev) @@ -1359,7 +1383,31 @@ static inline bool bdev_is_zone_start(struct block_device *bdev, static inline int queue_dma_alignment(const struct request_queue *q) { - return q ? q->limits.dma_alignment : 511; + return q->limits.dma_alignment; +} + +static inline unsigned int +queue_atomic_write_unit_max_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_unit_max; +} + +static inline unsigned int +queue_atomic_write_unit_min_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_unit_min; +} + +static inline unsigned int +queue_atomic_write_boundary_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_boundary_sectors << SECTOR_SHIFT; +} + +static inline unsigned int +queue_atomic_write_max_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_max_sectors << SECTOR_SHIFT; } static inline unsigned int bdev_dma_alignment(struct block_device *bdev) @@ -1374,10 +1422,16 @@ static inline bool bdev_iter_is_aligned(struct block_device *bdev, bdev_logical_block_size(bdev) - 1); } +static inline int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) +{ + return lim->dma_alignment | lim->dma_pad_mask; +} + static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { - unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; + unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); + return !(addr & alignment) && !(len & alignment); } @@ -1563,7 +1617,7 @@ int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); -void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); +void bdev_statx(struct path *, struct kstat *, u32); void printk_all_partitions(void); int __init early_lookup_bdev(const char *pathname, dev_t *dev); #else @@ -1581,7 +1635,8 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } -static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +static inline void bdev_statx(struct path *path, struct kstat *stat, + u32 request_mask) { } static inline void printk_all_partitions(void) @@ -1603,6 +1658,27 @@ struct io_comp_batch { void (*complete)(struct io_comp_batch *); }; +static inline bool 
bdev_can_atomic_write(struct block_device *bdev) +{ + struct request_queue *bd_queue = bdev->bd_queue; + struct queue_limits *limits = &bd_queue->limits; + + if (!limits->atomic_write_unit_min) + return false; + + if (bdev_is_partition(bdev)) { + sector_t bd_start_sect = bdev->bd_start_sect; + unsigned int alignment = + max(limits->atomic_write_unit_min, + limits->atomic_write_hw_boundary); + + if (!IS_ALIGNED(bd_start_sect, alignment >> SECTOR_SHIFT)) + return false; + } + + return true; +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ diff --git a/include/linux/bvec.h b/include/linux/bvec.h index bd1e361b351c..f41c7f0ef91e 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -280,4 +280,18 @@ static inline void *bvec_virt(struct bio_vec *bvec) return page_address(bvec->bv_page) + bvec->bv_offset; } +/** + * bvec_phys - return the physical address for a bvec + * @bvec: bvec to return the physical address for + */ +static inline phys_addr_t bvec_phys(const struct bio_vec *bvec) +{ + /* + * Note this open codes page_to_phys because page_to_phys is defined in + * <asm/io.h>, which we don't want to pull in here. If it ever moves to + * a sensible place we should start using it. + */ + return PFN_PHYS(page_to_pfn(bvec->bv_page)) + bvec->bv_offset; +} + #endif /* __LINUX_BVEC_H */ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ea48c861cd36..b36690ca0d3f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -119,7 +119,12 @@ enum { /* * Enable hugetlb accounting for the memory controller. */ - CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), + CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), + + /* + * Enable legacy local pids.events. + */ + CGRP_ROOT_PIDS_LOCAL_EVENTS = (1 << 20), }; /* cftype->flags */ diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index c2d09bc4f976..80c4181e194a 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -63,17 +63,20 @@ #define __free(_name) __cleanup(__free_##_name) -#define __get_and_null_ptr(p) \ - ({ __auto_type __ptr = &(p); \ - __auto_type __val = *__ptr; \ - *__ptr = NULL; __val; }) +#define __get_and_null(p, nullvalue) \ + ({ \ + __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = nullvalue; \ + __val; \ + }) static inline __must_check const volatile void * __must_check_fn(const volatile void *val) { return val; } #define no_free_ptr(p) \ - ((typeof(p)) __must_check_fn(__get_and_null_ptr(p))) + ((typeof(p)) __must_check_fn(__get_and_null(p, NULL))) #define return_ptr(p) return no_free_ptr(p) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 0ad8b550bb4b..d35b677b08fe 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -21,6 +21,7 @@ #include <asm/div64.h> #include <asm/io.h> +struct clocksource_base; struct clocksource; struct module; @@ -50,6 +51,7 @@ struct module; * multiplication * @name: Pointer to clocksource name * @list: List head for registration (internal) + * @freq_khz: Clocksource frequency in khz. * @rating: Rating value for selection (higher is better) * To avoid rating inflation the following * list should give you a guide as to how @@ -70,6 +72,8 @@ struct module; * validate the clocksource from which the snapshot was * taken. 
* @flags: Flags describing special properties + * @base: Hardware abstraction for clock on which a clocksource + * is based * @enable: Optional function to enable the clocksource * @disable: Optional function to disable the clocksource * @suspend: Optional suspend function for the clocksource @@ -107,10 +111,12 @@ struct clocksource { u64 max_cycles; const char *name; struct list_head list; + u32 freq_khz; int rating; enum clocksource_ids id; enum vdso_clock_mode vdso_clock_mode; unsigned long flags; + struct clocksource_base *base; int (*enable)(struct clocksource *cs); void (*disable)(struct clocksource *cs); @@ -306,4 +312,25 @@ static inline unsigned int clocksource_get_max_watchdog_retry(void) void clocksource_verify_percpu(struct clocksource *cs); +/** + * struct clocksource_base - hardware abstraction for clock on which a clocksource + * is based + * @id: Defaults to CSID_GENERIC. The id value is used for conversion + * functions which require that the current clocksource is based + * on a clocksource_base with a particular ID in certain snapshot + * functions to allow callers to validate the clocksource from + * which the snapshot was taken. + * @freq_khz: Nominal frequency of the base clock in kHz + * @offset: Offset between the base clock and the clocksource + * @numerator: Numerator of the clock ratio between base clock and the clocksource + * @denominator: Denominator of the clock ratio between base clock and the clocksource + */ +struct clocksource_base { + enum clocksource_ids id; + u32 freq_khz; + u64 offset; + u32 numerator; + u32 denominator; +}; + #endif /* _LINUX_CLOCKSOURCE_H */ diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h index a4fa3436940c..2bb4d8c2f1b0 100644 --- a/include/linux/clocksource_ids.h +++ b/include/linux/clocksource_ids.h @@ -9,6 +9,7 @@ enum clocksource_ids { CSID_X86_TSC_EARLY, CSID_X86_TSC, CSID_X86_KVM_CLK, + CSID_X86_ART, CSID_MAX, }; diff --git a/include/linux/closure.h b/include/linux/closure.h index 59b8c06b11ff..2af44427107d 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -159,6 +159,7 @@ struct closure { #ifdef CONFIG_DEBUG_CLOSURES #define CLOSURE_MAGIC_DEAD 0xc054dead #define CLOSURE_MAGIC_ALIVE 0xc054a11e +#define CLOSURE_MAGIC_STACK 0xc05451cc unsigned int magic; struct list_head all; @@ -323,12 +324,18 @@ static inline void closure_init_stack(struct closure *cl) { memset(cl, 0, sizeof(struct closure)); atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +#ifdef CONFIG_DEBUG_CLOSURES + cl->magic = CLOSURE_MAGIC_STACK; +#endif } static inline void closure_init_stack_release(struct closure *cl) { memset(cl, 0, sizeof(struct closure)); atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +#ifdef CONFIG_DEBUG_CLOSURES + cl->magic = CLOSURE_MAGIC_STACK; +#endif } /** diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 8c252e073bd8..68a24a3a6979 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -194,9 +194,17 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * This data_race() macro is useful for situations in which data races * should be forgiven. One example is diagnostic code that accesses * shared variables but is not a part of the core synchronization design. + * For example, if accesses to a given variable are protected by a lock, + * except for diagnostic code, then the accesses under the lock should + * be plain C-language accesses and those in the diagnostic code should + * use data_race(). 
This way, KCSAN will complain if buggy lockless + * accesses to that variable are introduced, even if the buggy accesses + * are protected by READ_ONCE() or WRITE_ONCE(). * * This macro *does not* affect normal code generation, but is a hint - * to tooling that data races here are to be ignored. + * to tooling that data races here are to be ignored. If the access must + * be atomic *and* KCSAN should ignore the access, use both data_race() + * and READ_ONCE(), for example, data_race(READ_ONCE(x)). */ #define data_race(expr) \ ({ \ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 7a5785f405b6..89f5c34ce4df 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -27,7 +27,7 @@ * startup callbacks sequentially from CPUHP_OFFLINE + 1 to CPUHP_ONLINE * during a CPU online operation. During a CPU offline operation the * installed teardown callbacks are invoked in the reverse order from - * CPU_ONLINE - 1 down to CPUHP_OFFLINE. + * CPUHP_ONLINE - 1 down to CPUHP_OFFLINE. * * The state space has three sections: PREPARE, STARTING and ONLINE. * @@ -171,6 +171,7 @@ enum cpuhp_state { CPUHP_AP_ARMADA_TIMER_STARTING, CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, + CPUHP_AP_REALTEK_TIMER_STARTING, CPUHP_AP_RISCV_TIMER_STARTING, CPUHP_AP_CLINT_TIMER_STARTING, CPUHP_AP_CSKY_TIMER_STARTING, diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bf53e3894aae..bff956f7b2b9 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -71,7 +71,7 @@ extern const struct qstr dotdot_name; # define DNAME_INLINE_LEN 40 /* 192 bytes */ #else # ifdef CONFIG_SMP -# define DNAME_INLINE_LEN 40 /* 128 bytes */ +# define DNAME_INLINE_LEN 36 /* 128 bytes */ # else # define DNAME_INLINE_LEN 44 /* 128 bytes */ # endif @@ -89,13 +89,18 @@ struct dentry { struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ + /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ /* Ref lookup also touches following */ - struct lockref d_lockref; /* per-dentry lock and refcount */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ void *d_fsdata; /* fs-specific data */ + /* --- cacheline 2 boundary (128 bytes) --- */ + struct lockref d_lockref; /* per-dentry lock and refcount + * keep separate from RCU lookup area if + * possible! + */ union { struct list_head d_lru; /* LRU list */ @@ -278,6 +283,8 @@ static inline unsigned d_count(const struct dentry *dentry) return dentry->d_lockref.count; } +ino_t d_parent_ino(struct dentry *dentry); + /* * helper function for dentry_operations.d_dname() members */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 82b2195efaca..15d28164bbbd 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -358,6 +358,13 @@ struct dm_target { bool discards_supported:1; /* + * Automatically set by dm-core if this target supports + * REQ_OP_ZONE_RESET_ALL. Otherwise, this operation will be emulated + * using REQ_OP_ZONE_RESET. Target drivers must not set this manually. + */ + bool zone_reset_all_supported:1; + + /* * Set if this target requires that discards be split on * 'max_discard_sectors' boundaries. 
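/*
 * Illustrative sketch (not part of the patch) of the pattern the expanded
 * data_race() documentation in compiler.h above describes: plain accesses
 * while the lock is held, data_race() only in the lockless diagnostic
 * read.
 */
static DEFINE_SPINLOCK(example_lock);
static unsigned long example_count;

static void example_inc(void)
{
        spin_lock(&example_lock);
        example_count++;                /* plain access, lock held */
        spin_unlock(&example_lock);
}

static unsigned long example_peek(void)
{
        /* diagnostics only: tell KCSAN this race is intentional */
        return data_race(example_count);
}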
*/ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index bb37ad5cc954..893a1d21dc1c 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -158,6 +158,7 @@ struct fid { #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ +#define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ /** * struct export_operations - for nfsd to communicate with file systems @@ -305,6 +306,7 @@ static inline int exportfs_encode_fid(struct inode *inode, struct fid *fid, extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, + unsigned int flags, int (*acceptable)(void *, struct dentry *), void *context); extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, diff --git a/include/linux/file.h b/include/linux/file.h index 45d0f4800abd..237931f20739 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -97,6 +97,26 @@ extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) +/* + * take_fd() will take care to set @fd to -EBADF ensuring that + * CLASS(get_unused_fd) won't call put_unused_fd(). This makes it + * easier to rely on CLASS(get_unused_fd): + * + * struct file *f; + * + * CLASS(get_unused_fd, fd)(O_CLOEXEC); + * if (fd < 0) + * return fd; + * + * f = dentry_open(&path, O_RDONLY, current_cred()); + * if (IS_ERR(f)) + * return PTR_ERR(fd); + * + * fd_install(fd, f); + * return take_fd(fd); + */ +#define take_fd(fd) __get_and_null(fd, -EBADF) + extern void fd_install(unsigned int fd, struct file *file); int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a..fd34b5755c0b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -125,8 +125,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_EXEC ((__force fmode_t)(1 << 5)) /* File writes are restricted (block device specific) */ #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) +/* File supports atomic writes */ +#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) -/* FMODE_* bits 7 to 8 */ +/* FMODE_* bit 8 */ /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) @@ -317,6 +319,7 @@ struct readahead_control; #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_ATOMIC (__force int) RWF_ATOMIC /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -351,6 +354,7 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ + { IOCB_ATOMIC, "ATOMIC"}, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -660,9 +664,13 @@ struct inode { }; dev_t i_rdev; loff_t i_size; - struct timespec64 __i_atime; - struct timespec64 __i_mtime; - struct timespec64 __i_ctime; /* use inode_*_ctime accessors! 
*/ + time64_t i_atime_sec; + time64_t i_mtime_sec; + time64_t i_ctime_sec; + u32 i_atime_nsec; + u32 i_mtime_nsec; + u32 i_ctime_nsec; + u32 i_generation; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; @@ -719,10 +727,10 @@ struct inode { unsigned i_dir_seq; }; - __u32 i_generation; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ + /* 32-bit hole reserved for expanding i_fsnotify_mask */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif @@ -1538,23 +1546,27 @@ struct timespec64 inode_set_ctime_current(struct inode *inode); static inline time64_t inode_get_atime_sec(const struct inode *inode) { - return inode->__i_atime.tv_sec; + return inode->i_atime_sec; } static inline long inode_get_atime_nsec(const struct inode *inode) { - return inode->__i_atime.tv_nsec; + return inode->i_atime_nsec; } static inline struct timespec64 inode_get_atime(const struct inode *inode) { - return inode->__i_atime; + struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode), + .tv_nsec = inode_get_atime_nsec(inode) }; + + return ts; } static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode, struct timespec64 ts) { - inode->__i_atime = ts; + inode->i_atime_sec = ts.tv_sec; + inode->i_atime_nsec = ts.tv_nsec; return ts; } @@ -1563,28 +1575,32 @@ static inline struct timespec64 inode_set_atime(struct inode *inode, { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; + return inode_set_atime_to_ts(inode, ts); } static inline time64_t inode_get_mtime_sec(const struct inode *inode) { - return inode->__i_mtime.tv_sec; + return inode->i_mtime_sec; } static inline long inode_get_mtime_nsec(const struct inode *inode) { - return inode->__i_mtime.tv_nsec; + return inode->i_mtime_nsec; } static inline struct timespec64 inode_get_mtime(const struct inode *inode) { - return inode->__i_mtime; + struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode), + .tv_nsec = inode_get_mtime_nsec(inode) }; + return ts; } static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode, struct timespec64 ts) { - inode->__i_mtime = ts; + inode->i_mtime_sec = ts.tv_sec; + inode->i_mtime_nsec = ts.tv_nsec; return ts; } @@ -1598,23 +1614,27 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode, static inline time64_t inode_get_ctime_sec(const struct inode *inode) { - return inode->__i_ctime.tv_sec; + return inode->i_ctime_sec; } static inline long inode_get_ctime_nsec(const struct inode *inode) { - return inode->__i_ctime.tv_nsec; + return inode->i_ctime_nsec; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) { - return inode->__i_ctime; + struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode), + .tv_nsec = inode_get_ctime_nsec(inode) }; + + return ts; } static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { - inode->__i_ctime = ts; + inode->i_ctime_sec = ts.tv_sec; + inode->i_ctime_nsec = ts.tv_nsec; return ts; } @@ -1926,6 +1946,8 @@ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, extern bool may_open_dev(const struct path *path); umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); +bool in_group_or_capable(struct mnt_idmap *idmap, + const struct inode *inode, vfsgid_t vfsgid); /* * This is the "filldir" function type, used by readdir() to let @@ -2685,7 +2707,7 @@ static inline struct file *file_clone_open(struct file *file) } extern int 
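/*
 * Illustrative sketch (not part of the patch): filesystem code is not
 * affected by the split into i_{a,m,c}time_{sec,nsec} above as long as it
 * sticks to the accessors, e.g. a typical "update timestamps" helper:
 */
static void example_update_times(struct inode *inode)
{
        struct timespec64 now = inode_set_ctime_current(inode);

        inode_set_mtime_to_ts(inode, now);
        inode_set_atime_to_ts(inode, now);
        mark_inode_dirty_sync(inode);
}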
filp_close(struct file *, fl_owner_t id); -extern struct filename *getname_flags(const char __user *, int, int *); +extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); @@ -3029,7 +3051,12 @@ extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); -extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); +struct inode *iget5_locked(struct super_block *, unsigned long, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *); +struct inode *iget5_locked_rcu(struct super_block *, unsigned long, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *); extern struct inode * iget_locked(struct super_block *, unsigned long); extern struct inode *find_inode_nowait(struct super_block *, unsigned long, @@ -3231,6 +3258,9 @@ extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); +void generic_fill_statx_atomic_writes(struct kstat *stat, + unsigned int unit_min, + unsigned int unit_max); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); @@ -3351,6 +3381,10 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_sb_d_ops(struct super_block *sb); +extern int generic_ci_match(const struct inode *parent, + const struct qstr *name, + const struct qstr *folded_name, + const u8 *de_name, u32 de_name_len); static inline bool sb_has_encoding(const struct super_block *sb) { @@ -3403,7 +3437,8 @@ static inline int iocb_flags(struct file *file) return res; } -static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, + int rw_type) { int kiocb_flags = 0; @@ -3422,6 +3457,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if (flags & RWF_ATOMIC) { + if (rw_type != WRITE) + return -EOPNOTSUPP; + if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; @@ -3436,20 +3477,6 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return 0; } -static inline ino_t parent_ino(struct dentry *dentry) -{ - ino_t res; - - /* - * Don't strictly need d_lock here? If the parent ino could change - * then surely we'd have a deeper race in the caller? 
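/*
 * Userspace sketch (not part of the patch): a single power-fail atomic
 * write issued with pwritev2(2) and RWF_ATOMIC. Per the kiocb_set_rw_flags()
 * checks above the file must have FMODE_CAN_ATOMIC_WRITE set, and since
 * generic_fill_statx_atomic_writes() currently reports only one supported
 * segment, a single iovec is used. Assumes uapi/libc headers that define
 * RWF_ATOMIC; len and pos are assumed to respect the unit_min/unit_max and
 * alignment rules reported via statx().
 */
#define _GNU_SOURCE
#include <sys/uio.h>

static ssize_t example_atomic_pwrite(int fd, const void *buf, size_t len,
                                     off_t pos)
{
        struct iovec iov = {
                .iov_base = (void *)buf,
                .iov_len  = len,
        };

        return pwritev2(fd, &iov, 1, pos, RWF_ATOMIC);
}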
- */ - spin_lock(&dentry->d_lock); - res = dentry->d_parent->d_inode->i_ino; - spin_unlock(&dentry->d_lock); - return res; -} - /* Transaction based IO helpers */ /* @@ -3574,7 +3601,7 @@ static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, - parent_ino(file->f_path.dentry), DT_DIR); + d_parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { @@ -3613,4 +3640,23 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +static inline bool vfs_empty_path(int dfd, const char __user *path) +{ + char c; + + if (dfd < 0) + return false; + + /* We now allow NULL to be used for empty path. */ + if (!path) + return true; + + if (unlikely(get_user(c, path))) + return false; + + return !c; +} + +bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos); + #endif /* _LINUX_FS_H */ diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index d3350979115f..6cf713a7e6c6 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -28,7 +28,7 @@ typedef int fs_param_type(struct p_log *, */ fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64, fs_param_is_enum, fs_param_is_string, fs_param_is_blob, fs_param_is_blockdev, - fs_param_is_path, fs_param_is_fd; + fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid; /* * Specification of the type of value a parameter wants. @@ -57,6 +57,8 @@ struct fs_parse_result { int int_32; /* For spec_s32/spec_enum */ unsigned int uint_32; /* For spec_u32{,_octal,_hex}/spec_enum */ u64 uint_64; /* For spec_u64 */ + kuid_t uid; + kgid_t gid; }; }; @@ -131,6 +133,8 @@ static inline bool fs_validate_description(const char *name, #define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL) #define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL) #define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL) +#define fsparam_uid(NAME, OPT) __fsparam(fs_param_is_uid, NAME, OPT, 0, NULL) +#define fsparam_gid(NAME, OPT) __fsparam(fs_param_is_gid, NAME, OPT, 0, NULL) /* String parameter that allows empty argument */ #define fsparam_string_empty(NAME, OPT) \ diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index bdf7f3eddf0a..4c91a019972b 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -19,6 +19,7 @@ enum fscache_cache_trace; enum fscache_cookie_trace; enum fscache_access_trace; +enum fscache_volume_trace; enum fscache_cache_state { FSCACHE_CACHE_IS_NOT_PRESENT, /* No cache is present for this name */ @@ -97,6 +98,11 @@ extern void fscache_withdraw_cookie(struct fscache_cookie *cookie); extern void fscache_io_error(struct fscache_cache *cache); +extern struct fscache_volume * +fscache_try_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +extern void fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); extern void fscache_end_volume_access(struct fscache_volume *volume, struct fscache_cookie *cookie, enum fscache_access_trace why); diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 4da80e92f804..278620e063ab 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -112,7 +112,13 @@ static 
inline int fsnotify_file(struct file *file, __u32 mask) { const struct path *path; - if (file->f_mode & FMODE_NONOTIFY) + /* + * FMODE_NONOTIFY are fds generated by fanotify itself which should not + * generate new events. We also don't want to generate events for + * FMODE_PATH fds (involves open & close events) as they are just + * handle creation / destruction events and not "real" file events. + */ + if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH)) return 0; path = &file->f_path; diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7abdc0927124..3bb6198d1523 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -50,7 +50,7 @@ struct io_wq_work_list { struct io_wq_work { struct io_wq_work_node list; - unsigned flags; + atomic_t flags; /* place it here instead of io_kiocb as it fills padding and saves 4B */ int cancel_seq; }; @@ -210,14 +210,6 @@ struct io_submit_state { struct blk_plug plug; }; -struct io_ev_fd { - struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; - atomic_t refs; - atomic_t ops; -}; - struct io_alloc_cache { void **entries; unsigned int nr_cached; @@ -372,7 +364,6 @@ struct io_ring_ctx { struct io_restriction restrictions; /* slow path rsrc auxilary data, used by update/register */ - struct io_mapped_ubuf *dummy_ubuf; struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; @@ -405,6 +396,9 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; + struct io_alloc_cache msg_cache; + spinlock_t msg_lock; + #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index e799b1f8d05b..49eef10c8e59 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -9,15 +9,16 @@ #define _MISC_CGROUP_H_ /** - * Types of misc cgroup entries supported by the host. + * enum misc_res_type - Types of misc cgroup entries supported by the host. */ enum misc_res_type { #ifdef CONFIG_KVM_AMD_SEV - /* AMD SEV ASIDs resource */ + /** @MISC_CG_RES_SEV: AMD SEV ASIDs resource */ MISC_CG_RES_SEV, - /* AMD SEV-ES ASIDs resource */ + /** @MISC_CG_RES_SEV_ES: AMD SEV-ES ASIDs resource */ MISC_CG_RES_SEV_ES, #endif + /** @MISC_CG_RES_TYPES: count of enum misc_res_type constants */ MISC_CG_RES_TYPES }; @@ -30,13 +31,16 @@ struct misc_cg; /** * struct misc_res: Per cgroup per misc type resource * @max: Maximum limit on the resource. + * @watermark: Historical maximum usage of the resource. * @usage: Current usage of the resource. * @events: Number of times, the resource limit exceeded. 
*/ struct misc_res { u64 max; + atomic64_t watermark; atomic64_t usage; atomic64_t events; + atomic64_t events_local; }; /** @@ -50,6 +54,8 @@ struct misc_cg { /* misc.events */ struct cgroup_file events_file; + /* misc.events.local */ + struct cgroup_file events_local_file; struct misc_res res[MISC_CG_RES_TYPES]; }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5df52e15f7d6..d45bfb7cf81d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2029,7 +2029,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 pcc_ifa2[0x1]; u8 reserved_at_3f1[0xf]; - u8 reserved_at_400[0x400]; + u8 reserved_at_400[0x40]; + + u8 reserved_at_440[0x8]; + u8 max_num_eqs_24b[0x18]; + u8 reserved_at_460[0x3a0]; }; enum mlx5_ifc_flow_destination_type { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 586a8f0104d7..1dc6248feb83 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1979,8 +1979,9 @@ static inline int subsection_map_index(unsigned long pfn) static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); + struct mem_section_usage *usage = READ_ONCE(ms->usage); - return test_bit(idx, READ_ONCE(ms->usage)->subsection_map); + return usage ? test_bit(idx, usage->subsection_map) : 0; } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) diff --git a/include/linux/namei.h b/include/linux/namei.h index 967aa9ea9f96..8ec8fed3bce8 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -50,13 +50,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; extern int path_pts(struct path *path); -extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); - -static inline int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) -{ - return user_path_at_empty(dfd, name, flags, path, NULL); -} +extern int user_path_at(int, const char __user *, unsigned, struct path *); struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 5601d14e2886..dab6a1734a22 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -42,6 +42,17 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns->ns), \ + struct ipc_namespace *: &(__ns->ns), \ + struct net *: &(__ns->ns), \ + struct pid_namespace *: &(__ns->ns), \ + struct mnt_namespace *: &(__ns->ns), \ + struct time_namespace *: &(__ns->ns), \ + struct user_namespace *: &(__ns->ns), \ + struct uts_namespace *: &(__ns->ns)) + /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. @@ -112,4 +123,6 @@ static inline void get_nsproxy(struct nsproxy *ns) refcount_inc(&ns->count); } +DEFINE_FREE(put_nsproxy, struct nsproxy *, if (_T) put_nsproxy(_T)) + #endif diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 4109f1bd6128..89ea1ebd975a 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -920,6 +920,9 @@ struct nvmet_fc_target_port { * further references to hosthandle. * Entrypoint is Mandatory if the lldd calls nvmet_fc_invalidate_host(). * + * @host_traddr: called by the transport to retrieve the node name and + * port name of the host port address. 
+ * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -975,6 +978,7 @@ struct nvmet_fc_target_template { void (*ls_abort)(struct nvmet_fc_target_port *targetport, void *hosthandle, struct nvmefc_ls_req *lsreq); void (*host_release)(void *hosthandle); + int (*host_traddr)(void *hosthandle, u64 *wwnn, u64 *wwpn); u32 max_hw_queues; u16 max_sgl_segments; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c693ac344ec0..c12a329dd463 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -25,6 +25,9 @@ #define NVME_NSID_ALL 0xffffffff +/* Special NSSR value, 'NVMe' */ +#define NVME_SUBSYS_RESET 0x4E564D65 + enum nvme_subsys_type { /* Referral to another discovery type target subsystem */ NVME_NQN_DISC = 1, @@ -1848,6 +1851,7 @@ enum { /* * Generic Command Status: */ + NVME_SCT_GENERIC = 0x0, NVME_SC_SUCCESS = 0x0, NVME_SC_INVALID_OPCODE = 0x1, NVME_SC_INVALID_FIELD = 0x2, @@ -1895,6 +1899,7 @@ enum { /* * Command Specific Status: */ + NVME_SCT_COMMAND_SPECIFIC = 0x100, NVME_SC_CQ_INVALID = 0x100, NVME_SC_QID_INVALID = 0x101, NVME_SC_QUEUE_SIZE = 0x102, @@ -1968,6 +1973,7 @@ enum { /* * Media and Data Integrity Errors: */ + NVME_SCT_MEDIA_ERROR = 0x200, NVME_SC_WRITE_FAULT = 0x280, NVME_SC_READ_ERROR = 0x281, NVME_SC_GUARD_CHECK = 0x282, @@ -1980,6 +1986,7 @@ enum { /* * Path-related Errors: */ + NVME_SCT_PATH = 0x300, NVME_SC_INTERNAL_PATH_ERROR = 0x300, NVME_SC_ANA_PERSISTENT_LOSS = 0x301, NVME_SC_ANA_INACCESSIBLE = 0x302, @@ -1988,11 +1995,17 @@ enum { NVME_SC_HOST_PATH_ERROR = 0x370, NVME_SC_HOST_ABORTED_CMD = 0x371, - NVME_SC_CRD = 0x1800, - NVME_SC_MORE = 0x2000, - NVME_SC_DNR = 0x4000, + NVME_SC_MASK = 0x00ff, /* Status Code */ + NVME_SCT_MASK = 0x0700, /* Status Code Type */ + NVME_SCT_SC_MASK = NVME_SCT_MASK | NVME_SC_MASK, + + NVME_STATUS_CRD = 0x1800, /* Command Retry Delayed */ + NVME_STATUS_MORE = 0x2000, + NVME_STATUS_DNR = 0x4000, /* Do Not Retry */ }; +#define NVME_SCT(status) ((status) >> 8 & 7) + struct nvme_completion { /* * Used by Admin and Fabrics commands to return data: diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 1acf5bac7f50..8c236c651d1d 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -230,7 +230,13 @@ static inline int folio_ref_dec_return(struct folio *folio) static inline bool page_ref_add_unless(struct page *page, int nr, int u) { - bool ret = atomic_add_unless(&page->_refcount, nr, u); + bool ret = false; + + rcu_read_lock(); + /* avoid writing to the vmemmap area being remapped */ + if (!page_is_fake_head(page) && page_ref_count(page) != u) + ret = atomic_add_unless(&page->_refcount, nr, u); + rcu_read_unlock(); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); @@ -258,54 +264,9 @@ static inline bool folio_try_get(struct folio *folio) return folio_ref_add_unless(folio, 1, 0); } -static inline bool folio_ref_try_add_rcu(struct folio *folio, int count) -{ -#ifdef CONFIG_TINY_RCU - /* - * The caller guarantees the folio will not be freed from interrupt - * context, so (on !SMP) we only need preemption to be disabled - * and TINY_RCU does that for us. - */ -# ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic() && !irqs_disabled()); -# endif - VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio); - folio_ref_add(folio, count); -#else - if (unlikely(!folio_ref_add_unless(folio, count, 0))) { - /* Either the folio has been freed, or will be freed. 
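The NVME_SCT()/NVME_SCT_SC_MASK additions to include/linux/nvme.h above let callers split a completion status word into its Status Code Type and Status Code and strip the CRD/MORE/DNR bits before comparing against NVME_SC_* values. A minimal stand-alone sketch of that decomposition, with the relevant constants copied from the hunk above so it compiles on its own (the sample status word is made up for illustration):

#include <stdio.h>

/* Masks and helper as added in include/linux/nvme.h by this patch. */
#define NVME_SC_MASK        0x00ff                          /* Status Code */
#define NVME_SCT_MASK       0x0700                          /* Status Code Type */
#define NVME_SCT_SC_MASK    (NVME_SCT_MASK | NVME_SC_MASK)
#define NVME_STATUS_DNR     0x4000                          /* Do Not Retry */
#define NVME_SCT(status)    ((status) >> 8 & 7)
#define NVME_SC_READ_ERROR  0x281                           /* pre-existing value, shown as context above */

int main(void)
{
    /* Hypothetical completion status: Read Error (SCT 0x2, SC 0x81) with DNR set. */
    unsigned int status = NVME_STATUS_DNR | NVME_SC_READ_ERROR;

    printf("sct=%u sc=%#x dnr=%d read_error=%d\n",
           NVME_SCT(status), status & NVME_SC_MASK,
           !!(status & NVME_STATUS_DNR),
           (status & NVME_SCT_SC_MASK) == NVME_SC_READ_ERROR);
    return 0;
}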
*/ - return false; - } -#endif - return true; -} - -/** - * folio_try_get_rcu - Attempt to increase the refcount on a folio. - * @folio: The folio. - * - * This is a version of folio_try_get() optimised for non-SMP kernels. - * If you are still holding the rcu_read_lock() after looking up the - * page and know that the page cannot have its refcount decreased to - * zero in interrupt context, you can use this instead of folio_try_get(). - * - * Example users include get_user_pages_fast() (as pages are not unmapped - * from interrupt context) and the page cache lookups (as pages are not - * truncated from interrupt context). We also know that pages are not - * frozen in interrupt context for the purposes of splitting or migration. - * - * You can also use this function if you're holding a lock that prevents - * pages being frozen & removed; eg the i_pages lock for the page cache - * or the mmap_lock or page table lock for page tables. In this case, - * it will always succeed, and you could have used a plain folio_get(), - * but it's sometimes more convenient to have a common function called - * from both locked and RCU-protected contexts. - * - * Return: True if the reference count was successfully incremented. - */ -static inline bool folio_try_get_rcu(struct folio *folio) +static inline bool folio_ref_try_add(struct folio *folio, int count) { - return folio_ref_try_add_rcu(folio, 1); + return folio_ref_add_unless(folio, count, 0); } static inline int page_ref_freeze(struct page *page, int count) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 59f1df0cde5a..a0a026d2d244 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -354,11 +354,18 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) * a good order (that's 1MB if you're using 4kB pages) */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +#define PREFERRED_MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER #else -#define MAX_PAGECACHE_ORDER 8 +#define PREFERRED_MAX_PAGECACHE_ORDER 8 #endif +/* + * xas_split_alloc() does not support arbitrary orders. This implies no + * 512MB THP on ARM64 with 64KB base page size. + */ +#define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) +#define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) + /** * mapping_set_large_folios() - Indicate the file supports large folios. * @mapping: The file. diff --git a/include/linux/path.h b/include/linux/path.h index 475225a03d0d..ca073e70decd 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -24,4 +24,13 @@ static inline void path_put_init(struct path *path) *path = (struct path) { }; } +/* + * Cleanup macro for use with __free(path_put). Avoids dereference and + * copying @path unlike DEFINE_FREE(). path_put() will handle the empty + * path correctly just ensure @path is initialized: + * + * struct path path __free(path_put) = {}; + */ +#define __free_path_put path_put + #endif /* _LINUX_PATH_H */ diff --git a/include/linux/phy.h b/include/linux/phy.h index e6e83304558e..3be430cf3132 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1122,7 +1122,7 @@ struct phy_driver { u8 index, enum led_brightness value); /** - * @led_blink_set: Set a PHY LED brightness. Index indicates + * @led_blink_set: Set a PHY LED blinking. Index indicates * which of the PHYs led should be configured to blink. Delays * are in milliseconds and if both are zero then a sensible * default should be chosen. 
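For the MAX_PAGECACHE_ORDER clamp introduced in include/linux/pagemap.h above, the arithmetic behind the "no 512MB THP on ARM64 with 64KB base page size" remark may be worth spelling out: with the default XA_CHUNK_SHIFT of 6, MAX_XAS_ORDER is 2 * 6 - 1 = 11, while HPAGE_PMD_ORDER on arm64 with 64KB pages is 13 (a 512MB PMD), so the page cache is now capped at order-11 (128MB) folios there. A small stand-alone sketch of the clamp, with both inputs hard-coded as assumptions:

#include <stdio.h>

/* Assumed values: default XA_CHUNK_SHIFT (!CONFIG_BASE_SMALL) and arm64/64KB HPAGE_PMD_ORDER. */
#define XA_CHUNK_SHIFT                  6
#define PREFERRED_MAX_PAGECACHE_ORDER   13      /* HPAGE_PMD_ORDER: 2^13 * 64KB = 512MB */
#define PAGE_SIZE_KB                    64

#define MAX_XAS_ORDER           (XA_CHUNK_SHIFT * 2 - 1)
#define MIN(a, b)               ((a) < (b) ? (a) : (b))
#define MAX_PAGECACHE_ORDER     MIN(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)

int main(void)
{
    printf("MAX_PAGECACHE_ORDER = %d -> largest page cache folio = %d MB\n",
           MAX_PAGECACHE_ORDER,
           (1 << MAX_PAGECACHE_ORDER) * PAGE_SIZE_KB / 1024);
    return 0;
}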
The call should adjust the diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 659d13a7ddaa..ba95c06675e1 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -80,36 +80,35 @@ struct rcu_cblist { * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED | * | | * | Callbacks processed by rcu_core() from softirqs or local | - * | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads, | - * | allowing nocb_timer to be armed. | + * | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads. | * ---------------------------------------------------------------------------- * | * v - * ----------------------------------- - * | | - * v v - * --------------------------------------- ----------------------------------| - * | SEGCBLIST_RCU_CORE | | | SEGCBLIST_RCU_CORE | | - * | SEGCBLIST_LOCKING | | | SEGCBLIST_LOCKING | | - * | SEGCBLIST_OFFLOADED | | | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | SEGCBLIST_KTHREAD_GP | - * | | | | - * | | | | - * | CB kthread woke up and | | GP kthread woke up and | - * | acknowledged SEGCBLIST_OFFLOADED. | | acknowledged SEGCBLIST_OFFLOADED| - * | Processes callbacks concurrently | | | - * | with rcu_core(), holding | | | - * | nocb_lock. | | | - * --------------------------------------- ----------------------------------- - * | | - * ----------------------------------- + * ---------------------------------------------------------------------------- + * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED | + * | + unparked CB kthread | + * | | + * | CB kthread got unparked and processes callbacks concurrently with | + * | rcu_core(), holding nocb_lock. | + * --------------------------------------------------------------------------- + * | + * v + * ---------------------------------------------------------------------------| + * | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | + * | | + * | GP kthread woke up and acknowledged nocb_lock. | + * ---------------------------------------- ----------------------------------- * | * v * |--------------------------------------------------------------------------| - * | SEGCBLIST_LOCKING | | - * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | * | SEGCBLIST_KTHREAD_GP | | - * | SEGCBLIST_KTHREAD_CB | + * | + unparked CB kthread | * | | * | Kthreads handle callbacks holding nocb_lock, local rcu_core() stops | * | handling callbacks. Enable bypass queueing. | @@ -125,8 +124,8 @@ struct rcu_cblist { * |--------------------------------------------------------------------------| * | SEGCBLIST_LOCKING | | * | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | * | | * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | * | ignores callbacks. Bypass enqueue is enabled. | @@ -137,11 +136,11 @@ struct rcu_cblist { * | SEGCBLIST_RCU_CORE | | * | SEGCBLIST_LOCKING | | * | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | * | | * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | - * | handles callbacks concurrently. Bypass enqueue is enabled. | + * | handles callbacks concurrently. Bypass enqueue is disabled. | * | Invoke RCU core so we make sure not to preempt it in the middle with | * | leaving some urgent work unattended within a jiffy. 
| * ---------------------------------------------------------------------------- @@ -150,42 +149,31 @@ struct rcu_cblist { * |--------------------------------------------------------------------------| * | SEGCBLIST_RCU_CORE | | * | SEGCBLIST_LOCKING | | - * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | + * | + unparked CB kthread | * | | * | CB/GP kthreads and local rcu_core() handle callbacks concurrently | - * | holding nocb_lock. Wake up CB and GP kthreads if necessary. Disable | - * | bypass enqueue. | + * | holding nocb_lock. Wake up GP kthread if necessary. | * ---------------------------------------------------------------------------- * | * v - * ----------------------------------- - * | | - * v v - * ---------------------------------------------------------------------------| - * | | | - * | SEGCBLIST_RCU_CORE | | SEGCBLIST_RCU_CORE | | - * | SEGCBLIST_LOCKING | | SEGCBLIST_LOCKING | | - * | SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | - * | | | - * | GP kthread woke up and | CB kthread woke up and | - * | acknowledged the fact that | acknowledged the fact that | - * | SEGCBLIST_OFFLOADED got cleared. | SEGCBLIST_OFFLOADED got cleared. | - * | | The CB kthread goes to sleep | - * | The callbacks from the target CPU | until it ever gets re-offloaded. | - * | will be ignored from the GP kthread | | - * | loop. | | + * |--------------------------------------------------------------------------| + * | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | + * | + unparked CB kthread | + * | | + * | GP kthread woke up and acknowledged the fact that SEGCBLIST_OFFLOADED | + * | got cleared. The callbacks from the target CPU will be ignored from the| + * | GP kthread loop. | * ---------------------------------------------------------------------------- - * | | - * ----------------------------------- * | * v * ---------------------------------------------------------------------------- * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | + * | + parked CB kthread | * | | - * | Callbacks processed by rcu_core() from softirqs or local | - * | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed. | - * | Flush pending nocb_timer. Flush nocb bypass callbacks. | + * | CB kthread is parked. Callbacks processed by rcu_core() from softirqs or | + * | local rcuc kthread, while holding nocb_lock. | * ---------------------------------------------------------------------------- * | * v diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index dfd2399f2cde..be450a3477be 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -209,7 +209,6 @@ void synchronize_rcu_tasks_rude(void); #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); -void exit_tasks_rcu_stop(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_classic_qs(t, preempt) do { } while (0) @@ -218,7 +217,6 @@ void exit_tasks_rcu_finish(void); #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } -static inline void exit_tasks_rcu_stop(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ @@ -421,11 +419,71 @@ static inline void rcu_preempt_sleep_check(void) { } "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) +// See RCU_LOCKDEP_WARN() for an explanation of the double call to +// debug_lockdep_rcu_enabled(). 
+static inline bool lockdep_assert_rcu_helper(bool c) +{ + return debug_lockdep_rcu_enabled() && + (c || !rcu_is_watching() || !rcu_lockdep_current_cpu_online()) && + debug_lockdep_rcu_enabled(); +} + +/** + * lockdep_assert_in_rcu_read_lock - WARN if not protected by rcu_read_lock() + * + * Splats if lockdep is enabled and there is no rcu_read_lock() in effect. + */ +#define lockdep_assert_in_rcu_read_lock() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map))) + +/** + * lockdep_assert_in_rcu_read_lock_bh - WARN if not protected by rcu_read_lock_bh() + * + * Splats if lockdep is enabled and there is no rcu_read_lock_bh() in effect. + * Note that local_bh_disable() and friends do not suffice here, instead an + * actual rcu_read_lock_bh() is required. + */ +#define lockdep_assert_in_rcu_read_lock_bh() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_bh_lock_map))) + +/** + * lockdep_assert_in_rcu_read_lock_sched - WARN if not protected by rcu_read_lock_sched() + * + * Splats if lockdep is enabled and there is no rcu_read_lock_sched() + * in effect. Note that preempt_disable() and friends do not suffice here, + * instead an actual rcu_read_lock_sched() is required. + */ +#define lockdep_assert_in_rcu_read_lock_sched() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_sched_lock_map))) + +/** + * lockdep_assert_in_rcu_reader - WARN if not within some type of RCU reader + * + * Splats if lockdep is enabled and there is no RCU reader of any + * type in effect. Note that regions of code protected by things like + * preempt_disable, local_bh_disable(), and local_irq_disable() all qualify + * as RCU readers. + * + * Note that this will never trigger in PREEMPT_NONE or PREEMPT_VOLUNTARY + * kernels that are not also built with PREEMPT_COUNT. But if you have + * lockdep enabled, you might as well also enable PREEMPT_COUNT. 
+ */ +#define lockdep_assert_in_rcu_reader() \ + WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map) && \ + !lock_is_held(&rcu_bh_lock_map) && \ + !lock_is_held(&rcu_sched_lock_map) && \ + preemptible())) + #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c)) #define rcu_sleep_check() do { } while (0) +#define lockdep_assert_in_rcu_read_lock() do { } while (0) +#define lockdep_assert_in_rcu_read_lock_bh() do { } while (0) +#define lockdep_assert_in_rcu_read_lock_sched() do { } while (0) +#define lockdep_assert_in_rcu_reader() do { } while (0) + #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 61591ac6eab6..a5f4b48fca18 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2192,13 +2192,13 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } extern void sched_set_stop_task(int cpu, struct task_struct *stop); #ifdef CONFIG_MEM_ALLOC_PROFILING -static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) +static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { swap(current->alloc_tag, tag); return tag; } -static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) +static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); diff --git a/include/linux/socket.h b/include/linux/socket.h index 89d16b90370b..c1f16cdab677 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -442,11 +442,14 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, extern int __sys_socket(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); +extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, + int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); +extern int __sys_listen_socket(struct socket *sock, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 98fdef6e28f2..67b9a15a5330 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -533,6 +533,9 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @queue_empty: signal green light for opportunistically skipping the queue * for spi_sync transfers. * @must_async: disable all fast paths in the core + * @defer_optimize_message: set to true if controller cannot pre-optimize messages + * and needs to defer the optimization step until the message is actually + * being transferred * * Each SPI controller can communicate with one or more @spi_device * children. 
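The lockdep_assert_in_rcu_read_lock() family added to include/linux/rcupdate.h above is meant to be placed in functions that rely on the caller holding the corresponding RCU reader. A hypothetical in-kernel sketch (the module boilerplate and demo_cfg structure are invented for illustration; the assertion only has teeth with CONFIG_PROVE_RCU, otherwise it compiles away):

// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_cfg {
    int value;
};

static struct demo_cfg __rcu *demo_cfg_ptr;

/* Callers must be inside rcu_read_lock(); the new assertion documents and checks that. */
static int demo_read_value(void)
{
    struct demo_cfg *cfg;

    lockdep_assert_in_rcu_read_lock();

    cfg = rcu_dereference(demo_cfg_ptr);
    return cfg ? cfg->value : -1;
}

static int __init demo_init(void)
{
    struct demo_cfg *cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);

    if (!cfg)
        return -ENOMEM;
    cfg->value = 42;
    rcu_assign_pointer(demo_cfg_ptr, cfg);

    rcu_read_lock();
    pr_info("demo value: %d\n", demo_read_value());
    rcu_read_unlock();
    return 0;
}

static void __exit demo_exit(void)
{
    struct demo_cfg *cfg = rcu_dereference_protected(demo_cfg_ptr, 1);

    RCU_INIT_POINTER(demo_cfg_ptr, NULL);
    synchronize_rcu();
    kfree(cfg);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");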
These make a small bus, sharing MOSI, MISO and SCK signals @@ -776,6 +779,7 @@ struct spi_controller { /* Flag for enabling opportunistic skipping of the queue in spi_sync */ bool queue_empty; bool must_async; + bool defer_optimize_message; }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 236610e4a8fa..6f6cb5fc1242 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -57,10 +57,45 @@ void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); void synchronize_srcu(struct srcu_struct *ssp); + +#define SRCU_GET_STATE_COMPLETED 0x1 + +/** + * get_completed_synchronize_srcu - Return a pre-completed polled state cookie + * + * Returns a value that poll_state_synchronize_srcu() will always treat + * as a cookie whose grace period has already completed. + */ +static inline unsigned long get_completed_synchronize_srcu(void) +{ + return SRCU_GET_STATE_COMPLETED; +} + unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); +// Maximum number of unsigned long values corresponding to +// not-yet-completed SRCU grace periods. +#define NUM_ACTIVE_SRCU_POLL_OLDSTATE 2 + +/** + * same_state_synchronize_srcu - Are two old-state values identical? + * @oldstate1: First old-state value. + * @oldstate2: Second old-state value. + * + * The two old-state values must have been obtained from either + * get_state_synchronize_srcu(), start_poll_synchronize_srcu(), or + * get_completed_synchronize_srcu(). Returns @true if the two values are + * identical and @false otherwise. This allows structures whose lifetimes + * are tracked by old-state values to push these values to a list header, + * allowing those structures to be slightly smaller. + */ +static inline bool same_state_synchronize_srcu(unsigned long oldstate1, unsigned long oldstate2) +{ + return oldstate1 == oldstate2; +} + #ifdef CONFIG_NEED_SRCU_NMI_SAFE int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); diff --git a/include/linux/stat.h b/include/linux/stat.h index bf92441dbad2..3d900c86981c 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -54,6 +54,9 @@ struct kstat { u32 dio_offset_align; u64 change_cookie; u64 subvol; + u32 atomic_write_unit_min; + u32 atomic_write_unit_max; + u32 atomic_write_segments_max; }; /* These definitions are internal to the kernel for now. Mainly used by nfsd. 
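The atomic-write fields added to struct kstat above are exported to userspace later in this patch through statx() (STATX_WRITE_ATOMIC and the stx_atomic_write_* fields in include/uapi/linux/stat.h). A hedged userspace sketch of querying them; it assumes kernel and libc headers new enough to declare the new mask and struct statx fields:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
    struct statx stx;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <file-or-blockdev>\n", argv[0]);
        return 1;
    }
    if (statx(AT_FDCWD, argv[1], 0, STATX_BASIC_STATS | STATX_WRITE_ATOMIC, &stx)) {
        perror("statx");
        return 1;
    }
    if (!(stx.stx_mask & STATX_WRITE_ATOMIC)) {
        printf("%s: no atomic write information reported\n", argv[1]);
        return 0;
    }
    printf("%s: atomic write unit min=%u max=%u, max segments=%u, supported=%d\n",
           argv[1], stx.stx_atomic_write_unit_min, stx.stx_atomic_write_unit_max,
           stx.stx_atomic_write_segments_max,
           !!(stx.stx_attributes & STATX_ATTR_WRITE_ATOMIC));
    return 0;
}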
*/ diff --git a/include/linux/swap.h b/include/linux/swap.h index bd450023b9a4..e685e93ba354 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -354,7 +354,8 @@ static inline swp_entry_t page_swap_entry(struct page *page) } /* linux/mm/workingset.c */ -bool workingset_test_recent(void *shadow, bool file, bool *workingset); +bool workingset_test_recent(void *shadow, bool file, bool *workingset, + bool flush); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 63424af87bba..fff820c3e93e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -859,9 +859,15 @@ asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource, const struct rlimit64 __user *new_rlim, struct rlimit64 __user *old_rlim); asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags); +#if defined(CONFIG_ARCH_SPLIT_ARG64) +asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags, + unsigned int mask_1, unsigned int mask_2, + int dfd, const char __user * pathname); +#else asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags, u64 mask, int fd, const char __user *pathname); +#endif asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, struct file_handle __user *handle, int __user *mnt_id, int flag); diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 248f4ac95642..2c59fe3efcd4 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -41,18 +41,12 @@ static inline u32 t10_pi_ref_tag(struct request *rq) { unsigned int shift = ilog2(queue_logical_block_size(rq->q)); -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (rq->q->integrity.interval_exp) - shift = rq->q->integrity.interval_exp; -#endif + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + rq->q->limits.integrity.interval_exp) + shift = rq->q->limits.integrity.interval_exp; return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; } -extern const struct blk_integrity_profile t10_pi_type1_crc; -extern const struct blk_integrity_profile t10_pi_type1_ip; -extern const struct blk_integrity_profile t10_pi_type3_crc; -extern const struct blk_integrity_profile t10_pi_type3_ip; - struct crc64_pi_tuple { __be64 guard_tag; __be16 app_tag; @@ -72,14 +66,10 @@ static inline u64 ext_pi_ref_tag(struct request *rq) { unsigned int shift = ilog2(queue_logical_block_size(rq->q)); -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (rq->q->integrity.interval_exp) - shift = rq->q->integrity.interval_exp; -#endif + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + rq->q->limits.integrity.interval_exp) + shift = rq->q->limits.integrity.interval_exp; return lower_48_bits(blk_rq_pos(rq) >> (shift - SECTOR_SHIFT)); } -extern const struct blk_integrity_profile ext_pi_type1_crc64; -extern const struct blk_integrity_profile ext_pi_type3_crc64; - #endif diff --git a/include/linux/tick.h b/include/linux/tick.h index 4924a33700b7..72744638c5b0 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -139,7 +139,6 @@ extern void tick_nohz_irq_exit(void); extern bool tick_nohz_idle_got_tick(void); extern ktime_t tick_nohz_get_next_hrtimer(void); extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next); -extern unsigned long tick_nohz_get_idle_calls(void); extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 
*last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 0ea7823b7f31..fc12a9ba2c88 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -310,12 +310,18 @@ struct system_device_crosststamp { * timekeeping code to verify comparability of two cycle values. * The default ID, CSID_GENERIC, does not identify a specific * clocksource. + * @use_nsecs: @cycles is in nanoseconds. */ struct system_counterval_t { u64 cycles; enum clocksource_ids cs_id; + bool use_nsecs; }; +extern bool ktime_real_to_base_clock(ktime_t treal, + enum clocksource_ids base_id, u64 *cycles); +extern bool timekeeping_clocksource_has_base(enum clocksource_ids id); + /* * Get cross timestamp between system clock and device clock */ diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 21a67dc9efe8..e93ee8d936a9 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -490,9 +490,16 @@ static inline void tpm_buf_append_empty_auth(struct tpm_buf *buf, u32 handle) { } #endif + +static inline struct tpm2_auth *tpm2_chip_auth(struct tpm_chip *chip) +{ #ifdef CONFIG_TCG_TPM2_HMAC + return chip->auth; +#else + return NULL; +#endif +} -int tpm2_start_auth_session(struct tpm_chip *chip); void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf, u32 handle, u8 *name); void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, @@ -504,9 +511,27 @@ static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip, u8 *passphrase, int passphraselen) { - tpm_buf_append_hmac_session(chip, buf, attributes, passphrase, - passphraselen); + struct tpm_header *head; + int offset; + + if (tpm2_chip_auth(chip)) { + tpm_buf_append_hmac_session(chip, buf, attributes, passphrase, passphraselen); + } else { + offset = buf->handles * 4 + TPM_HEADER_SIZE; + head = (struct tpm_header *)buf->data; + + /* + * If the only sessions are optional, the command tag must change to + * TPM2_ST_NO_SESSIONS. 
+ */ + if (tpm_buf_length(buf) == offset) + head->tag = cpu_to_be16(TPM2_ST_NO_SESSIONS); + } } + +#ifdef CONFIG_TCG_TPM2_HMAC + +int tpm2_start_auth_session(struct tpm_chip *chip); void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf); int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf, int rc); @@ -521,56 +546,6 @@ static inline int tpm2_start_auth_session(struct tpm_chip *chip) static inline void tpm2_end_auth_session(struct tpm_chip *chip) { } -static inline void tpm_buf_append_name(struct tpm_chip *chip, - struct tpm_buf *buf, - u32 handle, u8 *name) -{ - tpm_buf_append_u32(buf, handle); - /* count the number of handles in the upper bits of flags */ - buf->handles++; -} -static inline void tpm_buf_append_hmac_session(struct tpm_chip *chip, - struct tpm_buf *buf, - u8 attributes, u8 *passphrase, - int passphraselen) -{ - /* offset tells us where the sessions area begins */ - int offset = buf->handles * 4 + TPM_HEADER_SIZE; - u32 len = 9 + passphraselen; - - if (tpm_buf_length(buf) != offset) { - /* not the first session so update the existing length */ - len += get_unaligned_be32(&buf->data[offset]); - put_unaligned_be32(len, &buf->data[offset]); - } else { - tpm_buf_append_u32(buf, len); - } - /* auth handle */ - tpm_buf_append_u32(buf, TPM2_RS_PW); - /* nonce */ - tpm_buf_append_u16(buf, 0); - /* attributes */ - tpm_buf_append_u8(buf, 0); - /* passphrase */ - tpm_buf_append_u16(buf, passphraselen); - tpm_buf_append(buf, passphrase, passphraselen); -} -static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip, - struct tpm_buf *buf, - u8 attributes, - u8 *passphrase, - int passphraselen) -{ - int offset = buf->handles * 4 + TPM_HEADER_SIZE; - struct tpm_header *head = (struct tpm_header *) buf->data; - - /* - * if the only sessions are optional, the command tag - * must change to TPM2_ST_NO_SESSIONS - */ - if (tpm_buf_length(buf) == offset) - head->tag = cpu_to_be16(TPM2_ST_NO_SESSIONS); -} static inline void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf) { diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index fe932ca3bc8c..e372a88e8c3f 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -324,6 +324,17 @@ enum { * claim to support it. */ HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, + + /* + * When this quirk is set, the reserved bits of Primary/Secondary_PHY + * inside the LE Extended Advertising Report events are discarded. + * This is required for some Apple/Broadcom controllers which + * abuse these reserved bits for unrelated flags. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. 
+ */ + HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 6a9d063e9f47..534c3386e714 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -38,6 +38,8 @@ int __hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout, struct sock *sk); +int hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout); void hci_cmd_sync_init(struct hci_dev *hdev); void hci_cmd_sync_clear(struct hci_dev *hdev); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index cafc664ee531..45ad37adbe32 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -395,7 +395,7 @@ enum ieee80211_bss_change { BSS_CHANGED_HE_OBSS_PD = 1<<28, BSS_CHANGED_HE_BSS_COLOR = 1<<29, BSS_CHANGED_FILS_DISCOVERY = 1<<30, - BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = 1<<31, + BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = BIT_ULL(31), BSS_CHANGED_MLD_VALID_LINKS = BIT_ULL(33), BSS_CHANGED_MLD_TTLM = BIT_ULL(34), diff --git a/include/net/tcx.h b/include/net/tcx.h index 72a3e75e539f..5ce0ce9e0c02 100644 --- a/include/net/tcx.h +++ b/include/net/tcx.h @@ -13,7 +13,7 @@ struct mini_Qdisc; struct tcx_entry { struct mini_Qdisc __rcu *miniq; struct bpf_mprog_bundle bundle; - bool miniq_active; + u32 miniq_active; struct rcu_head rcu; }; @@ -125,11 +125,16 @@ static inline void tcx_skeys_dec(bool ingress) tcx_dec(); } -static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry, - const bool active) +static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); - tcx_entry(entry)->miniq_active = active; + tcx_entry(entry)->miniq_active++; +} + +static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry) +{ + ASSERT_RTNL(); + tcx_entry(entry)->miniq_active--; } static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h index 843106e1109f..70e1262b2e20 100644 --- a/include/scsi/scsi_proto.h +++ b/include/scsi/scsi_proto.h @@ -120,6 +120,7 @@ #define WRITE_SAME_16 0x93 #define ZBC_OUT 0x94 #define ZBC_IN 0x95 +#define WRITE_ATOMIC_16 0x9c #define SERVICE_ACTION_BIDIRECTIONAL 0x9d #define SERVICE_ACTION_IN_16 0x9e #define SERVICE_ACTION_OUT_16 0x9f diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 0e128ad51460..1527d5d45e01 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -9,9 +9,17 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/tracepoint.h> +#include <uapi/linux/ioprio.h> #define RWBS_LEN 8 +#define IOPRIO_CLASS_STRINGS \ + { IOPRIO_CLASS_NONE, "none" }, \ + { IOPRIO_CLASS_RT, "rt" }, \ + { IOPRIO_CLASS_BE, "be" }, \ + { IOPRIO_CLASS_IDLE, "idle" }, \ + { IOPRIO_CLASS_INVALID, "invalid"} + #ifdef CONFIG_BUFFER_HEAD DECLARE_EVENT_CLASS(block_buffer, @@ -82,6 +90,7 @@ TRACE_EVENT(block_rq_requeue, __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) + __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), @@ -90,16 +99,20 @@ TRACE_EVENT(block_rq_requeue, __entry->dev = rq->q->disk ? 
disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); + __entry->ioprio = rq->ioprio; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), - TP_printk("%d,%d %s (%s) %llu + %u [%d]", + TP_printk("%d,%d %s (%s) %llu + %u %s,%u,%u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), - (unsigned long long)__entry->sector, - __entry->nr_sector, 0) + (unsigned long long)__entry->sector, __entry->nr_sector, + __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), + IOPRIO_CLASS_STRINGS), + IOPRIO_PRIO_HINT(__entry->ioprio), + IOPRIO_PRIO_LEVEL(__entry->ioprio), 0) ); DECLARE_EVENT_CLASS(block_rq_completion, @@ -113,6 +126,7 @@ DECLARE_EVENT_CLASS(block_rq_completion, __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( int , error ) + __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), @@ -122,16 +136,20 @@ DECLARE_EVENT_CLASS(block_rq_completion, __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); + __entry->ioprio = rq->ioprio; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), - TP_printk("%d,%d %s (%s) %llu + %u [%d]", + TP_printk("%d,%d %s (%s) %llu + %u %s,%u,%u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->error) + (unsigned long long)__entry->sector, __entry->nr_sector, + __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), + IOPRIO_CLASS_STRINGS), + IOPRIO_PRIO_HINT(__entry->ioprio), + IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->error) ); /** @@ -180,6 +198,7 @@ DECLARE_EVENT_CLASS(block_rq, __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned int, bytes ) + __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) __dynamic_array( char, cmd, 1 ) @@ -190,17 +209,21 @@ DECLARE_EVENT_CLASS(block_rq, __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); + __entry->ioprio = rq->ioprio; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), - TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", + TP_printk("%d,%d %s %u (%s) %llu + %u %s,%u,%u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, __entry->nr_sector, + __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), + IOPRIO_CLASS_STRINGS), + IOPRIO_PRIO_HINT(__entry->ioprio), + IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm) ); /** diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index fadf406b5260..c978fa2893a5 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2556,9 +2556,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count, TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, - TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr), + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr, + u64 last_root_id, u64 last_ino), - TP_ARGS(fs_info, nr_to_scan, nr), + TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino), TP_STRUCT__entry_btrfs( __field( long, nr_to_scan ) @@ -2570,8 +2571,8 @@ 
TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TP_fast_assign_btrfs(fs_info, __entry->nr_to_scan = nr_to_scan; __entry->nr = nr; - __entry->last_root_id = fs_info->extent_map_shrinker_last_root; - __entry->last_ino = fs_info->extent_map_shrinker_last_ino; + __entry->last_root_id = last_root_id; + __entry->last_ino = last_ino; ), TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", @@ -2581,9 +2582,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, - TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr), + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr, + u64 last_root_id, u64 last_ino), - TP_ARGS(fs_info, nr_dropped, nr), + TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino), TP_STRUCT__entry_btrfs( __field( long, nr_dropped ) @@ -2595,8 +2597,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, TP_fast_assign_btrfs(fs_info, __entry->nr_dropped = nr_dropped; __entry->nr = nr; - __entry->last_root_id = fs_info->extent_map_shrinker_last_root; - __entry->last_ino = fs_info->extent_map_shrinker_last_ino; + __entry->last_root_id = last_root_id; + __entry->last_ino = last_ino; ), TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", diff --git a/include/trace/events/fscache.h b/include/trace/events/fscache.h index a6190aa1b406..f1a73aa83fbb 100644 --- a/include/trace/events/fscache.h +++ b/include/trace/events/fscache.h @@ -35,12 +35,14 @@ enum fscache_volume_trace { fscache_volume_get_cookie, fscache_volume_get_create_work, fscache_volume_get_hash_collision, + fscache_volume_get_withdraw, fscache_volume_free, fscache_volume_new_acquire, fscache_volume_put_cookie, fscache_volume_put_create_work, fscache_volume_put_hash_collision, fscache_volume_put_relinquish, + fscache_volume_put_withdraw, fscache_volume_see_create_work, fscache_volume_see_hash_wake, fscache_volume_wait_create_work, @@ -120,12 +122,14 @@ enum fscache_access_trace { EM(fscache_volume_get_cookie, "GET cook ") \ EM(fscache_volume_get_create_work, "GET creat") \ EM(fscache_volume_get_hash_collision, "GET hcoll") \ + EM(fscache_volume_get_withdraw, "GET withd") \ EM(fscache_volume_free, "FREE ") \ EM(fscache_volume_new_acquire, "NEW acq ") \ EM(fscache_volume_put_cookie, "PUT cook ") \ EM(fscache_volume_put_create_work, "PUT creat") \ EM(fscache_volume_put_hash_collision, "PUT hcoll") \ EM(fscache_volume_put_relinquish, "PUT relnq") \ + EM(fscache_volume_put_withdraw, "PUT withd") \ EM(fscache_volume_see_create_work, "SEE creat") \ EM(fscache_volume_see_hash_wake, "SEE hwake") \ E_(fscache_volume_wait_create_work, "WAIT crea") diff --git a/include/trace/events/scsi.h b/include/trace/events/scsi.h index 8e2d9b1b0e77..05f1945ed204 100644 --- a/include/trace/events/scsi.h +++ b/include/trace/events/scsi.h @@ -102,6 +102,7 @@ scsi_opcode_name(WRITE_32), \ scsi_opcode_name(WRITE_SAME_32), \ scsi_opcode_name(ATA_16), \ + scsi_opcode_name(WRITE_ATOMIC_16), \ scsi_opcode_name(ATA_12)) #define scsi_hostbyte_name(result) { result, #result } diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h index aaed8e12ad0b..926b1deb1116 100644 --- a/include/uapi/drm/panthor_drm.h +++ b/include/uapi/drm/panthor_drm.h @@ -802,6 +802,9 @@ struct drm_panthor_queue_submit { * Must be 64-bit/8-byte aligned (the size of a CS instruction) * * Can be zero if stream_addr is zero too. + * + * When the stream size is zero, the queue submit serves as a + * synchronization point. 
*/ __u32 stream_size; @@ -822,6 +825,8 @@ struct drm_panthor_queue_submit { * ensure the GPU doesn't get garbage when reading the indirect command * stream buffers. If you want the cache flush to happen * unconditionally, pass a zero here. + * + * Ignored when stream_size is zero. */ __u32 latest_flush; diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 1446c3bae515..d425b83181df 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -776,7 +776,13 @@ struct drm_xe_gem_create { #define DRM_XE_GEM_CPU_CACHING_WC 2 /** * @cpu_caching: The CPU caching mode to select for this object. If - * mmaping the object the mode selected here will also be used. + * mmaping the object the mode selected here will also be used. The + * exception is when mapping system memory (including data evicted + * to system) on discrete GPUs. The caching mode selected will + * then be overridden to DRM_XE_GEM_CPU_CACHING_WB, and coherency + * between GPU- and CPU is guaranteed. The caching mode of + * existing CPU-mappings will be updated transparently to + * user-space clients. */ __u16 cpu_caching; /** @pad: MBZ */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 45e4e64fd664..191a7e88a8ab 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -329,9 +329,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO negation of O_APPEND */ #define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) +/* Atomic Write */ +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 994bf7af0efe..2aaf7ee256ac 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -257,6 +257,8 @@ enum io_uring_op { IORING_OP_FUTEX_WAITV, IORING_OP_FIXED_FD_INSTALL, IORING_OP_FTRUNCATE, + IORING_OP_BIND, + IORING_OP_LISTEN, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index ad5478dbad00..225bc366ffcb 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -154,7 +154,7 @@ struct mount_attr { */ struct statmount { __u32 size; /* Total size, including strings */ - __u32 __spare1; + __u32 mnt_opts; /* [str] Mount options of the mount */ __u64 mask; /* What results were written */ __u32 sb_dev_major; /* Device ID */ __u32 sb_dev_minor; @@ -172,7 +172,8 @@ struct statmount { __u64 propagate_from; /* Propagation from in current namespace */ __u32 mnt_root; /* [str] Root of mount relative to root of fs */ __u32 mnt_point; /* [str] Mountpoint relative to current root */ - __u64 __spare2[50]; + __u64 mnt_ns_id; /* ID of the mount namespace */ + __u64 __spare2[49]; char str[]; /* Variable size part containing strings */ }; @@ -188,10 +189,12 @@ struct mnt_id_req { __u32 spare; __u64 mnt_id; __u64 param; + __u64 mnt_ns_id; }; /* List of all mnt_id_req versions. 
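The RWF_ATOMIC flag added to include/uapi/linux/fs.h above requests an untorn (all-or-nothing) write; per the kiocb_set_rw_flags() change earlier in this patch it is rejected with EOPNOTSUPP for reads and for files not opened with atomic write support. A hedged userspace sketch using pwritev2(); it assumes the target advertises atomic writes (see the statx sketch above), and with current filesystem and block-device support the I/O generally has to be O_DIRECT, power-of-two sized within the advertised unit range and naturally aligned. Note that it overwrites the start of the given file:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC 0x00000040   /* value from include/uapi/linux/fs.h above */
#endif

int main(int argc, char **argv)
{
    size_t len = 4096;          /* adjust to the advertised unit_min..unit_max range */
    struct iovec iov;
    void *buf;
    ssize_t ret;
    int fd;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <file-or-blockdev> (first %zu bytes are overwritten)\n",
                argv[0], len);
        return 1;
    }
    fd = open(argv[1], O_WRONLY | O_DIRECT);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (posix_memalign(&buf, len, len)) {       /* O_DIRECT needs an aligned buffer */
        perror("posix_memalign");
        return 1;
    }
    memset(buf, 0xab, len);
    iov.iov_base = buf;
    iov.iov_len = len;

    ret = pwritev2(fd, &iov, 1, 0, RWF_ATOMIC); /* all-or-nothing write at offset 0 */
    if (ret < 0)
        perror("pwritev2(RWF_ATOMIC)");         /* e.g. EOPNOTSUPP or EINVAL if unsupported */
    else
        printf("atomically wrote %zd bytes\n", ret);

    free(buf);
    close(fd);
    return 0;
}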
*/ #define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ +#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */ /* * @mask bits for statmount(2) @@ -202,10 +205,13 @@ struct mnt_id_req { #define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ #define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ +#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ +#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ /* * Special @mnt_id values that can be passed to listmount */ #define LSMT_ROOT 0xffffffffffffffff /* root mount */ +#define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index a0c8552b64ee..b133211331f6 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -15,5 +15,15 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) +/* Get the id for a mount namespace */ +#define NS_GET_MNTNS_ID _IO(NSIO, 0x5) +/* Translate pid from target pid namespace into the caller's pid namespace. */ +#define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) +/* Return thread-group leader id of pid in the callers pid namespace. */ +#define NS_GET_TGID_FROM_PIDNS _IOR(NSIO, 0x7, int) +/* Translate pid from caller's pid namespace into a target pid namespace. */ +#define NS_GET_PID_IN_PIDNS _IOR(NSIO, 0x8, int) +/* Return thread-group leader id of pid in the target pid namespace. */ +#define NS_GET_TGID_IN_PIDNS _IOR(NSIO, 0x9, int) #endif /* __LINUX_NSFS_H */ diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 72ec000a97cd..565fc0629fff 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/fcntl.h> +#include <linux/ioctl.h> /* Flags for pidfd_open(). 
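The statmount() extensions above (the mnt_opts string, mnt_ns_id and MNT_ID_REQ_SIZE_VER1) build on a syscall that still has no libc wrapper, so it is invoked via syscall(). A hedged sketch that reports the mount options and namespace ID of the mount containing "/"; the unique mount ID comes from statx(STATX_MNT_ID_UNIQUE), and the struct layout, STATMOUNT_* flags and the syscall-number fallback of 457 are assumptions that only hold with sufficiently new kernel headers:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_statmount
#define SYS_statmount 457       /* assumed: new syscalls use the same number on all architectures */
#endif

int main(void)
{
    size_t bufsize = 64 * 1024;
    struct statmount *sm = malloc(bufsize);
    struct mnt_id_req req;
    struct statx stx;

    if (!sm)
        return 1;

    /* 64-bit unique mount ID of the mount containing "/". */
    if (statx(AT_FDCWD, "/", 0, STATX_MNT_ID_UNIQUE, &stx)) {
        perror("statx");
        return 1;
    }

    memset(&req, 0, sizeof(req));
    req.size = MNT_ID_REQ_SIZE_VER0;    /* the request-side mnt_ns_id is not used here */
    req.mnt_id = stx.stx_mnt_id;
    req.param = STATMOUNT_MNT_OPTS | STATMOUNT_MNT_NS_ID;

    if (syscall(SYS_statmount, &req, sm, bufsize, 0)) {
        perror("statmount");
        return 1;
    }
    if (sm->mask & STATMOUNT_MNT_NS_ID)
        printf("mount namespace id: %llu\n", (unsigned long long)sm->mnt_ns_id);
    if (sm->mask & STATMOUNT_MNT_OPTS)
        printf("mount options: %s\n", sm->str + sm->mnt_opts);  /* [str] offset into str[] */

    free(sm);
    return 0;
}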
*/ #define PIDFD_NONBLOCK O_NONBLOCK @@ -15,4 +16,17 @@ #define PIDFD_SIGNAL_THREAD_GROUP (1UL << 1) #define PIDFD_SIGNAL_PROCESS_GROUP (1UL << 2) +#define PIDFS_IOCTL_MAGIC 0xFF + +#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1) +#define PIDFD_GET_IPC_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 2) +#define PIDFD_GET_MNT_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 3) +#define PIDFD_GET_NET_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 4) +#define PIDFD_GET_PID_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 5) +#define PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 6) +#define PIDFD_GET_TIME_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 7) +#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8) +#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9) +#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10) + #endif /* _UAPI_LINUX_PIDFD_H */ diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 95770941ee2c..887a25286441 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -128,7 +128,13 @@ struct statx { __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ /* 0xa0 */ __u64 stx_subvol; /* Subvolume identifier */ - __u64 __spare3[11]; /* Spare space for future expansion */ + __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */ + /* 0xb0 */ + __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */ + __u32 __spare1[1]; + /* 0xb8 */ + __u64 __spare3[9]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -157,6 +163,7 @@ struct statx { #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ +#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -192,6 +199,7 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ +#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h index bd1066754220..c102ef35d11e 100644 --- a/include/uapi/linux/trace_mmap.h +++ b/include/uapi/linux/trace_mmap.h @@ -43,6 +43,6 @@ struct trace_buffer_meta { __u64 Reserved2; }; -#define TRACE_MMAP_IOCTL_GET_READER _IO('T', 0x1) +#define TRACE_MMAP_IOCTL_GET_READER _IO('R', 0x20) #endif /* _TRACE_MMAP_H_ */ diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h index f33d914d8f46..91583690bddc 100644 --- a/include/uapi/misc/fastrpc.h +++ b/include/uapi/misc/fastrpc.h @@ -8,11 +8,14 @@ #define FASTRPC_IOCTL_ALLOC_DMA_BUFF _IOWR('R', 1, struct fastrpc_alloc_dma_buf) #define FASTRPC_IOCTL_FREE_DMA_BUFF _IOWR('R', 2, __u32) #define FASTRPC_IOCTL_INVOKE _IOWR('R', 3, struct fastrpc_invoke) +/* This ioctl is only supported with secure device nodes */ #define FASTRPC_IOCTL_INIT_ATTACH _IO('R', 4) #define FASTRPC_IOCTL_INIT_CREATE _IOWR('R', 5, struct fastrpc_init_create) #define FASTRPC_IOCTL_MMAP _IOWR('R', 6, struct fastrpc_req_mmap) #define FASTRPC_IOCTL_MUNMAP _IOWR('R', 7, struct fastrpc_req_munmap) +/* This ioctl is only supported with secure device nodes */ #define FASTRPC_IOCTL_INIT_ATTACH_SNS 
_IO('R', 8) +/* This ioctl is only supported with secure device nodes */ #define FASTRPC_IOCTL_INIT_CREATE_STATIC _IOWR('R', 9, struct fastrpc_init_create_static) #define FASTRPC_IOCTL_MEM_MAP _IOWR('R', 10, struct fastrpc_mem_map) #define FASTRPC_IOCTL_MEM_UNMAP _IOWR('R', 11, struct fastrpc_mem_unmap) diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index d04d394db064..7647e0946f50 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -77,6 +77,10 @@ struct vdso_timestamp { * vdso_data will be accessed by 64 bit and compat code at the same time * so we should be careful before modifying this structure. * + * The ordering of the struct members is optimized to have fast access to the + * often required struct members which are related to CLOCK_REALTIME and + * CLOCK_MONOTONIC. This information is stored in the first cache lines. + * * @basetime is used to store the base time for the system wide time getter * VVAR page. * diff --git a/io_uring/Makefile b/io_uring/Makefile index fc1b23c524e8..61923e11c767 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,9 +4,9 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ tctx.o filetable.o rw.o net.o poll.o \ - uring_cmd.o openclose.o sqpoll.o \ - xattr.o nop.o fs.o splice.o sync.o \ - msg_ring.o advise.o openclose.o \ + eventfd.o uring_cmd.o openclose.o \ + sqpoll.o xattr.o nop.o fs.o splice.o \ + sync.o msg_ring.o advise.o openclose.o \ epoll.o statx.o timeout.o fdinfo.o \ cancel.o waitid.o register.o \ truncate.o memmap.o diff --git a/io_uring/advise.c b/io_uring/advise.c index 7085804c513c..cb7b881665e5 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -17,14 +17,14 @@ struct io_fadvise { struct file *file; u64 offset; - u32 len; + u64 len; u32 advice; }; struct io_madvise { struct file *file; u64 addr; - u32 len; + u64 len; u32 advice; }; @@ -33,11 +33,13 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); - if (sqe->buf_index || sqe->off || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; ma->addr = READ_ONCE(sqe->addr); - ma->len = READ_ONCE(sqe->len); + ma->len = READ_ONCE(sqe->off); + if (!ma->len) + ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); req->flags |= REQ_F_FORCE_ASYNC; return 0; @@ -78,11 +80,13 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); - if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; fa->offset = READ_ONCE(sqe->off); - fa->len = READ_ONCE(sqe->len); + fa->len = READ_ONCE(sqe->addr); + if (!fa->len) + fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); if (io_fadvise_force_async(fa)) req->flags |= REQ_F_FORCE_ASYNC; diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c new file mode 100644 index 000000000000..b9384503a2b7 --- /dev/null +++ b/io_uring/eventfd.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/eventfd.h> +#include <linux/eventpoll.h> +#include <linux/io_uring.h> +#include <linux/io_uring_types.h> + +#include "io-wq.h" +#include "eventfd.h" + +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct 
rcu_head rcu; + atomic_t refs; + atomic_t ops; +}; + +enum { + IO_EVENTFD_OP_SIGNAL_BIT, +}; + +static void io_eventfd_free(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); +} + +static void io_eventfd_do_signal(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + + if (atomic_dec_and_test(&ev_fd->refs)) + io_eventfd_free(rcu); +} + +void io_eventfd_signal(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd = NULL; + + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; + + guard(rcu)(); + + /* + * rcu_dereference ctx->io_ev_fd once and use it both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists in case an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ + if (unlikely(!ev_fd)) + return; + if (!atomic_inc_not_zero(&ev_fd->refs)) + return; + if (ev_fd->eventfd_async && !io_wq_current_is_worker()) + goto out; + + if (likely(eventfd_signal_allowed())) { + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + } else { + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { + call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); + return; + } + } +out: + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); +} + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx) +{ + bool skip; + + spin_lock(&ctx->completion_lock); + + /* + * Eventfd should only get triggered when at least one event has been + * posted. Some applications rely on the eventfd notification count + * only changing IFF a new CQE has been added to the CQ ring. There's + * no dependency on a 1:1 relationship between how many times this + * function is called (and hence the eventfd count) and number of CQEs + * posted to the CQ ring.
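
The refcounted io_ev_fd above is the kernel half of eventfd-based completion notification; io_eventfd_flush_signal() is what preserves the "count only moves when a new CQE was posted" behaviour the comment describes. For context, the userspace half is an ordinary eventfd handed to the ring with IORING_REGISTER_EVENTFD (liburing wraps this as io_uring_register_eventfd()); a rough sketch using the raw register syscall:

#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* ring_fd is an existing io_uring instance; returns the eventfd to poll/read. */
static int attach_cq_eventfd(int ring_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;
	/* After this, posted CQEs bump the eventfd counter. */
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
		    &efd, 1) < 0) {
		close(efd);
		return -1;
	}
	return efd;
}
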
+ */ + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + if (skip) + return; + + io_eventfd_signal(ctx); +} + +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) +{ + struct io_ev_fd *ev_fd; + __s32 __user *fds = arg; + int fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) + return -EBUSY; + + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; + + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; + + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); + return ret; + } + + spin_lock(&ctx->completion_lock); + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + atomic_set(&ev_fd->refs, 1); + atomic_set(&ev_fd->ops, 0); + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + return 0; +} + +int io_eventfd_unregister(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + if (atomic_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); + return 0; + } + + return -ENXIO; +} diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h new file mode 100644 index 000000000000..d394f49c6321 --- /dev/null +++ b/io_uring/eventfd.h @@ -0,0 +1,8 @@ + +struct io_ring_ctx; +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async); +int io_eventfd_unregister(struct io_ring_ctx *ctx); + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx); diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 7d3316fe9bfc..f1e7c670add8 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -23,6 +23,7 @@ #include "io_uring.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) +#define WORKER_INIT_LIMIT 3 enum { IO_WORKER_F_UP = 0, /* up and active */ @@ -58,6 +59,7 @@ struct io_worker { unsigned long create_state; struct callback_head create_work; + int init_retries; union { struct rcu_head rcu; @@ -159,7 +161,7 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, struct io_wq_work *work) { - return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); } static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) @@ -451,7 +453,7 @@ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) static inline unsigned int io_get_work_hash(struct io_wq_work *work) { - return work->flags >> IO_WQ_HASH_SHIFT; + return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; } static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) @@ -592,8 +594,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct, next_hashed = wq_next_work(work); - if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) - work->flags |= IO_WQ_WORK_CANCEL; + if (do_kill && + (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -744,7 +747,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void 
*data) return true; } -static inline bool io_should_retry_thread(long err) +static inline bool io_should_retry_thread(struct io_worker *worker, long err) { /* * Prevent perpetual task_work retry, if the task (or its group) is @@ -752,6 +755,8 @@ static inline bool io_should_retry_thread(long err) */ if (fatal_signal_pending(current)) return false; + if (worker->init_retries++ >= WORKER_INIT_LIMIT) + return false; switch (err) { case -EAGAIN: @@ -778,7 +783,7 @@ static void create_worker_cont(struct callback_head *cb) io_init_new_worker(wq, worker, tsk); io_worker_release(worker); return; - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); @@ -845,7 +850,7 @@ fail: tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { io_init_new_worker(wq, worker, tsk); - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { kfree(worker); goto fail; } else { @@ -891,7 +896,7 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); work = wq->free_work(work); } while (work); @@ -926,7 +931,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); - unsigned long work_flags = work->flags; + unsigned int work_flags = atomic_read(&work->flags); struct io_cb_cancel_data match = { .fn = io_wq_work_match_item, .data = work, @@ -939,7 +944,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) * been marked as one that should not get executed, cancel it here. 
*/ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || - (work->flags & IO_WQ_WORK_CANCEL)) { + (work_flags & IO_WQ_WORK_CANCEL)) { io_run_cancel(work, wq); return; } @@ -982,7 +987,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) unsigned int bit; bit = hash_ptr(val, IO_WQ_HASH_ORDER); - work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); + atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags); } static bool __io_wq_worker_cancel(struct io_worker *worker, @@ -990,7 +995,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, struct io_wq_work *work) { if (work && match->fn(work, match->data)) { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); __set_notify_signal(worker->task); return true; } diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 2b2a6406dd8e..b3b004a7b625 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -56,7 +56,7 @@ bool io_wq_worker_stopped(void); static inline bool io_wq_is_hashed(struct io_wq_work *work) { - return work->flags & IO_WQ_WORK_HASHED; + return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; } typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c326e2127dd4..8e6faa942a6f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,12 +95,14 @@ #include "futex.h" #include "napi.h" #include "uring_cmd.h" +#include "msg_ring.h" #include "memmap.h" #include "timeout.h" #include "poll.h" #include "rw.h" #include "alloc_cache.h" +#include "eventfd.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -314,6 +316,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) sizeof(struct io_async_rw)); ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, sizeof(struct uring_cache)); + spin_lock_init(&ctx->msg_lock); + ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_kiocb)); ret |= io_futex_cache_init(ctx); if (ret) goto err; @@ -350,6 +355,7 @@ err: io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); @@ -461,9 +467,9 @@ static void io_prep_async_work(struct io_kiocb *req) } req->work.list.next = NULL; - req->work.flags = 0; + atomic_set(&req->work.flags, 0); if (req->flags & REQ_F_FORCE_ASYNC) - req->work.flags |= IO_WQ_WORK_CONCURRENT; + atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(req->file); @@ -479,7 +485,7 @@ static void io_prep_async_work(struct io_kiocb *req) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) - req->work.flags |= IO_WQ_WORK_UNBOUND; + atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags); } } @@ -519,7 +525,7 @@ static void io_queue_iowq(struct io_kiocb *req) * worker for it). 
*/ if (WARN_ON_ONCE(!same_thread_group(req->task, current))) - req->work.flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); @@ -541,84 +547,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -void io_eventfd_ops(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - int ops = atomic_xchg(&ev_fd->ops, 0); - - if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - - /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback - * ordering in a race but if references are 0 we know we have to free - * it regardless. - */ - if (atomic_dec_and_test(&ev_fd->refs)) { - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); - } -} - -static void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd = NULL; - - rcu_read_lock(); - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists incase an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock. - */ - if (unlikely(!ev_fd)) - goto out; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - goto out; - if (ev_fd->eventfd_async && !io_wq_current_is_worker()) - goto out; - - if (likely(eventfd_signal_allowed())) { - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - } else { - atomic_inc(&ev_fd->refs); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) - call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); - else - atomic_dec(&ev_fd->refs); - } - -out: - rcu_read_unlock(); -} - -static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - bool skip; - - spin_lock(&ctx->completion_lock); - - /* - * Eventfd should only get triggered when at least one event has been - * posted. Some applications rely on the eventfd notification count - * only changing IFF a new CQE has been added to the CQ ring. There's - * no depedency on 1:1 relationship between how many times this - * function is called (and hence the eventfd count) and number of CQEs - * posted to the CQ ring. - */ - skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - if (skip) - return; - - io_eventfd_signal(ctx); -} - void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) @@ -878,20 +806,43 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags) { bool filled; - io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (!filled) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + return filled; +} + +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + bool filled; + + io_cq_lock(ctx); + filled = __io_post_aux_cqe(ctx, user_data, res, cflags); io_cq_unlock_post(ctx); return filled; } /* + * Must be called from inline task_work so we now a flush will happen later, + * and obviously with ctx->uring_lock held (tw always has that). 
+ */ +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + spin_lock(&ctx->completion_lock); + io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + spin_unlock(&ctx->completion_lock); + } + ctx->submit_state.cq_flush = true; +} + +/* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ @@ -1175,9 +1126,10 @@ void tctx_task_work(struct callback_head *cb) WARN_ON_ONCE(ret); } -static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) +static inline void io_req_local_work_add(struct io_kiocb *req, + struct io_ring_ctx *ctx, + unsigned flags) { - struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; @@ -1191,6 +1143,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; + guard(rcu)(); + head = READ_ONCE(ctx->work_llist.first); do { nr_tw_prev = 0; @@ -1272,13 +1226,18 @@ static void io_req_normal_work_add(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { - if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - rcu_read_lock(); - io_req_local_work_add(req, flags); - rcu_read_unlock(); - } else { + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_req_local_work_add(req, req->ctx, flags); + else io_req_normal_work_add(req); - } +} + +void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, + unsigned flags) +{ + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + return; + io_req_local_work_add(req, ctx, flags); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) @@ -1467,7 +1426,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) } __io_cq_unlock_post(ctx); - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (!wq_list_empty(&state->compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } @@ -1813,14 +1772,14 @@ void io_wq_submit_work(struct io_wq_work *work) io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) { + if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: io_req_task_queue_fail(req, err); return; } if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); goto fail; } @@ -2649,6 +2608,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 726e6367af4d..e1ce908f0679 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -65,6 +65,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); 
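
io_add_aux_cqe() declared above exists for the msg_ring rework that follows: the target-ring CQE is now filled from task_work running on the target ring rather than through io_post_aux_cqe() under the remote ring's locks. The userspace feature being served is IORING_OP_MSG_RING, which lets one ring post a CQE into another; a hedged sketch, assuming the installed liburing provides io_uring_prep_msg_ring():

#include <liburing.h>

/* Post a wakeup CQE carrying 'cookie' as user_data into another ring. */
static int poke_other_ring(struct io_uring *my_ring, int target_ring_fd,
			   __u64 cookie)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(my_ring);

	if (!sqe)
		return -1;
	/* The third argument (0 here) becomes cqe->res on the target ring. */
	io_uring_prep_msg_ring(sqe, target_ring_fd, 0, cookie, 0);
	return io_uring_submit(my_ring);
}
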
void __io_commit_cqring_flush(struct io_ring_ctx *ctx); @@ -73,6 +74,8 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); +void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, + unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); @@ -104,12 +107,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -enum { - IO_EVENTFD_OP_SIGNAL_BIT, - IO_EVENTFD_OP_FREE_BIT, -}; - -void io_eventfd_ops(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 81c4a9d43729..29fa9285a33d 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,9 +11,9 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" +#include "alloc_cache.h" #include "msg_ring.h" - /* All valid masks for MSG_RING */ #define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ IORING_MSG_RING_FLAGS_PASS) @@ -68,59 +68,70 @@ void io_msg_ring_cleanup(struct io_kiocb *req) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { - if (!target_ctx->task_complete) - return false; - return current != target_ctx->submitter_task; + return target_ctx->task_complete; } -static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) +static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) { - struct io_ring_ctx *ctx = req->file->private_data; - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); - struct task_struct *task = READ_ONCE(ctx->submitter_task); + struct io_ring_ctx *ctx = req->ctx; - if (unlikely(!task)) - return -EOWNERDEAD; + io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); + if (spin_trylock(&ctx->msg_lock)) { + if (io_alloc_cache_put(&ctx->msg_cache, req)) + req = NULL; + spin_unlock(&ctx->msg_lock); + } + if (req) + kmem_cache_free(req_cachep, req); + percpu_ref_put(&ctx->refs); +} - init_task_work(&msg->tw, func); - if (task_work_add(task, &msg->tw, TWA_SIGNAL)) +static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, + int res, u32 cflags, u64 user_data) +{ + req->task = READ_ONCE(ctx->submitter_task); + if (!req->task) { + kmem_cache_free(req_cachep, req); return -EOWNERDEAD; + } + req->cqe.user_data = user_data; + io_req_set_res(req, res, cflags); + percpu_ref_get(&ctx->refs); + req->ctx = ctx; + req->io_task_work.func = io_msg_tw_complete; + io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); + return 0; +} - return IOU_ISSUE_SKIP_COMPLETE; +static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req = NULL; + + if (spin_trylock(&ctx->msg_lock)) { + req = io_alloc_cache_get(&ctx->msg_cache); + spin_unlock(&ctx->msg_lock); + } + if (req) + return req; + return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN); } -static void io_msg_tw_complete(struct callback_head *head) +static int io_msg_data_remote(struct io_kiocb *req) { - struct io_msg *msg = container_of(head, struct io_msg, tw); - struct io_kiocb *req = cmd_to_io_kiocb(msg); struct io_ring_ctx *target_ctx = req->file->private_data; - int ret = 0; - - if (current->flags & PF_EXITING) { - ret = -EOWNERDEAD; - } else { - 
u32 flags = 0; - - if (msg->flags & IORING_MSG_RING_FLAGS_PASS) - flags = msg->cqe_flags; - - /* - * If the target ring is using IOPOLL mode, then we need to be - * holding the uring_lock for posting completions. Other ring - * types rely on the regular completion locking, which is - * handled while posting. - */ - if (target_ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&target_ctx->uring_lock); - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = -EOVERFLOW; - if (target_ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&target_ctx->uring_lock); - } + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct io_kiocb *target; + u32 flags = 0; - if (ret < 0) - req_set_fail(req); - io_req_queue_tw_complete(req, ret); + target = io_msg_get_kiocb(req->ctx); + if (unlikely(!target)) + return -ENOMEM; + + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + + return io_msg_remote_post(target_ctx, target, msg->len, flags, + msg->user_data); } static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) @@ -138,7 +149,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) return -EBADFD; if (io_msg_need_remote(target_ctx)) - return io_msg_exec_remote(req, io_msg_tw_complete); + return io_msg_data_remote(req); if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; @@ -218,6 +229,22 @@ static void io_msg_tw_fd_complete(struct callback_head *head) io_req_queue_tw_complete(req, ret); } +static int io_msg_fd_remote(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct task_struct *task = READ_ONCE(ctx->submitter_task); + + if (unlikely(!task)) + return -EOWNERDEAD; + + init_task_work(&msg->tw, io_msg_tw_fd_complete); + if (task_work_add(task, &msg->tw, TWA_SIGNAL)) + return -EOWNERDEAD; + + return IOU_ISSUE_SKIP_COMPLETE; +} + static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; @@ -240,7 +267,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) } if (io_msg_need_remote(target_ctx)) - return io_msg_exec_remote(req, io_msg_tw_fd_complete); + return io_msg_fd_remote(req); return io_msg_install_complete(req, issue_flags); } @@ -294,3 +321,10 @@ done: io_req_set_res(req, ret, 0); return IOU_OK; } + +void io_msg_cache_free(const void *entry) +{ + struct io_kiocb *req = (struct io_kiocb *) entry; + + kmem_cache_free(req_cachep, req); +} diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h index 3987ee6c0e5f..3030f3942f0f 100644 --- a/io_uring/msg_ring.h +++ b/io_uring/msg_ring.h @@ -3,3 +3,4 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); void io_msg_ring_cleanup(struct io_kiocb *req); +void io_msg_cache_free(const void *entry); diff --git a/io_uring/napi.c b/io_uring/napi.c index 8c18ede595c4..762254a7ff3f 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -283,7 +283,7 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow s64 poll_to_ns = timespec64_to_ns(ts); if (poll_to_ns > 0) { u64 val = poll_to_ns + 999; - do_div(val, (s64) 1000); + do_div(val, 1000); poll_to = val; } } diff --git a/io_uring/net.c b/io_uring/net.c index 7c98c4d50946..7b75da2e7826 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -51,6 +51,16 @@ struct io_connect { bool seen_econnaborted; }; +struct 
io_bind { + struct file *file; + int addr_len; +}; + +struct io_listen { + struct file *file; + int backlog; +}; + struct io_sr_msg { struct file *file; union { @@ -817,20 +827,20 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, bool mshot_finished, unsigned issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - unsigned int cflags; - - if (sr->flags & IORING_RECVSEND_BUNDLE) - cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), - issue_flags); - else - cflags = io_put_kbuf(req, issue_flags); + unsigned int cflags = 0; if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - /* bundle with no more immediate buffers, we're done */ - if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) - goto finish; + if (sr->flags & IORING_RECVSEND_BUNDLE) { + cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), + issue_flags); + /* bundle with no more immediate buffers, we're done */ + if (req->flags & REQ_F_BL_EMPTY) + goto finish; + } else { + cflags |= io_put_kbuf(req, issue_flags); + } /* * Fill CQE for this receive and see if we should keep trying to @@ -1127,16 +1137,18 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: - kmsg->msg.msg_inq = -1; - kmsg->msg.msg_flags = 0; - if (io_do_buffer_select(req)) { ret = io_recv_buf_select(req, kmsg, &len, issue_flags); - if (unlikely(ret)) + if (unlikely(ret)) { + kmsg->msg.msg_inq = -1; goto out_free; + } sr->buf = NULL; } + kmsg->msg.msg_flags = 0; + kmsg->msg.msg_inq = -1; + if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); @@ -1715,6 +1727,70 @@ out: return IOU_OK; } +int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); + struct sockaddr __user *uaddr; + struct io_async_msghdr *io; + + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + bind->addr_len = READ_ONCE(sqe->addr2); + + io = io_msg_alloc_async(req); + if (unlikely(!io)) + return -ENOMEM; + return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr); +} + +int io_bind(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); + struct io_async_msghdr *io = req->async_data; + struct socket *sock; + int ret; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_bind_socket(sock, &io->addr, bind->addr_len); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return 0; +} + +int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen); + + if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2) + return -EINVAL; + + listen->backlog = READ_ONCE(sqe->len); + return 0; +} + +int io_listen(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen); + struct socket *sock; + int ret; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_listen_socket(sock, listen->backlog); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return 0; +} + void io_netmsg_cache_free(const void *entry) { struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; diff --git a/io_uring/net.h b/io_uring/net.h index 0eb1c1920fc9..52bfee05f06a 100644 --- 
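
io_bind_prep() and io_listen_prep() above take the sockaddr pointer from sqe->addr with its length in sqe->addr2, and the backlog from sqe->len, so socket setup can stay on the ring alongside the existing socket/connect/accept opcodes. A raw-SQE sketch of that encoding, assuming uapi headers new enough to define IORING_OP_BIND and IORING_OP_LISTEN (older kernels fail the opcodes as unsupported):

#include <stdint.h>
#include <string.h>
#include <netinet/in.h>
#include <linux/io_uring.h>

/* Fill two free SQEs with bind() and listen() on an already created socket. */
static void prep_bind_listen(struct io_uring_sqe *bind_sqe,
			     struct io_uring_sqe *listen_sqe,
			     int sockfd, const struct sockaddr_in *addr,
			     int backlog)
{
	memset(bind_sqe, 0, sizeof(*bind_sqe));
	bind_sqe->opcode = IORING_OP_BIND;
	bind_sqe->fd = sockfd;
	bind_sqe->addr = (unsigned long long)(uintptr_t)addr;
	bind_sqe->addr2 = sizeof(*addr);	/* addr_len, per io_bind_prep() */

	memset(listen_sqe, 0, sizeof(*listen_sqe));
	listen_sqe->opcode = IORING_OP_LISTEN;
	listen_sqe->fd = sockfd;
	listen_sqe->len = backlog;		/* per io_listen_prep() */
}
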
a/io_uring/net.h +++ b/io_uring/net.h @@ -49,6 +49,12 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); +int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_bind(struct io_kiocb *req, unsigned int issue_flags); + +int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_listen(struct io_kiocb *req, unsigned int issue_flags); + void io_netmsg_cache_free(const void *entry); #else static inline void io_netmsg_cache_free(const void *entry) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 2e3b7b16effb..a2be3bbca5ff 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -495,6 +495,26 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_ftruncate_prep, .issue = io_ftruncate, }, + [IORING_OP_BIND] = { +#if defined(CONFIG_NET) + .needs_file = 1, + .prep = io_bind_prep, + .issue = io_bind, + .async_size = sizeof(struct io_async_msghdr), +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_LISTEN] = { +#if defined(CONFIG_NET) + .needs_file = 1, + .prep = io_listen_prep, + .issue = io_listen, + .async_size = sizeof(struct io_async_msghdr), +#else + .prep = io_eopnotsupp_prep, +#endif + }, }; const struct io_cold_def io_cold_defs[] = { @@ -716,6 +736,12 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_FTRUNCATE] = { .name = "FTRUNCATE", }, + [IORING_OP_BIND] = { + .name = "BIND", + }, + [IORING_OP_LISTEN] = { + .name = "LISTEN", + }, }; const char *io_uring_get_opcode(u8 opcode) @@ -725,6 +751,14 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } +bool io_uring_op_supported(u8 opcode) +{ + if (opcode < IORING_OP_LAST && + io_issue_defs[opcode].prep != io_eopnotsupp_prep) + return true; + return false; +} + void __init io_uring_optable_init(void) { int i; diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 7ee6f5aa90aa..14456436ff74 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -17,8 +17,6 @@ struct io_issue_def { unsigned poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; /* skip auditing */ unsigned audit_skip : 1; /* supports ioprio */ @@ -47,5 +45,7 @@ struct io_cold_def { extern const struct io_issue_def io_issue_defs[]; extern const struct io_cold_def io_cold_defs[]; +bool io_uring_op_supported(u8 opcode); + void io_uring_optable_init(void); #endif diff --git a/io_uring/register.c b/io_uring/register.c index c0010a66a6f2..e3c20be5a198 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -27,65 +27,11 @@ #include "cancel.h" #include "kbuf.h" #include "napi.h" +#include "eventfd.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - - spin_lock(&ctx->completion_lock); - ctx->evfd_last_cq_tail = 
ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - atomic_set(&ev_fd->refs, 1); - atomic_set(&ev_fd->ops, 0); - return 0; -} - -int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) - call_rcu(&ev_fd->rcu, io_eventfd_ops); - return 0; - } - - return -ENXIO; -} - static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { @@ -93,9 +39,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, size_t size; int i, ret; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + size = struct_size(p, ops, nr_args); - if (size == SIZE_MAX) - return -EOVERFLOW; p = kzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; @@ -108,12 +55,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, goto out; p->last_op = IORING_OP_LAST - 1; - if (nr_args > IORING_OP_LAST) - nr_args = IORING_OP_LAST; for (i = 0; i < nr_args; i++) { p->ops[i].op = i; - if (!io_issue_defs[i].not_supported) + if (io_uring_op_supported(i)) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 570bfa6a31aa..a860516bf448 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -85,31 +85,6 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, - void __user *arg, unsigned index) -{ - struct iovec __user *src; - -#ifdef CONFIG_COMPAT - if (ctx->compat) { - struct compat_iovec __user *ciovs; - struct compat_iovec ciov; - - ciovs = (struct compat_iovec __user *) arg; - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) - return -EFAULT; - - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); - dst->iov_len = ciov.iov_len; - return 0; - } -#endif - src = (struct iovec __user *) arg; - if (copy_from_user(dst, &src[index], sizeof(*dst))) - return -EFAULT; - return 0; -} - static int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); @@ -249,7 +224,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = io_run_task_work_sig(ctx); if (ret < 0) { - __set_current_state(TASK_RUNNING); + finish_wait(&ctx->rsrc_quiesce_wq, &we); mutex_lock(&ctx->uring_lock); if (list_empty(&ctx->rsrc_ref_list)) ret = 0; @@ -257,7 +232,6 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } schedule(); - __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); ret = 0; } while (!list_empty(&ctx->rsrc_ref_list)); @@ -420,8 +394,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { + struct iovec __user *uvec = u64_to_user_ptr(up->data); u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); + struct iovec fast_iov, *iov; struct page *last_hpage = NULL; __u32 done; int i, err; @@ -435,21 +410,23 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu; u64 tag = 0; - err = io_copy_iov(ctx, &iov, iovs, done); - if (err) + iov = iovec_from_user(&uvec[done], 1, 1, &fast_iov, ctx->compat); + if (IS_ERR(iov)) { + err 
= PTR_ERR(iov); break; + } if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; } - err = io_buffer_validate(&iov); + err = io_buffer_validate(iov); if (err) break; - if (!iov.iov_base && tag) { + if (!iov->iov_base && tag) { err = -EINVAL; break; } - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); + err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage); if (err) break; @@ -971,8 +948,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, { struct page *last_hpage = NULL; struct io_rsrc_data *data; + struct iovec fast_iov, *iov = &fast_iov; + const struct iovec __user *uvec = (struct iovec * __user) arg; int i, ret; - struct iovec iov; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); @@ -989,24 +967,27 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } + if (!arg) + memset(iov, 0, sizeof(*iov)); + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { if (arg) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) + iov = iovec_from_user(&uvec[i], 1, 1, &fast_iov, ctx->compat); + if (IS_ERR(iov)) { + ret = PTR_ERR(iov); break; - ret = io_buffer_validate(&iov); + } + ret = io_buffer_validate(iov); if (ret) break; - } else { - memset(&iov, 0, sizeof(iov)); } - if (!iov.iov_base && *io_get_tag_slot(data, i)) { + if (!iov->iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; } - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], + ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i], &last_hpage); if (ret) break; diff --git a/io_uring/rw.c b/io_uring/rw.c index 1a2128459cb4..c004d21e2f12 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -772,7 +772,7 @@ static bool need_complete_io(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -787,7 +787,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) req->flags |= io_file_get_flags(file); kiocb->ki_flags = file->f_iocb_flags; - ret = kiocb_set_rw_flags(kiocb, rw->flags); + ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; kiocb->ki_flags |= IOCB_ALLOC_CACHE; @@ -832,8 +832,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } - - ret = io_rw_init_file(req, FMODE_READ); + ret = io_rw_init_file(req, FMODE_READ, READ); if (unlikely(ret)) return ret; req->cqe.res = iov_iter_count(&io->iter); @@ -1013,7 +1012,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret, ret2; loff_t *ppos; - ret = io_rw_init_file(req, FMODE_WRITE); + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) return ret; req->cqe.res = iov_iter_count(&io->iter); diff --git a/io_uring/statx.c b/io_uring/statx.c index abb874209caa..f7f9b202eec0 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -37,8 +37,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sx->flags = READ_ONCE(sqe->statx_flags); sx->filename = getname_flags(path, - getname_statx_lookup_flags(sx->flags), - NULL); + getname_statx_lookup_flags(sx->flags)); if (IS_ERR(sx->filename)) { int ret = PTR_ERR(sx->filename); diff --git a/io_uring/xattr.c b/io_uring/xattr.c index 44905b82eea8..6cf41c3bc369 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -96,7 +96,7 @@ int 
io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + ix->filename = getname_flags(path, LOOKUP_FOLLOW); if (IS_ERR(ix->filename)) { ret = PTR_ERR(ix->filename); ix->filename = NULL; @@ -189,7 +189,7 @@ int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + ix->filename = getname_flags(path, LOOKUP_FOLLOW); if (IS_ERR(ix->filename)) { ret = PTR_ERR(ix->filename); ix->filename = NULL; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 5eea4dc0509e..a7cbd69efbef 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -903,7 +903,8 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, audit_mq_open(oflag, mode, attr); - if (IS_ERR(name = getname(u_name))) + name = getname(u_name); + if (IS_ERR(name)) return PTR_ERR(name); fd = get_unused_fd_flags(O_CLOEXEC); diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 976cb258a0ed..c938dea5ddbf 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); - smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), - nbuckets, GFP_USER | __GFP_NOWARN); + smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, + sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); if (!smap->buckets) { err = -ENOMEM; goto free_smap; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2a69a9a36c0f..3243c83ef3e3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1084,7 +1084,10 @@ struct bpf_async_cb { struct bpf_prog *prog; void __rcu *callback_fn; void *value; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct work_struct delete_work; + }; u64 flags; }; @@ -1107,6 +1110,7 @@ struct bpf_async_cb { struct bpf_hrtimer { struct bpf_async_cb cb; struct hrtimer timer; + atomic_t cancelling; }; struct bpf_work { @@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work) kfree_rcu(w, cb.rcu); } +static void bpf_timer_delete_work(struct work_struct *work) +{ + struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); + + /* Cancel the timer and wait for callback to complete if it was running. + * If hrtimer_cancel() can be safely called it's safe to call + * kfree_rcu(t) right after for both preallocated and non-preallocated + * maps. The async->cb = NULL was already done and no code path can see + * address 't' anymore. Timer if armed for existing bpf_hrtimer before + * bpf_timer_cancel_and_free will have been cancelled. 
+ */ + hrtimer_cancel(&t->timer); + kfree_rcu(t, cb.rcu); +} + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { @@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; + atomic_set(&t->cancelling, 0); + INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; cb->value = (void *)async - map->record->timer_off; @@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async) BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { - struct bpf_hrtimer *t; + struct bpf_hrtimer *t, *cur_t; + bool inc = false; int ret = 0; if (in_nmi()) @@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) ret = -EINVAL; goto out; } - if (this_cpu_read(hrtimer_running) == t) { + + cur_t = this_cpu_read(hrtimer_running); + if (cur_t == t) { /* If bpf callback_fn is trying to bpf_timer_cancel() * its own timer the hrtimer_cancel() will deadlock - * since it waits for callback_fn to finish + * since it waits for callback_fn to finish. */ ret = -EDEADLK; goto out; } + + /* Only account in-flight cancellations when invoked from a timer + * callback, since we want to avoid waiting only if other _callbacks_ + * are waiting on us, to avoid introducing lockups. Non-callback paths + * are ok, since nobody would synchronously wait for their completion. + */ + if (!cur_t) + goto drop; + atomic_inc(&t->cancelling); + /* Need full barrier after relaxed atomic_inc */ + smp_mb__after_atomic(); + inc = true; + if (atomic_read(&cur_t->cancelling)) { + /* We're cancelling timer t, while some other timer callback is + * attempting to cancel us. In such a case, it might be possible + * that timer t belongs to the other callback, or some other + * callback waiting upon it (creating transitive dependencies + * upon us), and we will enter a deadlock if we continue + * cancelling and waiting for it synchronously, since it might + * do the same. Bail! + */ + ret = -EDEADLK; + goto out; + } +drop: drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); @@ -1467,6 +1516,8 @@ out: * if it was running. */ ret = ret ?: hrtimer_cancel(&t->timer); + if (inc) + atomic_dec(&t->cancelling); rcu_read_unlock(); return ret; } @@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val) if (!t) return; - /* Cancel the timer and wait for callback to complete if it was running. - * If hrtimer_cancel() can be safely called it's safe to call kfree(t) - * right after for both preallocated and non-preallocated maps. - * The async->cb = NULL was already done and no code path can - * see address 't' anymore. - * - * Check that bpf_map_delete/update_elem() wasn't called from timer - * callback_fn. In such case don't call hrtimer_cancel() (since it will - * deadlock) and don't call hrtimer_try_to_cancel() (since it will just - * return -1). Though callback_fn is still running on this cpu it's + /* We check that bpf_map_delete/update_elem() was called from timer + * callback_fn. In such case we don't call hrtimer_cancel() (since it + * will deadlock) and don't call hrtimer_try_to_cancel() (since it will + * just return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. 
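
The new cancelling counter and the -EDEADLK checks above reject the case where two timer callbacks try to cancel each other's timers, which previously could leave both CPUs spinning inside hrtimer_cancel(). From a BPF program the helpers look unchanged; a minimal sketch of arming and cancelling a map-held timer, assuming libbpf-style headers, with the fentry hook chosen purely as an example attach point (clockid 1 is CLOCK_MONOTONIC):

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct elem {
	struct bpf_timer t;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} timer_map SEC(".maps");

static int timer_cb(void *map, int *key, struct bpf_timer *timer)
{
	/* Self-cancel returns -EDEADLK; with this patch, cancelling another
	 * timer whose callback is concurrently cancelling this one is
	 * rejected the same way instead of risking a lockup. */
	bpf_timer_cancel(timer);
	return 0;
}

SEC("fentry/do_nanosleep")
int BPF_PROG(arm_timer)
{
	int key = 0;
	struct elem *e = bpf_map_lookup_elem(&timer_map, &key);

	if (!e)
		return 0;
	bpf_timer_init(&e->t, &timer_map, 1 /* CLOCK_MONOTONIC */);
	bpf_timer_set_callback(&e->t, timer_cb);
	bpf_timer_start(&e->t, 1000000000 /* 1s */, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
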
The bpf subprog callback_fn won't be able to access 't', * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. + * + * However, it is possible the timer callback_fn calling us armed the + * timer _before_ calling us, such that failing to cancel it here will + * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. + * Therefore, we _need_ to cancel any outstanding timers before we do + * kfree_rcu, even though no more timers can be armed. + * + * Moreover, we need to schedule work even if timer does not belong to + * the calling callback_fn, as on two different CPUs, we can end up in a + * situation where both sides run in parallel, try to cancel one + * another, and we end up waiting on both sides in hrtimer_cancel + * without making forward progress, since timer1 depends on time2 + * callback to finish, and vice versa. + * + * CPU 1 (timer1_cb) CPU 2 (timer2_cb) + * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1) + * + * To avoid these issues, punt to workqueue context when we are in a + * timer callback. */ - if (this_cpu_read(hrtimer_running) != t) - hrtimer_cancel(&t->timer); - kfree_rcu(t, cb.rcu); + if (this_cpu_read(hrtimer_running)) + queue_work(system_unbound_wq, &t->cb.delete_work); + else + bpf_timer_delete_work(&t->cb.delete_work); } /* This function is called by map_delete/update_elem for individual element and diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e32b6972c478..c8e4b62b436a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1744,8 +1744,11 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (cgroup_psi_enabled()) { ret = cgroup_addrm_files(css, cgrp, cgroup_psi_files, true); - if (ret < 0) + if (ret < 0) { + cgroup_addrm_files(css, cgrp, + cgroup_base_files, false); return ret; + } } } else { ret = cgroup_addrm_files(css, cgrp, @@ -1839,9 +1842,9 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; - css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); + css->cgroup = dcgrp; WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], e_cset_node[ss->id]) { @@ -1922,6 +1925,7 @@ enum cgroup2_param { Opt_memory_localevents, Opt_memory_recursiveprot, Opt_memory_hugetlb_accounting, + Opt_pids_localevents, nr__cgroup2_params }; @@ -1931,6 +1935,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), + fsparam_flag("pids_localevents", Opt_pids_localevents), {} }; @@ -1960,6 +1965,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_memory_hugetlb_accounting: ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; return 0; + case Opt_pids_localevents: + ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; + return 0; } return -EINVAL; } @@ -1989,6 +1997,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + + if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; + else + cgrp_dfl_root.flags &= 
~CGRP_ROOT_PIDS_LOCAL_EVENTS; } } @@ -2004,6 +2017,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root seq_puts(seq, ",memory_recursiveprot"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) seq_puts(seq, ",memory_hugetlb_accounting"); + if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + seq_puts(seq, ",pids_localevents"); return 0; } @@ -6686,8 +6701,10 @@ void cgroup_exit(struct task_struct *tsk) WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; + /* matches the signal->live check in css_task_iter_advance() */ + if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) + list_add_tail(&tsk->cg_list, &cset->dying_tasks); if (dl_task(tsk)) dec_dl_tasks_cs(tsk); @@ -6714,10 +6731,12 @@ void cgroup_release(struct task_struct *task) ss->release(task); } while_each_subsys_mask(); - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); + if (!list_empty(&task->cg_list)) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } } void cgroup_free(struct task_struct *task) @@ -7062,7 +7081,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n" - "memory_hugetlb_accounting\n"); + "memory_hugetlb_accounting\n" + "pids_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c12b9fdb22a4..40ec4abaf440 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,6 +21,7 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ +#include "cgroup-internal.h" #include <linux/cpu.h> #include <linux/cpumask.h> @@ -87,7 +88,7 @@ static const char * const perr_strings[] = { [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", - [PERR_CPUSEMPTY] = "cpuset.cpus is empty", + [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", }; @@ -127,19 +128,28 @@ struct cpuset { /* * Exclusive CPUs dedicated to current cgroup (default hierarchy only) * - * This exclusive CPUs must be a subset of cpus_allowed. A parent - * cgroup can only grant exclusive CPUs to one of its children. + * The effective_cpus of a valid partition root comes solely from its + * effective_xcpus and some of the effective_xcpus may be distributed + * to sub-partitions below & hence excluded from its effective_cpus. + * For a valid partition root, its effective_cpus have no relationship + * with cpus_allowed unless its exclusive_cpus isn't set. * - * When the cgroup becomes a valid partition root, effective_xcpus - * defaults to cpus_allowed if not set. The effective_cpus of a valid - * partition root comes solely from its effective_xcpus and some of the - * effective_xcpus may be distributed to sub-partitions below & hence - * excluded from its effective_cpus. + * This value will only be set if either exclusive_cpus is set or + * when this cpuset becomes a local partition root. 
*/ cpumask_var_t effective_xcpus; /* * Exclusive CPUs as requested by the user (default hierarchy only) + * + * Its value is independent of cpus_allowed and designates the set of + * CPUs that can be granted to the current cpuset or its children when + * it becomes a valid partition root. The effective set of exclusive + * CPUs granted (effective_xcpus) depends on whether those exclusive + * CPUs are passed down by its ancestors and not yet taken up by + * another sibling partition root along the way. + * + * If its value isn't set, it defaults to cpus_allowed. */ cpumask_var_t exclusive_cpus; @@ -169,7 +179,7 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* number of valid sub-partitions */ + /* number of valid local child partitions */ int nr_subparts; /* partition root state */ @@ -230,6 +240,17 @@ static struct list_head remote_children; * 2 - partition root without load balancing (isolated) * -1 - invalid partition root * -2 - invalid isolated partition root + * + * There are 2 types of partitions - local or remote. Local partitions are + * those whose parents are partition root themselves. Setting of + * cpuset.cpus.exclusive are optional in setting up local partitions. + * Remote partitions are those whose parents are not partition roots. Passing + * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor + * nodes are mandatory in creating a remote partition. + * + * For simplicity, a local partition can be created under a local or remote + * partition but a remote partition cannot have any partition root in its + * ancestor chain except the cgroup root. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -434,7 +455,7 @@ static struct cpuset top_cpuset = { * by other task, we use alloc_lock in the task_struct fields to protect * them. * - * The cpuset_common_file_read() handlers only hold callback_lock across + * The cpuset_common_seq_show() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * @@ -709,6 +730,19 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } +/* Return user specified exclusive CPUs */ +static inline struct cpumask *user_xcpus(struct cpuset *cs) +{ + return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed + : cs->exclusive_cpus; +} + +static inline bool xcpus_empty(struct cpuset *cs) +{ + return cpumask_empty(cs->cpus_allowed) && + cpumask_empty(cs->exclusive_cpus); +} + static inline struct cpumask *fetch_xcpus(struct cpuset *cs) { return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : @@ -825,17 +859,41 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* * If either I or some sibling (!= me) is exclusive, we can't - * overlap + * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur) { + bool txset, cxset; /* Are exclusive_cpus set? */ + + if (c == cur) + continue; + + txset = !cpumask_empty(trial->exclusive_cpus); + cxset = !cpumask_empty(c->exclusive_cpus); + if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || + (txset && cxset)) { if (!cpusets_are_exclusive(trial, c)) goto out; + } else if (txset || cxset) { + struct cpumask *xcpus, *acpus; + + /* + * When just one of the exclusive_cpus's is set, + * cpus_allowed of the other cpuset, if set, cannot be + * a subset of it or none of those CPUs will be + * available if these exclusive CPUs are activated. 
+ */ + if (txset) { + xcpus = trial->exclusive_cpus; + acpus = c->cpus_allowed; + } else { + xcpus = c->exclusive_cpus; + acpus = trial->cpus_allowed; + } + if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) + goto out; } if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) goto out; } @@ -957,13 +1015,15 @@ static int generate_sched_domains(cpumask_var_t **domains, int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); + bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && !top_cpuset.nr_subparts) { + if (root_load_balance && cpumask_empty(subpartitions_cpus)) { +single_root_domain: ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -991,16 +1051,18 @@ static int generate_sched_domains(cpumask_var_t **domains, cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; + + if (cgrpv2) + goto v2; + /* + * v1: * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The * latter: All child cpusets contain a subset of the * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. - * - * If root is load-balancing, we can skip @cp if it - * is a subset of the root's effective_cpus. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && @@ -1008,20 +1070,39 @@ static int generate_sched_domains(cpumask_var_t **domains, housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; - if (root_load_balance && - cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) - continue; - if (is_sched_load_balance(cp) && !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; - /* skip @cp's subtree if not a partition root */ - if (!is_partition_valid(cp)) + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + +v2: + /* + * Only valid partition roots that are not isolated and with + * non-empty effective_cpus will be saved into csn[]. + */ + if ((cp->partition_root_state == PRS_ROOT) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* + * Skip @cp's subtree if not a partition root and has no + * exclusive CPUs to be granted to child cpusets. + */ + if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); + /* + * If there are only isolated partitions underneath the cgroup root, + * we can optimize out unneeded sched domains scanning. + */ + if (root_load_balance && (csn == 1)) + goto single_root_domain; + for (i = 0; i < csn; i++) csa[i]->pn = i; ndoms = csn; @@ -1064,6 +1145,20 @@ restart: dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), GFP_KERNEL); + /* + * Cgroup v2 doesn't support domain attributes, just set all of them + * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a + * subset of HK_TYPE_DOMAIN housekeeping CPUs. 
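The cgroup v2 branch added to generate_sched_domains() above reduces domain generation to copying effective_cpus from each valid, load-balanced partition root, with every attribute left at SD_ATTR_INIT. The following is a toy sketch of that selection only, reusing the PRS_ROOT value defined in the diff but replacing the cpuset tree walk with a flat array; it is an illustration, not the kernel routine.

        #include <stdio.h>

        #define PRS_ROOT 1      /* matches the diff's partition root state */

        struct part { int prs; unsigned long effective_cpus; };

        static int build_domains(const struct part *p, int n, unsigned long *doms)
        {
                int ndoms = 0;

                for (int i = 0; i < n; i++) {
                        /* isolated/invalid roots and empty masks contribute nothing */
                        if (p[i].prs == PRS_ROOT && p[i].effective_cpus)
                                doms[ndoms++] = p[i].effective_cpus; /* attrs stay SD_ATTR_INIT */
                }
                return ndoms;
        }

        int main(void)
        {
                struct part roots[] = {
                        { PRS_ROOT, 0x03 },
                        { 2 /* isolated */, 0x04 },
                        { PRS_ROOT, 0x30 },
                };
                unsigned long doms[3];
                int n = build_domains(roots, 3, doms);

                for (int i = 0; i < n; i++)
                        printf("domain %d: 0x%lx\n", i, doms[i]);
                return 0;
        }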
+ */ + if (cgrpv2) { + for (i = 0; i < ndoms; i++) { + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; + } + goto done; + } + for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; struct cpumask *dp; @@ -1223,7 +1318,7 @@ static void rebuild_sched_domains_locked(void) * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ - if (top_cpuset.nr_subparts) { + if (!cpumask_empty(subpartitions_cpus)) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!is_partition_valid(cs)) { @@ -1338,7 +1433,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, */ static int update_partition_exclusive(struct cpuset *cs, int new_prs) { - bool exclusive = (new_prs > 0); + bool exclusive = (new_prs > PRS_MEMBER); if (exclusive && !is_cpu_exclusive(cs)) { if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) @@ -1532,7 +1627,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); * Return: true if xcpus is not empty, false otherwise. * * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), - * it must be a subset of cpus_allowed and parent's effective_xcpus. + * it must be a subset of parent's effective_xcpus. */ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, struct cpumask *xcpus) @@ -1542,12 +1637,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, if (!xcpus) xcpus = cs->effective_xcpus; - if (!cpumask_empty(cs->exclusive_cpus)) - cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); - else - cpumask_copy(xcpus, cs->cpus_allowed); - - return cpumask_and(xcpus, xcpus, parent->effective_xcpus); + return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); } static inline bool is_remote_partition(struct cpuset *cs) @@ -1826,8 +1916,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; - xcpus = !cpumask_empty(cs->exclusive_cpus) - ? cs->effective_xcpus : cs->cpus_allowed; + xcpus = user_xcpus(cs); if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs)) @@ -1855,7 +1944,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, return is_partition_invalid(parent) ? PERR_INVPARENT : PERR_NOTPART; } - if (!newmask && cpumask_empty(cs->cpus_allowed)) + if (!newmask && xcpus_empty(cs)) return PERR_CPUSEMPTY; nocpu = tasks_nocpu_error(parent, cs, xcpus); @@ -2583,8 +2672,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, retval = cpulist_parse(buf, trialcs->exclusive_cpus); if (retval < 0) return retval; - if (!is_cpu_exclusive(cs)) - set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags); } /* Nothing to do if the CPUs didn't change */ @@ -3071,9 +3158,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) ? partcmd_enable : partcmd_enablei; /* - * cpus_allowed cannot be empty. + * cpus_allowed and exclusive_cpus cannot be both empty. 
*/ - if (cpumask_empty(cs->cpus_allowed)) { + if (xcpus_empty(cs)) { err = PERR_CPUSEMPTY; goto out; } @@ -4009,8 +4096,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) } __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - nodes_clear(cs->mems_allowed); - nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; INIT_LIST_HEAD(&cs->remote_sibling); @@ -4040,6 +4125,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + /* + * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset_inc(); @@ -4050,14 +4141,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->use_parent_ecpus = true; parent->child_ecpus_count++; } - - /* - * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated - */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_sched_load_balance(parent)) - clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) @@ -4571,7 +4654,7 @@ static void cpuset_handle_hotplug(void) * In the rare case that hotplug removes all the cpus in * subpartitions_cpus, we assumed that cpus are updated. */ - if (!cpus_updated && top_cpuset.nr_subparts) + if (!cpus_updated && !cpumask_empty(subpartitions_cpus)) cpus_updated = true; /* For v1, synchronize cpus_allowed to cpu_active_mask */ @@ -5051,10 +5134,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - css = task_get_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - css_put(css); + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 79a3717a5803..0e26068995a6 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -121,6 +121,30 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, misc_res_name[type]); } +static void misc_cg_update_watermark(struct misc_res *res, u64 new_usage) +{ + u64 old; + + while (true) { + old = atomic64_read(&res->watermark); + if (new_usage <= old) + break; + if (atomic64_cmpxchg(&res->watermark, old, new_usage) == old) + break; + } +} + +static void misc_cg_event(enum misc_res_type type, struct misc_cg *cg) +{ + atomic64_inc(&cg->res[type].events_local); + cgroup_file_notify(&cg->events_local_file); + + for (; parent_misc(cg); cg = parent_misc(cg)) { + atomic64_inc(&cg->res[type].events); + cgroup_file_notify(&cg->events_file); + } +} + /** * misc_cg_try_charge() - Try charging the misc cgroup. * @type: Misc res type to charge. 
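The misc controller's new watermark tracking above is a lock-free monotonic-maximum update. Here is a small stand-alone sketch of the same loop, rewritten with C11 atomics so it builds in user space; the kernel version uses atomic64_read()/atomic64_cmpxchg(), but the retry structure is the same idea.

        #include <stdatomic.h>
        #include <stdint.h>
        #include <stdio.h>

        static _Atomic uint64_t watermark;

        static void update_watermark(uint64_t new_usage)
        {
                uint64_t old = atomic_load(&watermark);

                /* Raise the high-water mark monotonically; if we lose the race,
                 * 'old' is reloaded by the failed CAS and we re-check. */
                while (new_usage > old &&
                       !atomic_compare_exchange_weak(&watermark, &old, new_usage))
                        ;
        }

        int main(void)
        {
                update_watermark(5);
                update_watermark(3);    /* lower usage never shrinks the peak */
                update_watermark(9);
                printf("peak %llu\n", (unsigned long long)atomic_load(&watermark));
                return 0;
        }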
@@ -159,14 +183,12 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount) ret = -EBUSY; goto err_charge; } + misc_cg_update_watermark(res, new_usage); } return 0; err_charge: - for (j = i; j; j = parent_misc(j)) { - atomic64_inc(&j->res[type].events); - cgroup_file_notify(&j->events_file); - } + misc_cg_event(type, i); for (j = cg; j != i; j = parent_misc(j)) misc_cg_cancel_charge(type, j, amount); @@ -308,6 +330,29 @@ static int misc_cg_current_show(struct seq_file *sf, void *v) } /** + * misc_cg_peak_show() - Show the peak usage of the misc cgroup. + * @sf: Interface file + * @v: Arguments passed + * + * Context: Any context. + * Return: 0 to denote successful print. + */ +static int misc_cg_peak_show(struct seq_file *sf, void *v) +{ + int i; + u64 watermark; + struct misc_cg *cg = css_misc(seq_css(sf)); + + for (i = 0; i < MISC_CG_RES_TYPES; i++) { + watermark = atomic64_read(&cg->res[i].watermark); + if (READ_ONCE(misc_res_capacity[i]) || watermark) + seq_printf(sf, "%s %llu\n", misc_res_name[i], watermark); + } + + return 0; +} + +/** * misc_cg_capacity_show() - Show the total capacity of misc res on the host. * @sf: Interface file * @v: Arguments passed @@ -331,20 +376,33 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v) return 0; } -static int misc_events_show(struct seq_file *sf, void *v) +static int __misc_events_show(struct seq_file *sf, bool local) { struct misc_cg *cg = css_misc(seq_css(sf)); u64 events; int i; for (i = 0; i < MISC_CG_RES_TYPES; i++) { - events = atomic64_read(&cg->res[i].events); + if (local) + events = atomic64_read(&cg->res[i].events_local); + else + events = atomic64_read(&cg->res[i].events); if (READ_ONCE(misc_res_capacity[i]) || events) seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events); } return 0; } +static int misc_events_show(struct seq_file *sf, void *v) +{ + return __misc_events_show(sf, false); +} + +static int misc_events_local_show(struct seq_file *sf, void *v) +{ + return __misc_events_show(sf, true); +} + /* Misc cgroup interface files */ static struct cftype misc_cg_files[] = { { @@ -358,6 +416,10 @@ static struct cftype misc_cg_files[] = { .seq_show = misc_cg_current_show, }, { + .name = "peak", + .seq_show = misc_cg_peak_show, + }, + { .name = "capacity", .seq_show = misc_cg_capacity_show, .flags = CFTYPE_ONLY_ON_ROOT, @@ -368,6 +430,12 @@ static struct cftype misc_cg_files[] = { .file_offset = offsetof(struct misc_cg, events_file), .seq_show = misc_events_show, }, + { + .name = "events.local", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct misc_cg, events_local_file), + .seq_show = misc_events_local_show, + }, {} }; diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 0e5ec7d59b4d..f5cb0ec45b9d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -38,6 +38,14 @@ #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX_STR "max" +enum pidcg_event { + /* Fork failed in subtree because this pids_cgroup limit was hit. */ + PIDCG_MAX, + /* Fork failed in this pids_cgroup because ancestor limit was hit. */ + PIDCG_FORKFAIL, + NR_PIDCG_EVENTS, +}; + struct pids_cgroup { struct cgroup_subsys_state css; @@ -49,11 +57,12 @@ struct pids_cgroup { atomic64_t limit; int64_t watermark; - /* Handle for "pids.events" */ + /* Handles for pids.events[.local] */ struct cgroup_file events_file; + struct cgroup_file events_local_file; - /* Number of times fork failed because limit was hit. 
*/ - atomic64_t events_limit; + atomic64_t events[NR_PIDCG_EVENTS]; + atomic64_t events_local[NR_PIDCG_EVENTS]; }; static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) @@ -148,12 +157,13 @@ static void pids_charge(struct pids_cgroup *pids, int num) * pids_try_charge - hierarchically try to charge the pid count * @pids: the pid cgroup state * @num: the number of pids to charge + * @fail: storage of pid cgroup causing the fail * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge * succeeded, otherwise -EAGAIN. */ -static int pids_try_charge(struct pids_cgroup *pids, int num) +static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail) { struct pids_cgroup *p, *q; @@ -166,9 +176,10 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) * p->limit is %PIDS_MAX then we know that this test will never * fail. */ - if (new > limit) + if (new > limit) { + *fail = p; goto revert; - + } /* * Not technically accurate if we go over limit somewhere up * the hierarchy, but that's tolerable for the watermark. @@ -229,6 +240,36 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) } } +static void pids_event(struct pids_cgroup *pids_forking, + struct pids_cgroup *pids_over_limit) +{ + struct pids_cgroup *p = pids_forking; + bool limit = false; + + /* Only log the first time limit is hit. */ + if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) { + pr_info("cgroup: fork rejected by pids controller in "); + pr_cont_cgroup_path(p->css.cgroup); + pr_cont("\n"); + } + cgroup_file_notify(&p->events_local_file); + if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || + cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + return; + + for (; parent_pids(p); p = parent_pids(p)) { + if (p == pids_over_limit) { + limit = true; + atomic64_inc(&p->events_local[PIDCG_MAX]); + cgroup_file_notify(&p->events_local_file); + } + if (limit) + atomic64_inc(&p->events[PIDCG_MAX]); + + cgroup_file_notify(&p->events_file); + } +} + /* * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on cgroup_threadgroup_change_begin() held by the copy_process(). @@ -236,7 +277,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) static int pids_can_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; - struct pids_cgroup *pids; + struct pids_cgroup *pids, *pids_over_limit; int err; if (cset) @@ -244,16 +285,10 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset) else css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - err = pids_try_charge(pids, 1); - if (err) { - /* Only log the first time events_limit is incremented. 
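The reworked pids controller above splits failure accounting into a local counter on the cgroup whose fork actually failed and hierarchical "max" counters that begin at the ancestor whose limit was hit and propagate upward. Below is a compact user-space model of that walk, showing the default cgroup2 behaviour only (the pids_localevents mount option, legacy hierarchy early return, and file notifications are left out); names here are invented for the sketch.

        #include <stdio.h>

        struct pcg {
                const char *name;
                struct pcg *parent;
                long forkfail_local;    /* fork failures seen in this cgroup itself */
                long max_local;         /* this cgroup's own limit was hit */
                long max_hier;          /* limit hit somewhere in this subtree */
        };

        static void pids_event(struct pcg *forking, struct pcg *over_limit)
        {
                int seen_limit = 0;

                forking->forkfail_local++;              /* always charged locally */
                for (struct pcg *p = forking; p->parent; p = p->parent) {
                        if (p == over_limit) {
                                seen_limit = 1;
                                p->max_local++;         /* the limit-hitting cgroup */
                        }
                        if (seen_limit)
                                p->max_hier++;          /* ancestors above it, too */
                }
        }

        int main(void)
        {
                struct pcg root = { "root", 0 }, mid = { "mid", &root }, leaf = { "leaf", &mid };

                pids_event(&leaf, &mid);        /* leaf forked, mid's pids.max was hit */
                printf("leaf forkfail=%ld  mid max(local=%ld hier=%ld)\n",
                       leaf.forkfail_local, mid.max_local, mid.max_hier);
                return 0;
        }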
*/ - if (atomic64_inc_return(&pids->events_limit) == 1) { - pr_info("cgroup: fork rejected by pids controller in "); - pr_cont_cgroup_path(css->cgroup); - pr_cont("\n"); - } - cgroup_file_notify(&pids->events_file); - } + err = pids_try_charge(pids, 1, &pids_over_limit); + if (err) + pids_event(pids, pids_over_limit); + return err; } @@ -337,11 +372,32 @@ static s64 pids_peak_read(struct cgroup_subsys_state *css, return READ_ONCE(pids->watermark); } -static int pids_events_show(struct seq_file *sf, void *v) +static int __pids_events_show(struct seq_file *sf, bool local) { struct pids_cgroup *pids = css_pids(seq_css(sf)); + enum pidcg_event pe = PIDCG_MAX; + atomic64_t *events; + + if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || + cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) { + pe = PIDCG_FORKFAIL; + local = true; + } + events = local ? pids->events_local : pids->events; + + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe])); + return 0; +} - seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); +static int pids_events_show(struct seq_file *sf, void *v) +{ + __pids_events_show(sf, false); + return 0; +} + +static int pids_events_local_show(struct seq_file *sf, void *v) +{ + __pids_events_show(sf, true); return 0; } @@ -368,9 +424,42 @@ static struct cftype pids_files[] = { .file_offset = offsetof(struct pids_cgroup, events_file), .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "events.local", + .seq_show = pids_events_local_show, + .file_offset = offsetof(struct pids_cgroup, events_local_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +static struct cftype pids_files_legacy[] = { + { + .name = "max", + .write = pids_max_write, + .seq_show = pids_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .read_s64 = pids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "peak", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = pids_peak_read, + }, + { + .name = "events", + .seq_show = pids_events_show, + .file_offset = offsetof(struct pids_cgroup, events_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, { } /* terminate */ }; + struct cgroup_subsys pids_cgrp_subsys = { .css_alloc = pids_css_alloc, .css_free = pids_css_free, @@ -379,7 +468,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, .release = pids_release, - .legacy_cftypes = pids_files, + .legacy_cftypes = pids_files_legacy, .dfl_cftypes = pids_files, .threaded = true, }; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index fb8b49437573..a06b45272411 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -594,49 +594,46 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) } } + +static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat) +{ +#ifdef CONFIG_SCHED_CORE + u64 forceidle_time = bstat->forceidle_sum; + + do_div(forceidle_time, NSEC_PER_USEC); + seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); +#endif +} + void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; - struct cgroup_base_stat bstat; -#ifdef CONFIG_SCHED_CORE - u64 forceidle_time; -#endif if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); usage = cgrp->bstat.cputime.sum_exec_runtime; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); -#ifdef CONFIG_SCHED_CORE - forceidle_time = cgrp->bstat.forceidle_sum; -#endif cgroup_rstat_flush_release(cgrp); } else { 
- root_cgroup_cputime(&bstat); - usage = bstat.cputime.sum_exec_runtime; - utime = bstat.cputime.utime; - stime = bstat.cputime.stime; -#ifdef CONFIG_SCHED_CORE - forceidle_time = bstat.forceidle_sum; -#endif + /* cgrp->bstat of root is not actually used, reuse it */ + root_cgroup_cputime(&cgrp->bstat); + usage = cgrp->bstat.cputime.sum_exec_runtime; + utime = cgrp->bstat.cputime.utime; + stime = cgrp->bstat.cputime.stime; } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); do_div(stime, NSEC_PER_USEC); -#ifdef CONFIG_SCHED_CORE - do_div(forceidle_time, NSEC_PER_USEC); -#endif seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n", usage, utime, stime); -#ifdef CONFIG_SCHED_CORE - seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); -#endif + cgroup_force_idle_show(seq, &cgrp->bstat); } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 3d2bf1d50a0c..7121f058674a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1894,8 +1894,8 @@ int freeze_secondary_cpus(int primary) cpumask_clear(frozen_cpus); pr_info("Disabling non-boot CPUs ...\n"); - for_each_online_cpu(cpu) { - if (cpu == primary) + for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) { + if (!cpu_online(cpu) || cpu == primary) continue; if (pm_wakeup_pending()) { diff --git a/kernel/exit.c b/kernel/exit.c index f95a2c1338a8..81fcee45d630 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -484,6 +484,8 @@ retry: * Search through everything else, we should not get here often. */ for_each_process(g) { + if (atomic_read(&mm->mm_users) <= 1) + break; if (g->flags & PF_KTHREAD) continue; for_each_thread(g, c) { diff --git a/kernel/fork.c b/kernel/fork.c index 99076dbe27d8..763a042eef9c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -616,12 +616,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) exe_file = get_mm_exe_file(oldmm); RCU_INIT_POINTER(mm->exe_file, exe_file); - /* - * We depend on the oldmm having properly denied write access to the - * exe_file already. - */ - if (exe_file && deny_write_access(exe_file)) - pr_warn_once("deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU @@ -1412,20 +1406,11 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) */ old_exe_file = rcu_dereference_raw(mm->exe_file); - if (new_exe_file) { - /* - * We expect the caller (i.e., sys_execve) to already denied - * write access, so this is unlikely to fail. 
- */ - if (unlikely(deny_write_access(new_exe_file))) - return -EACCES; + if (new_exe_file) get_file(new_exe_file); - } rcu_assign_pointer(mm->exe_file, new_exe_file); - if (old_exe_file) { - allow_write_access(old_exe_file); + if (old_exe_file) fput(old_exe_file); - } return 0; } @@ -1464,9 +1449,6 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) return ret; } - ret = deny_write_access(new_exe_file); - if (ret) - return -EACCES; get_file(new_exe_file); /* set the new file */ @@ -1475,10 +1457,8 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) rcu_assign_pointer(mm->exe_file, new_exe_file); mmap_write_unlock(mm); - if (old_exe_file) { - allow_write_access(old_exe_file); + if (old_exe_file) fput(old_exe_file); - } return 0; } diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 0c17b4c83e1c..117d9d4d3c3b 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1620,5 +1620,6 @@ static struct kunit_suite kcsan_test_suite = { kunit_test_suites(&kcsan_test_suite); +MODULE_DESCRIPTION("KCSAN test suite"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Marco Elver <[email protected]>"); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 415d81e6ce70..de95ec07e477 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -30,6 +30,7 @@ #include <linux/torture.h> #include <linux/reboot.h> +MODULE_DESCRIPTION("torture test facility for locking"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <[email protected]>"); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 25f3cf679b35..bdf0087d6442 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -249,24 +249,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->pid_allocated == init_pids) break; - /* - * Release tasks_rcu_exit_srcu to avoid following deadlock: - * - * 1) TASK A unshare(CLONE_NEWPID) - * 2) TASK A fork() twice -> TASK B (child reaper for new ns) - * and TASK C - * 3) TASK B exits, kills TASK C, waits for TASK A to reap it - * 4) TASK A calls synchronize_rcu_tasks() - * -> synchronize_srcu(tasks_rcu_exit_srcu) - * 5) *DEADLOCK* - * - * It is considered safe to release tasks_rcu_exit_srcu here - * because we assume the current task can not be concurrently - * reaped at this point. - */ - exit_tasks_rcu_stop(); schedule(); - exit_tasks_rcu_start(); } __set_current_state(TASK_RUNNING); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 8db4fedaaa1e..b53a9e8f5904 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -42,6 +42,7 @@ #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <[email protected]>"); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 807fbf6123a7..08bf7c669dd3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -51,6 +51,7 @@ #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. 
McKenney <[email protected]> and Josh Triplett <[email protected]>"); @@ -390,6 +391,7 @@ struct rcu_torture_ops { int extendables; int slow_gps; int no_pi_lock; + int debug_objects; const char *name; }; @@ -577,6 +579,7 @@ static struct rcu_torture_ops rcu_ops = { .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, + .debug_objects = 1, .name = "rcu" }; @@ -747,6 +750,7 @@ static struct rcu_torture_ops srcu_ops = { .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, .name = "srcu" }; @@ -786,6 +790,7 @@ static struct rcu_torture_ops srcud_ops = { .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, .name = "srcud" }; @@ -2626,7 +2631,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); rfcpp = rfp->rcu_fwd_cb_tail; rfp->rcu_fwd_cb_tail = &rfcp->rfc_next; - WRITE_ONCE(*rfcpp, rfcp); + smp_store_release(rfcpp, rfcp); WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1); i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(rfp->n_launders_hist)) @@ -3455,7 +3460,6 @@ rcu_torture_cleanup(void) cur_ops->gp_slow_unregister(NULL); } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD static void rcu_torture_leak_cb(struct rcu_head *rhp) { } @@ -3473,7 +3477,6 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ /* * Verify that double-free causes debug-objects to complain, but only @@ -3482,39 +3485,43 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ static void rcu_test_debug_objects(void) { -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD struct rcu_head rh1; struct rcu_head rh2; + int idx; + + if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) { + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n", + KBUILD_MODNAME, cur_ops->name); + return; + } + + if (WARN_ON_ONCE(cur_ops->debug_objects && + (!cur_ops->call || !cur_ops->cb_barrier))) + return; + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); + pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name); /* Try to queue the rh2 pair of callbacks for the same grace period. */ - preempt_disable(); /* Prevent preemption from interrupting test. */ - rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */ - local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu_hurry(&rh2, rcu_torture_leak_cb); - call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */ + cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + cur_ops->call(&rh2, rcu_torture_leak_cb); + cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ if (rhp) { - call_rcu_hurry(rhp, rcu_torture_leak_cb); - call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ + cur_ops->call(rhp, rcu_torture_leak_cb); + cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. 
*/ } - local_irq_enable(); - rcu_read_unlock(); - preempt_enable(); + cur_ops->readunlock(idx); /* Wait for them all to get done so we can safely return. */ - rcu_barrier(); - pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); + cur_ops->cb_barrier(); + pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); kfree(rhp); -#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); -#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } static void rcutorture_sync(void) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 2c2648a3ad30..f4ea5b1ec068 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -63,6 +63,7 @@ do { \ #define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x) +MODULE_DESCRIPTION("Scalability test for object reference mechanisms"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Joel Fernandes (Google) <[email protected]>"); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5afd5cf494db..549c03336ee9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -277,7 +277,8 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) unsigned long cur_s = READ_ONCE(ssp->srcu_idx); barrier(); - return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); + return cookie == SRCU_GET_STATE_COMPLETED || + ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index bc4b58b0204e..b24db425f16d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -667,7 +667,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)), rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed); - return; /* Caller forgot to stop doing call_srcu()? */ + return; // Caller forgot to stop doing call_srcu()? + // Or caller invoked start_poll_synchronize_srcu() + // and then cleanup_srcu_struct() before that grace + // period ended? } kfree(sup->node); sup->node = NULL; @@ -845,7 +848,6 @@ static void srcu_gp_end(struct srcu_struct *ssp) bool cbs; bool last_lvl; int cpu; - unsigned long flags; unsigned long gpseq; int idx; unsigned long mask; @@ -907,12 +909,12 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (!(gpseq & counter_wrap_check)) for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irq_rcu_node(sdp); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irq_rcu_node(sdp); } /* Callback initiation done, allow grace periods after next. 
*/ @@ -1540,7 +1542,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) + if (cookie != SRCU_GET_STATE_COMPLETED && + !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 6c2bd9001adc..da60a9947c00 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) * we are called at early boot time but this shouldn't happen. */ } - WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1); + rsp->gp_count++; spin_unlock_irq(&rsp->rss_lock); if (gp_state == GP_IDLE) { @@ -151,15 +151,11 @@ void rcu_sync_enter(struct rcu_sync *rsp) */ void rcu_sync_exit(struct rcu_sync *rsp) { - int gpc; - WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); - WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); spin_lock_irq(&rsp->rss_lock); - gpc = rsp->gp_count - 1; - WRITE_ONCE(rsp->gp_count, gpc); - if (!gpc) { + WARN_ON_ONCE(rsp->gp_count == 0); + if (!--rsp->gp_count) { if (rsp->gp_state == GP_PASSED) { WRITE_ONCE(rsp->gp_state, GP_EXIT); rcu_sync_call(rsp); @@ -178,10 +174,10 @@ void rcu_sync_dtor(struct rcu_sync *rsp) { int gp_state; - WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); + WARN_ON_ONCE(rsp->gp_count); if (rsp->gp_state == GP_REPLAY) WRITE_ONCE(rsp->gp_state, GP_EXIT); gp_state = rsp->gp_state; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index e1bf33018e6d..ba3440a45b6d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -858,7 +858,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // not know to synchronize with this RCU Tasks grace period) have // completed exiting. The synchronize_rcu() in rcu_tasks_postgp() // will take care of any tasks stuck in the non-preemptible region -// of do_exit() following its call to exit_tasks_rcu_stop(). +// of do_exit() following its call to exit_tasks_rcu_finish(). // check_all_holdout_tasks(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the @@ -1220,7 +1220,7 @@ void exit_tasks_rcu_start(void) * Remove the task from the "yet another list" because do_exit() is now * non-preemptible, allowing synchronize_rcu() to wait beyond this point. */ -void exit_tasks_rcu_stop(void) +void exit_tasks_rcu_finish(void) { unsigned long flags; struct rcu_tasks_percpu *rtpcp; @@ -1231,22 +1231,12 @@ void exit_tasks_rcu_stop(void) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); list_del_init(&t->rcu_tasks_exit_list); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); -} -/* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. 
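Both poll_state_synchronize_srcu() variants above now short-circuit on SRCU_GET_STATE_COMPLETED while keeping wrap-safe cookie comparisons. The sketch below shows that test in user-space C; the completed-marker value used here is a stand-in, not the kernel's actual definition, and ulong_cmp_ge() mirrors the ULONG_CMP_GE() signed-difference idiom.

        #include <limits.h>
        #include <stdbool.h>
        #include <stdio.h>

        #define GET_STATE_COMPLETED ULONG_MAX   /* stand-in marker for the sketch */

        static bool ulong_cmp_ge(unsigned long a, unsigned long b)
        {
                return (long)(a - b) >= 0;      /* stays correct across counter wrap */
        }

        static bool poll_done(unsigned long cur, unsigned long cookie)
        {
                if (cookie == GET_STATE_COMPLETED)      /* "already done" always polls true */
                        return true;
                return ulong_cmp_ge(cur, cookie) || !ulong_cmp_ge(cur, cookie - 3);
        }

        int main(void)
        {
                printf("%d %d %d\n",
                       poll_done(10, 8),                /* counter passed the cookie */
                       poll_done(2, ULONG_MAX - 1),     /* passed it across a wrap */
                       poll_done(8, 10));               /* not yet */
                return 0;
        }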
- */ -void exit_tasks_rcu_finish(void) -{ - exit_tasks_rcu_stop(); - exit_tasks_rcu_finish_trace(current); + exit_tasks_rcu_finish_trace(t); } #else /* #ifdef CONFIG_TASKS_RCU */ void exit_tasks_rcu_start(void) { } -void exit_tasks_rcu_stop(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ @@ -1757,6 +1747,16 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) // allow safe access to the hop list. for_each_online_cpu(cpu) { rcu_read_lock(); + // Note that cpu_curr_snapshot() picks up the target + // CPU's current task while its runqueue is locked with + // an smp_mb__after_spinlock(). This ensures that either + // the grace-period kthread will see that task's read-side + // critical section or the task will see the updater's pre-GP + // accesses. The trailing smp_mb() in cpu_curr_snapshot() + // does not currently play a role other than simplify + // that function's ordering semantics. If these simplified + // ordering semantics continue to be redundant, that smp_mb() + // might be removed. t = cpu_curr_snapshot(cpu); if (rcu_tasks_trace_pertask_prep(t, true)) trc_add_holdout(t, hop); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 28c7031711a3..e641cc681901 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -96,6 +96,7 @@ static struct rcu_state rcu_state = { .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, rcu_sr_normal_gp_cleanup_work), + .srs_cleanups_pending = ATOMIC_INIT(0), }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -175,6 +176,9 @@ static int gp_init_delay; module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +static int nohz_full_patience_delay; +module_param(nohz_full_patience_delay, int, 0444); +static int nohz_full_patience_delay_jiffies; // Add delay to rcu_read_unlock() for strict grace periods. static int rcu_unlock_delay; @@ -296,16 +300,6 @@ static void rcu_dynticks_eqs_online(void) } /* - * Snapshot the ->dynticks counter with full ordering so as to allow - * stable comparison of this counter with past and future snapshots. - */ -static int rcu_dynticks_snap(int cpu) -{ - smp_mb(); // Fundamental RCU ordering guarantee. - return ct_dynticks_cpu_acquire(cpu); -} - -/* * Return true if the snapshot returned from rcu_dynticks_snap() * indicates that RCU is in an extended quiescent state. */ @@ -321,7 +315,15 @@ static bool rcu_dynticks_in_eqs(int snap) */ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) { - return snap != rcu_dynticks_snap(rdp->cpu); + /* + * The first failing snapshot is already ordered against the accesses + * performed by the remote CPU after it exits idle. + * + * The second snapshot therefore only needs to order against accesses + * performed by the remote CPU prior to entering idle and therefore can + * rely solely on acquire semantics. 
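The comment added above argues that the dynticks re-check only needs an acquire load: a changed counter value already proves the remote CPU went through its idle transition, so no full barrier is required on the second read. Here is a user-space C11 sketch of that snapshot-and-recheck idiom, using a toy counter rather than the kernel's context-tracking state.

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdio.h>

        static _Atomic int dynticks;    /* toy counter, bumped on each idle transition */

        static int snapshot(void)
        {
                return atomic_load_explicit(&dynticks, memory_order_acquire);
        }

        static bool changed_since(int snap)
        {
                /* Acquire pairs with the release done by the "remote CPU" below. */
                return atomic_load_explicit(&dynticks, memory_order_acquire) != snap;
        }

        int main(void)
        {
                int snap = snapshot();

                /* Another thread entering or leaving idle would do a release increment: */
                atomic_fetch_add_explicit(&dynticks, 1, memory_order_release);

                printf("quiescent state observed: %s\n", changed_since(snap) ? "yes" : "no");
                return 0;
        }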
+ */ + return snap != ct_dynticks_cpu_acquire(rdp->cpu); } /* @@ -769,7 +771,18 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu); + /* + * Full ordering between remote CPU's post idle accesses and updater's + * accesses prior to current GP (and also the started GP sequence number) + * is enforced by rcu_seq_start() implicit barrier and even further by + * smp_mb__after_unlock_lock() barriers chained all the way throughout the + * rnp locking tree since rcu_gp_init() and up to the current leaf rnp + * locking. + * + * Ordering between remote CPU's pre idle accesses and post grace period + * updater's accesses is enforced by the below acquire semantic. + */ + rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); @@ -1660,6 +1673,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) rcu_sr_put_wait_head(rcu); } + + /* Order list manipulations with atomic access. */ + atomic_dec_return_release(&rcu_state.srs_cleanups_pending); } /* @@ -1667,7 +1683,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) */ static void rcu_sr_normal_gp_cleanup(void) { - struct llist_node *wait_tail, *next, *rcu; + struct llist_node *wait_tail, *next = NULL, *rcu = NULL; int done = 0; wait_tail = rcu_state.srs_wait_tail; @@ -1693,16 +1709,34 @@ static void rcu_sr_normal_gp_cleanup(void) break; } - // concurrent sr_normal_gp_cleanup work might observe this update. - smp_store_release(&rcu_state.srs_done_tail, wait_tail); + /* + * Fast path, no more users to process except putting the second last + * wait head if no inflight-workers. If there are in-flight workers, + * they will remove the last wait head. + * + * Note that the ACQUIRE orders atomic access with list manipulation. + */ + if (wait_tail->next && wait_tail->next->next == NULL && + rcu_sr_is_wait_head(wait_tail->next) && + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) { + rcu_sr_put_wait_head(wait_tail->next); + wait_tail->next = NULL; + } + + /* Concurrent sr_normal_gp_cleanup work might observe this update. */ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail); + smp_store_release(&rcu_state.srs_done_tail, wait_tail); /* * We schedule a work in order to perform a final processing * of outstanding users(if still left) and releasing wait-heads * added by rcu_sr_normal_gp_init() call. */ - queue_work(sync_wq, &rcu_state.srs_cleanup_work); + if (wait_tail->next) { + atomic_inc(&rcu_state.srs_cleanups_pending); + if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work)) + atomic_dec(&rcu_state.srs_cleanups_pending); + } } /* @@ -1810,7 +1844,7 @@ static noinline_for_stack bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { - local_irq_save(flags); + local_irq_disable(); arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1818,7 +1852,7 @@ static noinline_for_stack bool rcu_gp_init(void) /* Nothing to do on this leaf rcu_node structure. 
*/ raw_spin_unlock_rcu_node(rnp); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + local_irq_enable(); continue; } @@ -1855,7 +1889,7 @@ static noinline_for_stack bool rcu_gp_init(void) raw_spin_unlock_rcu_node(rnp); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + local_irq_enable(); } rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ @@ -4313,11 +4347,15 @@ static int rcu_pending(int user) return 1; /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ - if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu()) + gp_in_progress = rcu_gp_in_progress(); + if ((user || rcu_is_cpu_rrupt_from_idle() || + (gp_in_progress && + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + + nohz_full_patience_delay_jiffies))) && + rcu_nohz_full_cpu()) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - gp_in_progress = rcu_gp_in_progress(); if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress) return 1; @@ -4767,7 +4805,7 @@ rcu_boot_init_percpu_data(int cpu) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(ct->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu))); + WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; @@ -5110,11 +5148,15 @@ void rcutree_migrate_callbacks(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); bool needwake; - if (rcu_rdp_is_offloaded(rdp) || - rcu_segcblist_empty(&rdp->cblist)) - return; /* No callbacks to migrate. */ + if (rcu_rdp_is_offloaded(rdp)) + return; raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + if (rcu_segcblist_empty(&rdp->cblist)) { + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + return; /* No callbacks to migrate. */ + } + WARN_ON_ONCE(rcu_rdp_cpu_online(rdp)); rcu_barrier_entrain(rdp); my_rdp = this_cpu_ptr(&rcu_data); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bae7925c497f..fcf2b4aa3441 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -223,7 +223,6 @@ struct rcu_data { struct swait_queue_head nocb_state_wq; /* For offloading state changes */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ - atomic_t nocb_lock_contended; /* Contention experienced. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ @@ -420,6 +419,7 @@ struct rcu_state { struct llist_node *srs_done_tail; /* ready for GP users. */ struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; struct work_struct srs_cleanup_work; + atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ }; /* Values for rcu_state structure's gp_flags field. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8a1d9c8bd9f7..4acd29d16fdb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -265,7 +265,12 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - smp_mb(); /* Ensure test happens before caller kfree(). */ + /* + * Order GP completion with preceding accesses. Order also GP + * completion with post GP update side accesses. Pairs with + * rcu_seq_end(). 
+ */ + smp_mb(); return true; } return false; @@ -357,7 +362,21 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) !(rnp->qsmaskinitnext & mask)) { mask_ofl_test |= mask; } else { - snap = rcu_dynticks_snap(cpu); + /* + * Full ordering between remote CPU's post idle accesses + * and updater's accesses prior to current GP (and also + * the started GP sequence number) is enforced by + * rcu_seq_start() implicit barrier, relayed by kworkers + * locking and even further by smp_mb__after_unlock_lock() + * barriers chained all the way throughout the rnp locking + * tree since sync_exp_reset_tree() and up to the current + * leaf rnp locking. + * + * Ordering between remote CPU's pre idle accesses and + * post grace period updater's accesses is enforced by the + * below acquire semantic. + */ + snap = ct_dynticks_cpu_acquire(cpu); if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; else @@ -953,7 +972,6 @@ void synchronize_rcu_expedited(void) rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3f85577bddd4..3ce30841119a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -91,8 +91,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0); /* * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the - * lock isn't immediately available, increment ->nocb_lock_contended to - * flag the contention. + * lock isn't immediately available, perform minimal sanity check. */ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) __acquires(&rdp->nocb_bypass_lock) @@ -100,29 +99,12 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) lockdep_assert_irqs_disabled(); if (raw_spin_trylock(&rdp->nocb_bypass_lock)) return; - atomic_inc(&rdp->nocb_lock_contended); + /* + * Contention expected only when local enqueue collide with + * remote flush from kthreads. + */ WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - smp_mb__after_atomic(); /* atomic_inc() before lock. */ raw_spin_lock(&rdp->nocb_bypass_lock); - smp_mb__before_atomic(); /* atomic_dec() after lock. */ - atomic_dec(&rdp->nocb_lock_contended); -} - -/* - * Spinwait until the specified rcu_data structure's ->nocb_lock is - * not contended. Please note that this is extremely special-purpose, - * relying on the fact that at most two kthreads and one CPU contend for - * this lock, and also that the two kthreads are guaranteed to have frequent - * grace-period-duration time intervals between successive acquisitions - * of the lock. This allows us to use an extremely simple throttling - * mechanism, and further to apply it only to the CPU doing floods of - * call_rcu() invocations. Don't try this at home! - */ -static void rcu_nocb_wait_contended(struct rcu_data *rdp) -{ - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) - cpu_relax(); } /* @@ -510,7 +492,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } // We need to use the bypass. - rcu_nocb_wait_contended(rdp); rcu_nocb_bypass_lock(rdp); ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. 
*/ @@ -635,8 +616,7 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, } } -static int nocb_gp_toggle_rdp(struct rcu_data *rdp, - bool *wake_state) +static int nocb_gp_toggle_rdp(struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; @@ -650,8 +630,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp, * We will handle this rdp until it ever gets de-offloaded. */ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *wake_state = true; ret = 1; } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { @@ -660,8 +638,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp, * We will ignore this rdp until it ever gets re-offloaded. */ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *wake_state = true; ret = 0; } else { WARN_ON_ONCE(1); @@ -877,16 +853,15 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (rdp_toggling) { - bool wake_state = false; int ret; - ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state); + ret = nocb_gp_toggle_rdp(rdp_toggling); if (ret == 1) list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); else if (ret == 0) list_del(&rdp_toggling->nocb_entry_rdp); - if (wake_state) - swake_up_one(&rdp_toggling->nocb_state_wq); + + swake_up_one(&rdp_toggling->nocb_state_wq); } my_rdp->nocb_gp_seq = -1; @@ -913,16 +888,9 @@ static int rcu_nocb_gp_kthread(void *arg) return 0; } -static inline bool nocb_cb_can_run(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) { - return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); + return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park(); } /* @@ -934,21 +902,19 @@ static void nocb_cb_wait(struct rcu_data *rdp) struct rcu_segcblist *cblist = &rdp->cblist; unsigned long cur_gp_seq; unsigned long flags; - bool needwake_state = false; bool needwake_gp = false; - bool can_sleep = true; struct rcu_node *rnp = rdp->mynode; - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - if (READ_ONCE(rdp->nocb_cb_sleep)) { - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } - } while (!nocb_cb_can_run(rdp)); + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + if (kthread_should_park()) { + kthread_parkme(); + } else if (READ_ONCE(rdp->nocb_cb_sleep)) { + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); local_irq_save(flags); rcu_momentary_dyntick_idle(); @@ -971,37 +937,16 @@ static void nocb_cb_wait(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. 
*/ } - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - if (rcu_segcblist_ready_cbs(cblist)) - can_sleep = false; + if (!rcu_segcblist_ready_cbs(cblist)) { + WRITE_ONCE(rdp->nocb_cb_sleep, true); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); } else { - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We won't touch the callbacks and keep sleeping until we ever - * get re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; + WRITE_ONCE(rdp->nocb_cb_sleep, false); } - WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); - - if (rdp->nocb_cb_sleep) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); - rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); - - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); } /* @@ -1094,17 +1039,8 @@ static int rdp_offload_toggle(struct rcu_data *rdp, bool wake_gp = false; rcu_segcblist_offload(cblist, offload); - - if (rdp->nocb_cb_sleep) - rdp->nocb_cb_sleep = false; rcu_nocb_unlock_irqrestore(rdp, flags); - /* - * Ignore former value of nocb_cb_sleep and force wake up as it could - * have been spuriously set to false already. - */ - swake_up_one(&rdp->nocb_cb_wq); - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); // Queue this rdp for add/del to/from the list to iterate on rcuog WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp); @@ -1161,19 +1097,11 @@ static long rcu_nocb_rdp_deoffload(void *arg) if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); - /* - * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB. - * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog. - */ - if (!rdp->nocb_cb_kthread) { - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_nocb_unlock_irqrestore(rdp, flags); - } - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, - SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + !rcu_segcblist_test_flags(cblist, + SEGCBLIST_KTHREAD_GP)); + if (rdp->nocb_cb_kthread) + kthread_park(rdp->nocb_cb_kthread); } else { /* * No kthread to clear the flags for us or remove the rdp from the nocb list @@ -1181,8 +1109,7 @@ static long rcu_nocb_rdp_deoffload(void *arg) * but we stick to paranoia in this rare path. 
*/ rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, - SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); rcu_nocb_unlock_irqrestore(rdp, flags); list_del(&rdp->nocb_entry_rdp); @@ -1282,8 +1209,10 @@ static long rcu_nocb_rdp_offload(void *arg) wake_gp = rdp_offload_toggle(rdp, true, flags); if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); + + kthread_unpark(rdp->nocb_cb_kthread); + swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); /* @@ -1468,7 +1397,7 @@ void __init rcu_init_nohz(void) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); } rcu_organize_nocb_kthreads(); @@ -1526,11 +1455,16 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_cb_kthread, rdp, - "rcuo%c/%d", rcu_state.abbr, cpu); + t = kthread_create(rcu_nocb_cb_kthread, rdp, + "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) goto end; + if (rcu_rdp_is_offloaded(rdp)) + wake_up_process(t); + else + kthread_park(t); + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); @@ -1678,12 +1612,11 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? 
nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], - "cC"[!!atomic_read(&rdp->nocb_lock_contended)], "lL"[raw_spin_is_locked(&rdp->nocb_lock)], "sS"[!!rdp->nocb_cb_sleep], ".W"[swait_active(&rdp->nocb_cb_wq)], diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 340bbefe5f65..c569da65b421 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,8 +28,8 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || rcu_lockdep_is_held_nocb(rdp) || - (rdp == this_cpu_ptr(&rcu_data) && - !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || + (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && + rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), "Unsafe read of RCU_NOCB offloaded state" ); @@ -93,6 +93,16 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); + if (nohz_full_patience_delay < 0) { + pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay); + nohz_full_patience_delay = 0; + } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) { + pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC); + nohz_full_patience_delay = 5 * MSEC_PER_SEC; + } else if (nohz_full_patience_delay) { + pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay); + } + nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay); if (!use_softirq) pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 460efecd077b..4b0e9d7c4c68 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -501,7 +501,7 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); + rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) // Print signed value, as negative values indicate a probable bug. @@ -515,7 +515,7 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - rcu_dynticks_snap(cpu) & 0xffff, + ct_dynticks_cpu(cpu) & 0xffff, ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 59032aaccd18..44e83a646264 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -43,6 +43,7 @@ #define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x) +MODULE_DESCRIPTION("Torture tests on the smp_call_function() family of primitives"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. 
McKenney <[email protected]>"); @@ -67,7 +68,7 @@ torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operation torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); -char *torture_type = ""; +static char *torture_type = ""; #ifdef MODULE # define SCFTORT_SHUTDOWN 0 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bcf2c4cc0522..35a35e36024b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -723,7 +723,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; - psi_account_irqtime(rq->curr, irq_delta); delayacct_irq(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING @@ -4467,12 +4466,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) * @cpu: The CPU on which to snapshot the task. * * Returns the task_struct pointer of the task "currently" running on - * the specified CPU. If the same task is running on that CPU throughout, - * the return value will be a pointer to that task's task_struct structure. - * If the CPU did any context switches even vaguely concurrently with the - * execution of this function, the return value will be a pointer to the - * task_struct structure of a randomly chosen task that was running on - * that CPU somewhere around the time that this function was executing. + * the specified CPU. * * If the specified CPU was offline, the return value is whatever it * is, perhaps a pointer to the task_struct structure of that CPU's idle @@ -4486,11 +4480,16 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) */ struct task_struct *cpu_curr_snapshot(int cpu) { + struct rq *rq = cpu_rq(cpu); struct task_struct *t; + struct rq_flags rf; - smp_mb(); /* Pairing determined by caller's synchronization design. */ + rq_lock_irqsave(rq, &rf); + smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */ t = rcu_dereference(cpu_curr(cpu)); + rq_unlock_irqrestore(rq, &rf); smp_mb(); /* Pairing determined by caller's synchronization design. */ + return t; } @@ -5665,7 +5664,7 @@ void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; + struct task_struct *curr; struct rq_flags rf; unsigned long hw_pressure; u64 resched_latency; @@ -5677,6 +5676,9 @@ void sched_tick(void) rq_lock(rq, &rf); + curr = rq->curr; + psi_account_irqtime(rq, curr, NULL); + update_rq_clock(rq); hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); @@ -6737,6 +6739,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) ++*switch_count; migrate_disable_switch(rq, prev); + psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c75d1307d86d..9bedd148f007 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1804,8 +1804,13 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * The replenish timer needs to be canceled. No * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). + * + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). 
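The comment just above explains the enqueue_task_dl() change that follows: the task reference taken when the deadline timer was armed must be dropped here only when hrtimer_try_to_cancel() returns 1 (a queued timer was actually removed); a return of -1 means the callback is running and will drop the reference itself. Below is a minimal userspace sketch of that rule; fake_task, fake_try_to_cancel() and the refcount helpers are hypothetical stand-ins, and the !dl_server() special case from the hunk is omitted.

    /* Userspace sketch, not kernel code: drop the timer's task reference
     * only when the cancel really removed a queued timer. */
    #include <stdio.h>

    struct fake_task { int refcount; };

    static void get_task(struct fake_task *t) { t->refcount++; }
    static void put_task(struct fake_task *t)
    {
            if (--t->refcount == 0)
                    printf("task freed\n");
    }

    /* Return codes mirror hrtimer_try_to_cancel():
     *  1: timer was queued and is now cancelled
     *  0: timer was not active at all
     * -1: callback is running and will do its own put_task() */
    static int fake_try_to_cancel(int timer_state) { return timer_state; }

    int main(void)
    {
            struct fake_task task = { .refcount = 1 };

            get_task(&task);                /* reference taken when the timer was armed */
            if (fake_try_to_cancel(1) == 1)
                    put_task(&task);        /* we cancelled it, so we drop the timer's ref */
            /* for -1 the (simulated) callback would call put_task() itself */

            put_task(&task);                /* last reference */
            return 0;
    }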
*/ - hrtimer_try_to_cancel(&p->dl.dl_timer); + if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 && + !dl_server(&p->dl)) + put_task_struct(p); p->dl.dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8a5b1ae0aa55..24dda708b699 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9149,12 +9149,8 @@ static int detach_tasks(struct lb_env *env) break; env->loop++; - /* - * We've more or less seen every task there is, call it quits - * unless we haven't found any movable task yet. - */ - if (env->loop > env->loop_max && - !(env->flags & LBF_ALL_PINNED)) + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) break; /* take a breather every nr_migrate tasks */ @@ -11393,9 +11389,7 @@ more_balance: if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; - /* Stop if we tried all running tasks */ - if (env.loop < busiest->nr_running) - goto more_balance; + goto more_balance; } /* diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7b4aa5809c0f..507d7b8d79af 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -773,6 +773,7 @@ static void psi_group_change(struct psi_group *group, int cpu, enum psi_states s; u32 state_mask; + lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* @@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } #ifdef CONFIG_IRQ_TIME_ACCOUNTING -void psi_account_irqtime(struct task_struct *task, u32 delta) +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { - int cpu = task_cpu(task); + int cpu = task_cpu(curr); struct psi_group *group; struct psi_group_cpu *groupc; - u64 now; + u64 now, irq; + s64 delta; if (static_branch_likely(&psi_disabled)) return; - if (!task->pid) + if (!curr->pid) + return; + + lockdep_assert_rq_held(rq); + group = task_psi_group(curr); + if (prev && task_psi_group(prev) == group) return; now = cpu_clock(cpu); + irq = irq_time_read(cpu); + delta = (s64)(irq - rq->psi_irq_time); + if (delta < 0) + return; + rq->psi_irq_time = irq; - group = task_psi_group(task); do { if (!group->enabled) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a831af102070..ef20c61004eb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1126,6 +1126,7 @@ struct rq { #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; + u64 psi_irq_time; #endif #ifdef CONFIG_PARAVIRT u64 prev_steal_time; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 38f3698f5e5b..b02dfc322951 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se) void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); -void psi_account_irqtime(struct task_struct *task, u32 delta); - +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); +#else +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} +#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. 
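The psi_account_irqtime() rework above no longer receives a precomputed delta; it diffs a cumulative per-CPU IRQ-time counter against the last value cached in the new rq->psi_irq_time field and bails out on a negative difference. A hedged userspace model of just that bookkeeping, with plain variables standing in for irq_time_read() and the rq field:

    /* Userspace sketch of cumulative-counter delta accounting. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t cumulative_irq_ns;      /* stands in for irq_time_read(cpu) */
    static uint64_t last_seen_irq_ns;       /* stands in for rq->psi_irq_time */

    static void account_irqtime(void)
    {
            uint64_t irq = cumulative_irq_ns;
            int64_t delta = (int64_t)(irq - last_seen_irq_ns);

            if (delta < 0)                  /* nothing new to charge: bail out */
                    return;
            last_seen_irq_ns = irq;
            printf("charge %lld ns of IRQ time to PSI\n", (long long)delta);
    }

    int main(void)
    {
            cumulative_irq_ns = 1000;
            account_irqtime();              /* charges 1000 ns */
            cumulative_irq_ns += 250;
            account_irqtime();              /* charges 250 ns */
            return 0;
    }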
As a result, it has to distinguish between sleeps, @@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} -static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO diff --git a/kernel/signal.c b/kernel/signal.c index 1f9dd41c04be..60c737e423a1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2600,6 +2600,14 @@ static void do_freezer_trap(void) spin_unlock_irq(¤t->sighand->siglock); cgroup_enter_frozen(); schedule(); + + /* + * We could've been woken by task_work, run it to clear + * TIF_NOTIFY_SIGNAL. The caller will retry if necessary. + */ + clear_notify_signal(); + if (unlikely(task_work_pending(current))) + task_work_run(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) diff --git a/kernel/smp.c b/kernel/smp.c index f085ebcdf9e7..aaffecdad319 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -25,6 +25,7 @@ #include <linux/nmi.h> #include <linux/sched/debug.h> #include <linux/jump_label.h> +#include <linux/string_choices.h> #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS @@ -982,8 +983,7 @@ void __init smp_init(void) num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); pr_info("Brought up %d node%s, %d CPU%s\n", - num_nodes, (num_nodes > 1 ? "s" : ""), - num_cpus, (num_cpus > 1 ? "s" : "")); + num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus)); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); @@ -1119,6 +1119,7 @@ int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) queue_work_on(cpu, system_wq, &sscs.work); wait_for_completion(&sscs.done); + destroy_work_on_stack(&sscs.work); return sscs.ret; } diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index d06185e054ea..62e73444ffe4 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -22,6 +22,7 @@ #include "tick-internal.h" MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Clocksource watchdog unit test"); MODULE_AUTHOR("Paul E. McKenney <[email protected]>"); static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index 20d5df631570..783f2297111b 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -155,5 +155,6 @@ static void __exit udelay_test_exit(void) module_exit(udelay_test_exit); +MODULE_DESCRIPTION("udelay test module"); MODULE_AUTHOR("David Riley <[email protected]>"); MODULE_LICENSE("GPL"); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 771d1e040303..b4843099a8da 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1141,6 +1141,7 @@ void tick_broadcast_switch_to_oneshot(void) #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *bc; unsigned long flags; @@ -1148,6 +1149,28 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) bc = tick_broadcast_device.evtdev; if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* + * If the broadcast force bit of the current CPU is set, + * then the current CPU has not yet reprogrammed the local + * timer device to avoid a ping-pong race. 
See + * ___tick_broadcast_oneshot_control(). + * + * If the broadcast device is hrtimer based then + * programming the broadcast event below does not have any + * effect because the local clockevent device is not + * running and not programmed because the broadcast event + * is not earlier than the pending event of the local clock + * event device. As a consequence all CPUs waiting for a + * broadcast event are stuck forever. + * + * Detect this condition and reprogram the cpu local timer + * device to avoid the starvation. + */ + if (tick_check_broadcast_expired()) { + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); + tick_program_event(td->evtdev->next_event, 1); + } + /* This moves the broadcast assignment to this CPU: */ clockevents_program_event(bc, bc->next_event, 1); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 71a792cd8936..753a184c7090 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1026,10 +1026,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; - WARN_ON_ONCE(1); - printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", - basemono, ts->next_tick, dev->next_event, - hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); + WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " + "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, + dev->next_event, hrtimer_active(&ts->sched_timer), + hrtimer_get_expires(&ts->sched_timer)); } /* @@ -1385,20 +1385,6 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) return ts->idle_calls; } -/** - * tick_nohz_get_idle_calls - return the current idle calls counter value - * - * Called from the schedutil frequency scaling governor in scheduler context. - * - * Return: the current idle calls counter value for the current CPU - */ -unsigned long tick_nohz_get_idle_calls(void) -{ - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - - return ts->idle_calls; -} - static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c index 3e5d422dd15c..2889763165e5 100644 --- a/kernel/time/time_test.c +++ b/kernel/time/time_test.c @@ -96,4 +96,5 @@ static struct kunit_suite time_test_suite = { }; kunit_test_suite(time_test_suite); +MODULE_DESCRIPTION("time unit test suite"); MODULE_LICENSE("GPL"); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4e18db1819f8..2fa87dcfeda9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1195,6 +1195,108 @@ static bool timestamp_in_interval(u64 start, u64 end, u64 ts) return false; } +static bool convert_clock(u64 *val, u32 numerator, u32 denominator) +{ + u64 rem, res; + + if (!numerator || !denominator) + return false; + + res = div64_u64_rem(*val, denominator, &rem) * numerator; + *val = res + div_u64(rem * numerator, denominator); + return true; +} + +static bool convert_base_to_cs(struct system_counterval_t *scv) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + u32 num, den; + + /* The timestamp was taken from the time keeper clock source */ + if (cs->id == scv->cs_id) + return true; + + /* + * Check whether cs_id matches the base clock. Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. 
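The convert_clock() helper introduced above scales a 64-bit counter by numerator/denominator without forming the full 128-bit product: it converts the quotient and the remainder separately. The identity it relies on can be checked in plain userspace C; the kernel's div64_u64_rem()/div_u64() helpers are replaced by ordinary operators here.

    /* (val / den) * num + ((val % den) * num) / den == floor(val * num / den),
     * but without the possibly overflowing val * num product.  The remainder
     * term stays below 2^64 because remainder and numerator both fit in 32 bits. */
    #include <inttypes.h>
    #include <stdio.h>

    static uint64_t convert_clock(uint64_t val, uint32_t num, uint32_t den)
    {
            uint64_t q = val / den, r = val % den;

            return q * num + (r * num) / den;
    }

    int main(void)
    {
            /* e.g. scale 19.2 MHz cycles by 1000000/19200 */
            printf("%" PRIu64 "\n", convert_clock(123456789ULL, 1000000, 19200));

            /* small case that is easy to check by hand: floor(10 * 3 / 4) = 7 */
            printf("%" PRIu64 "\n", convert_clock(10, 3, 4));
            return 0;
    }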
+ */ + base = READ_ONCE(cs->base); + if (!base || base->id != scv->cs_id) + return false; + + num = scv->use_nsecs ? cs->freq_khz : base->numerator; + den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; + + if (!convert_clock(&scv->cycles, num, den)) + return false; + + scv->cycles += base->offset; + return true; +} + +static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) +{ + struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; + struct clocksource_base *base; + + /* + * Check whether base_id matches the base clock. Prevent the compiler from + * re-evaluating @base as the clocksource might change concurrently. + */ + base = READ_ONCE(cs->base); + if (!base || base->id != base_id) + return false; + + *cycles -= base->offset; + if (!convert_clock(cycles, base->denominator, base->numerator)) + return false; + return true; +} + +static bool convert_ns_to_cs(u64 *delta) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + + if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) + return false; + + *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); + return true; +} + +/** + * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp + * @treal: CLOCK_REALTIME timestamp to convert + * @base_id: base clocksource id + * @cycles: pointer to store the converted base clock timestamp + * + * Converts a supplied, future realtime clock value to the corresponding base clock value. + * + * Return: true if the conversion is successful, false otherwise. + */ +bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u64 delta; + + do { + seq = read_seqcount_begin(&tk_core.seq); + if ((u64)treal < tk->tkr_mono.base_real) + return false; + delta = (u64)treal - tk->tkr_mono.base_real; + if (!convert_ns_to_cs(&delta)) + return false; + *cycles = tk->tkr_mono.cycle_last + delta; + if (!convert_cs_to_base(cycles, base_id)) + return false; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return true; +} +EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); + /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and @@ -1241,7 +1343,7 @@ int get_device_system_crosststamp(int (*get_time_fn) * installed timekeeper clocksource */ if (system_counterval.cs_id == CSID_GENERIC || - tk->tkr_mono.clock->id != system_counterval.cs_id) + !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; @@ -1307,6 +1409,30 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** + * timekeeping_clocksource_has_base - Check whether the current clocksource + * is based on given a base clock + * @id: base clocksource ID + * + * Note: The return value is a snapshot which can become invalid right + * after the function returns. + * + * Return: true if the timekeeper clocksource has a base clock with @id, + * false otherwise + */ +bool timekeeping_clocksource_has_base(enum clocksource_ids id) +{ + /* + * This is a snapshot, so no point in using the sequence + * count. Just prevent the compiler from re-evaluating @base as the + * clocksource might change concurrently. + */ + struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); + + return base ? 
base->id == id : false; +} +EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); + +/** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * @@ -2421,6 +2547,7 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback); /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters */ int do_adjtimex(struct __kernel_timex *txc) { @@ -2489,6 +2616,8 @@ int do_adjtimex(struct __kernel_timex *txc) #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function + * @phase_ts: Pointer to timespec64 structure representing phase timestamp + * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { diff --git a/kernel/torture.c b/kernel/torture.c index c72ab2d251f4..dede150aef01 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -40,6 +40,7 @@ #include <linux/sched/rt.h> #include "rcu/rcu.h" +MODULE_DESCRIPTION("Common functions for in-kernel torture tests"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <[email protected]>"); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3fbaecfc88c2..dfd42c28e404 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -216,8 +216,6 @@ struct worker_pool { struct worker *manager; /* L: purely informational */ struct list_head workers; /* A: attached workers */ - struct list_head dying_workers; /* A: workers about to die */ - struct completion *detach_completion; /* all workers detached */ struct ida worker_ida; /* worker IDs for task name */ @@ -436,7 +434,7 @@ static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ -static struct workqueue_attrs *wq_update_pod_attrs_buf; +static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ @@ -447,6 +445,9 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ +/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */ +static cpumask_var_t wq_online_cpumask; + /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; @@ -1684,33 +1685,6 @@ static void __pwq_activate_work(struct pool_workqueue *pwq, __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); } -/** - * pwq_activate_work - Activate a work item if inactive - * @pwq: pool_workqueue @work belongs to - * @work: work item to activate - * - * Returns %true if activated. %false if already active. 
- */ -static bool pwq_activate_work(struct pool_workqueue *pwq, - struct work_struct *work) -{ - struct worker_pool *pool = pwq->pool; - struct wq_node_nr_active *nna; - - lockdep_assert_held(&pool->lock); - - if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE)) - return false; - - nna = wq_node_nr_active(pwq->wq, pool->node); - if (nna) - atomic_inc(&nna->nr); - - pwq->nr_active++; - __pwq_activate_work(pwq, work); - return true; -} - static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) { int max = READ_ONCE(nna->max); @@ -2117,7 +2091,7 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, */ pwq = get_work_pwq(work); if (pwq && pwq->pool == pool) { - unsigned long work_data; + unsigned long work_data = *work_data_bits(work); debug_work_deactivate(work); @@ -2126,13 +2100,17 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, * pwq->inactive_works since a queued barrier can't be * canceled (see the comments in insert_wq_barrier()). * - * An inactive work item cannot be grabbed directly because + * An inactive work item cannot be deleted directly because * it might have linked barrier work items which, if left * on the inactive_works list, will confuse pwq->nr_active - * management later on and cause stall. Make sure the work - * item is activated before grabbing. + * management later on and cause stall. Move the linked + * barrier work items to the worklist when deleting the grabbed + * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that + * it doesn't participate in nr_active management in later + * pwq_dec_nr_in_flight(). */ - pwq_activate_work(pwq, work); + if (work_data & WORK_STRUCT_INACTIVE) + move_linked_works(work, &pwq->pool->worklist, NULL); list_del_init(&work->entry); @@ -2140,7 +2118,6 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, * work->data points to pwq iff queued. Let's point to pool. As * this destroys work->data needed by the next step, stash it. */ - work_data = *work_data_bits(work); set_work_pool_and_keep_pending(work, pool->id, pool_offq_flags(pool)); @@ -2298,9 +2275,13 @@ retry: * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. + * + * For ordered workqueue, work items must be queued on the newest pwq + * for accurate order management. Guaranteed order also guarantees + * non-reentrancy. See the comments above unplug_oldest_pwq(). 
*/ last_pool = get_work_pool(work); - if (last_pool && last_pool != pool) { + if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) { struct worker *worker; raw_spin_lock(&last_pool->lock); @@ -2710,6 +2691,27 @@ static void worker_attach_to_pool(struct worker *worker, mutex_unlock(&wq_pool_attach_mutex); } +static void unbind_worker(struct worker *worker) +{ + lockdep_assert_held(&wq_pool_attach_mutex); + + kthread_set_per_cpu(worker->task, -1); + if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); + else + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); +} + + +static void detach_worker(struct worker *worker) +{ + lockdep_assert_held(&wq_pool_attach_mutex); + + unbind_worker(worker); + list_del(&worker->node); + worker->pool = NULL; +} + /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool @@ -2721,26 +2723,16 @@ static void worker_attach_to_pool(struct worker *worker, static void worker_detach_from_pool(struct worker *worker) { struct worker_pool *pool = worker->pool; - struct completion *detach_completion = NULL; /* there is one permanent BH worker per CPU which should never detach */ WARN_ON_ONCE(pool->flags & POOL_BH); mutex_lock(&wq_pool_attach_mutex); - - kthread_set_per_cpu(worker->task, -1); - list_del(&worker->node); - worker->pool = NULL; - - if (list_empty(&pool->workers) && list_empty(&pool->dying_workers)) - detach_completion = pool->detach_completion; + detach_worker(worker); mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); - - if (detach_completion) - complete(detach_completion); } static int format_worker_id(char *buf, size_t size, struct worker *worker, @@ -2844,35 +2836,22 @@ fail: return NULL; } -static void unbind_worker(struct worker *worker) +static void detach_dying_workers(struct list_head *cull_list) { - lockdep_assert_held(&wq_pool_attach_mutex); + struct worker *worker; - kthread_set_per_cpu(worker->task, -1); - if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); - else - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); + list_for_each_entry(worker, cull_list, entry) + detach_worker(worker); } -static void wake_dying_workers(struct list_head *cull_list) +static void reap_dying_workers(struct list_head *cull_list) { struct worker *worker, *tmp; list_for_each_entry_safe(worker, tmp, cull_list, entry) { list_del_init(&worker->entry); - unbind_worker(worker); - /* - * If the worker was somehow already running, then it had to be - * in pool->idle_list when set_worker_dying() happened or we - * wouldn't have gotten here. - * - * Thus, the worker must either have observed the WORKER_DIE - * flag, or have set its state to TASK_IDLE. Either way, the - * below will be observed by the worker and is safe to do - * outside of pool->lock. 
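set_worker_dying() now takes an extra task_struct reference so that reap_dying_workers(), shown just below, can still call kthread_stop_put() after the worker has detached and possibly exited (kthread_stop_put() is kthread_stop() followed by put_task_struct()). A generic, hedged userspace sketch of that pin-before-handoff pattern; all names here are made up:

    #include <stdio.h>
    #include <stdlib.h>

    struct obj { int refcount; };

    static void obj_get(struct obj *o) { o->refcount++; }
    static void obj_put(struct obj *o)
    {
            if (--o->refcount == 0) {
                    printf("object freed\n");
                    free(o);
            }
    }

    /* "stop" the object and drop the reference handed to us, mirroring
     * kthread_stop_put(). */
    static void stop_put(struct obj *o)
    {
            printf("stopped\n");
            obj_put(o);
    }

    int main(void)
    {
            struct obj *o = malloc(sizeof(*o));

            o->refcount = 1;        /* the worker's own reference */
            obj_get(o);             /* set_worker_dying(): pin for the reaper */
            obj_put(o);             /* the worker exits and drops its own ref */
            stop_put(o);            /* reap_dying_workers(): final user frees it */
            return 0;
    }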
- */ - wake_up_process(worker->task); + kthread_stop_put(worker->task); + kfree(worker); } } @@ -2906,7 +2885,9 @@ static void set_worker_dying(struct worker *worker, struct list_head *list) worker->flags |= WORKER_DIE; list_move(&worker->entry, list); - list_move(&worker->node, &pool->dying_workers); + + /* get an extra task struct reference for later kthread_stop_put() */ + get_task_struct(worker->task); } /** @@ -2965,9 +2946,9 @@ static void idle_cull_fn(struct work_struct *work) /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker - * cannot proceed beyong worker_detach_from_pool() in its self-destruct - * path. This is required as a previously-preempted worker could run after - * set_worker_dying() has happened but before wake_dying_workers() did. + * cannot proceed beyong set_pf_worker() in its self-destruct path. + * This is required as a previously-preempted worker could run after + * set_worker_dying() has happened but before detach_dying_workers() did. */ mutex_lock(&wq_pool_attach_mutex); raw_spin_lock_irq(&pool->lock); @@ -2988,8 +2969,10 @@ static void idle_cull_fn(struct work_struct *work) } raw_spin_unlock_irq(&pool->lock); - wake_dying_workers(&cull_list); + detach_dying_workers(&cull_list); mutex_unlock(&wq_pool_attach_mutex); + + reap_dying_workers(&cull_list); } static void send_mayday(struct work_struct *work) @@ -3368,9 +3351,7 @@ woke_up: set_pf_worker(false); ida_free(&pool->worker_ida, worker->id); - worker_detach_from_pool(worker); WARN_ON_ONCE(!list_empty(&worker->entry)); - kfree(worker); return 0; } @@ -4761,7 +4742,6 @@ static int init_worker_pool(struct worker_pool *pool) timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); INIT_LIST_HEAD(&pool->workers); - INIT_LIST_HEAD(&pool->dying_workers); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); @@ -4903,7 +4883,6 @@ static void rcu_free_pool(struct rcu_head *rcu) */ static void put_unbound_pool(struct worker_pool *pool) { - DECLARE_COMPLETION_ONSTACK(detach_completion); struct worker *worker; LIST_HEAD(cull_list); @@ -4955,14 +4934,11 @@ static void put_unbound_pool(struct worker_pool *pool) WARN_ON(pool->nr_workers || pool->nr_idle); raw_spin_unlock_irq(&pool->lock); - wake_dying_workers(&cull_list); + detach_dying_workers(&cull_list); - if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers)) - pool->detach_completion = &detach_completion; mutex_unlock(&wq_pool_attach_mutex); - if (pool->detach_completion) - wait_for_completion(pool->detach_completion); + reap_dying_workers(&cull_list); /* shut down the timers */ del_timer_sync(&pool->idle_timer); @@ -5038,12 +5014,6 @@ fail: return NULL; } -static void rcu_free_pwq(struct rcu_head *rcu) -{ - kmem_cache_free(pwq_cache, - container_of(rcu, struct pool_workqueue, rcu)); -} - /* * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero * refcnt and needs to be destroyed. 
@@ -5089,7 +5059,7 @@ static void pwq_release_workfn(struct kthread_work *work) raw_spin_unlock_irq(&nna->lock); } - call_rcu(&pwq->rcu, rcu_free_pwq); + kfree_rcu(pwq, rcu); /* * If we're the last pwq going away, @wq is already dead and no one @@ -5161,14 +5131,22 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, return pwq; } +static void apply_wqattrs_lock(void) +{ + mutex_lock(&wq_pool_mutex); +} + +static void apply_wqattrs_unlock(void) +{ + mutex_unlock(&wq_pool_mutex); +} + /** * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue * @cpu: the target CPU - * @cpu_going_down: if >= 0, the CPU to consider as offline * - * Calculate the cpumask a workqueue with @attrs should use on @pod. If - * @cpu_going_down is >= 0, that cpu is considered offline during calculation. + * Calculate the cpumask a workqueue with @attrs should use on @pod. * The result is stored in @attrs->__pod_cpumask. * * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled @@ -5177,29 +5155,18 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, * * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ -static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, - int cpu_going_down) +static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int pod = pt->cpu_pod[cpu]; - /* does @pod have any online CPUs @attrs wants? */ + /* calculate possible CPUs in @pod that @attrs wants */ cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); - cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask); - if (cpu_going_down >= 0) - cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask); - - if (cpumask_empty(attrs->__pod_cpumask)) { + /* does @pod have any online CPUs @attrs wants? */ + if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) { cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); return; } - - /* yeap, return possible CPUs in @pod that @attrs wants */ - cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]); - - if (cpumask_empty(attrs->__pod_cpumask)) - pr_warn_once("WARNING: workqueue cpumask: online intersect > " - "possible intersect\n"); } /* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */ @@ -5284,7 +5251,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { - wq_calc_pod_cpumask(new_attrs, cpu, -1); + wq_calc_pod_cpumask(new_attrs, cpu); ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); if (!ctx->pwq_tbl[cpu]) goto out_free; @@ -5394,15 +5361,12 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, } /** - * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug + * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug * @wq: the target workqueue - * @cpu: the CPU to update pool association for - * @hotplug_cpu: the CPU coming up or going down - * @online: whether @cpu is coming up or going down + * @cpu: the CPU to update the pwq slot for * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and - * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of - * @wq accordingly. + * %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged. 
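The simplified wq_calc_pod_cpumask() in the hunk above intersects the pod's CPUs with the workqueue's requested cpumask and falls back to the full requested mask when no CPU of the pod is online (online CPUs now being tracked in wq_online_cpumask). That decision is easy to model with plain bitmasks in userspace; this is only a hedged sketch, with the cpumask_* helpers reduced to bit operations:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t calc_pod_mask(uint64_t pod_cpus, uint64_t requested,
                                  uint64_t online)
    {
            uint64_t pod_mask = pod_cpus & requested;       /* cpumask_and() */

            if (!(pod_mask & online))                       /* !cpumask_intersects() */
                    return requested;                       /* fall back to all requested CPUs */
            return pod_mask;
    }

    int main(void)
    {
            uint64_t pod = 0x0f, requested = 0xff, online = 0xf0;

            /* CPUs 0-3 are requested but the whole pod is offline: fall back to 0-7. */
            printf("%#llx\n", (unsigned long long)calc_pod_mask(pod, requested, online));

            online = 0xf1;  /* CPU0 comes back online: use the pod-local mask. */
            printf("%#llx\n", (unsigned long long)calc_pod_mask(pod, requested, online));
            return 0;
    }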
* * * If pod affinity can't be adjusted due to memory allocation failure, it falls @@ -5415,10 +5379,8 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's * responsibility to flush the work item from CPU_DOWN_PREPARE. */ -static void wq_update_pod(struct workqueue_struct *wq, int cpu, - int hotplug_cpu, bool online) +static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu) { - int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; @@ -5432,13 +5394,13 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ - target_attrs = wq_update_pod_attrs_buf; + target_attrs = unbound_wq_update_pwq_attrs_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); /* nothing to do if the target cpumask matches the current pwq */ - wq_calc_pod_cpumask(target_attrs, cpu, off_cpu); + wq_calc_pod_cpumask(target_attrs, cpu); if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs)) return; @@ -5472,21 +5434,25 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; + lockdep_assert_cpus_held(); + lockdep_assert_held(&wq_pool_mutex); + wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); if (!wq->cpu_pwq) goto enomem; if (!(wq->flags & WQ_UNBOUND)) { + struct worker_pool __percpu *pools; + + if (wq->flags & WQ_BH) + pools = bh_worker_pools; + else + pools = cpu_worker_pools; + for_each_possible_cpu(cpu) { struct pool_workqueue **pwq_p; - struct worker_pool __percpu *pools; struct worker_pool *pool; - if (wq->flags & WQ_BH) - pools = bh_worker_pools; - else - pools = cpu_worker_pools; - pool = &(per_cpu_ptr(pools, cpu)[highpri]); pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); @@ -5504,26 +5470,18 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return 0; } - cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { struct pool_workqueue *dfl_pwq; - ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); + ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ dfl_pwq = rcu_access_pointer(wq->dfl_pwq); WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node || wq->pwqs.prev != &dfl_pwq->pwqs_node), "ordering guarantee broken for workqueue %s\n", wq->name); } else { - ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); + ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); } - cpus_read_unlock(); - - /* for unbound pwq, flush the pwq_release_worker ensures that the - * pwq_release_workfn() completes before calling kfree(wq). 
- */ - if (ret) - kthread_flush_worker(pwq_release_worker); return ret; @@ -5561,6 +5519,8 @@ static int init_rescuer(struct workqueue_struct *wq) char id_buf[WORKER_ID_LEN]; int ret; + lockdep_assert_held(&wq_pool_mutex); + if (!(wq->flags & WQ_MEM_RECLAIM)) return 0; @@ -5585,7 +5545,7 @@ static int init_rescuer(struct workqueue_struct *wq) wq->rescuer = rescuer; if (wq->flags & WQ_UNBOUND) - kthread_bind_mask(rescuer->task, wq_unbound_cpumask); + kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq)); else kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); @@ -5733,21 +5693,15 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, goto err_unreg_lockdep; } - if (alloc_and_link_pwqs(wq) < 0) - goto err_free_node_nr_active; - - if (wq_online && init_rescuer(wq) < 0) - goto err_destroy; - - if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) - goto err_destroy; - /* - * wq_pool_mutex protects global freeze state and workqueues list. - * Grab it, adjust max_active and add the new @wq to workqueues - * list. + * wq_pool_mutex protects the workqueues list, allocations of PWQs, + * and the global freeze state. alloc_and_link_pwqs() also requires + * cpus_read_lock() for PWQs' affinities. */ - mutex_lock(&wq_pool_mutex); + apply_wqattrs_lock(); + + if (alloc_and_link_pwqs(wq) < 0) + goto err_unlock_free_node_nr_active; mutex_lock(&wq->mutex); wq_adjust_max_active(wq); @@ -5755,13 +5709,27 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, list_add_tail_rcu(&wq->list, &workqueues); - mutex_unlock(&wq_pool_mutex); + if (wq_online && init_rescuer(wq) < 0) + goto err_unlock_destroy; + + apply_wqattrs_unlock(); + + if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) + goto err_destroy; return wq; -err_free_node_nr_active: - if (wq->flags & WQ_UNBOUND) +err_unlock_free_node_nr_active: + apply_wqattrs_unlock(); + /* + * Failed alloc_and_link_pwqs() may leave pending pwq->release_work, + * flushing the pwq_release_worker ensures that the pwq_release_workfn() + * completes before calling kfree(wq). 
+ */ + if (wq->flags & WQ_UNBOUND) { + kthread_flush_worker(pwq_release_worker); free_node_nr_active(wq->node_nr_active); + } err_unreg_lockdep: wq_unregister_lockdep(wq); wq_free_lockdep(wq); @@ -5769,6 +5737,8 @@ err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; +err_unlock_destroy: + apply_wqattrs_unlock(); err_destroy: destroy_workqueue(wq); return NULL; @@ -6607,6 +6577,8 @@ int workqueue_online_cpu(unsigned int cpu) mutex_lock(&wq_pool_mutex); + cpumask_set_cpu(cpu, wq_online_cpumask); + for_each_pool(pool, pi) { /* BH pools aren't affected by hotplug */ if (pool->flags & POOL_BH) @@ -6629,7 +6601,7 @@ int workqueue_online_cpu(unsigned int cpu) int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) - wq_update_pod(wq, tcpu, cpu, true); + unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); @@ -6653,6 +6625,9 @@ int workqueue_offline_cpu(unsigned int cpu) /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); + + cpumask_clear_cpu(cpu, wq_online_cpumask); + list_for_each_entry(wq, &workqueues, list) { struct workqueue_attrs *attrs = wq->unbound_attrs; @@ -6661,7 +6636,7 @@ int workqueue_offline_cpu(unsigned int cpu) int tcpu; for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) - wq_update_pod(wq, tcpu, cpu, false); + unbound_wq_update_pwq(wq, tcpu); mutex_lock(&wq->mutex); wq_update_node_max_active(wq, cpu); @@ -6901,9 +6876,6 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) lockdep_assert_cpus_held(); mutex_lock(&wq_pool_mutex); - /* Save the current isolated cpumask & export it via sysfs */ - cpumask_copy(wq_isolated_cpumask, exclude_cpumask); - /* * If the operation fails, it will fall back to * wq_requested_unbound_cpumask which is initially set to @@ -6915,6 +6887,10 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); + /* Save the current isolated cpumask & export it via sysfs */ + if (!ret) + cpumask_copy(wq_isolated_cpumask, exclude_cpumask); + mutex_unlock(&wq_pool_mutex); free_cpumask_var(cpumask); return ret; @@ -6948,9 +6924,8 @@ static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) wq_affn_dfl = affn; list_for_each_entry(wq, &workqueues, list) { - for_each_online_cpu(cpu) { - wq_update_pod(wq, cpu, cpu, true); - } + for_each_online_cpu(cpu) + unbound_wq_update_pwq(wq, cpu); } mutex_unlock(&wq_pool_mutex); @@ -7038,19 +7013,6 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); -static void apply_wqattrs_lock(void) -{ - /* CPUs should stay stable across pwq creations and installations */ - cpus_read_lock(); - mutex_lock(&wq_pool_mutex); -} - -static void apply_wqattrs_unlock(void) -{ - mutex_unlock(&wq_pool_mutex); - cpus_read_unlock(); -} - static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -7249,16 +7211,12 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { + ret = 0; apply_wqattrs_lock(); - cpumask_copy(wq_requested_unbound_cpumask, cpumask); - if (cpumask_equal(cpumask, wq_unbound_cpumask)) { - ret = 0; - goto out_unlock; - } - - ret = workqueue_apply_unbound_cpumask(cpumask); - -out_unlock: + if (!cpumask_equal(cpumask, wq_unbound_cpumask)) + ret = workqueue_apply_unbound_cpumask(cpumask); + if (!ret) + cpumask_copy(wq_requested_unbound_cpumask, 
cpumask); apply_wqattrs_unlock(); } @@ -7577,10 +7535,18 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) notrace void wq_watchdog_touch(int cpu) { + unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + unsigned long touch_ts = READ_ONCE(wq_watchdog_touched); + unsigned long now = jiffies; + if (cpu >= 0) - per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + per_cpu(wq_watchdog_touched_cpu, cpu) = now; + else + WARN_ONCE(1, "%s should be called with valid CPU", __func__); - wq_watchdog_touched = jiffies; + /* Don't unnecessarily store to global cacheline */ + if (time_after(now, touch_ts + thresh / 4)) + WRITE_ONCE(wq_watchdog_touched, jiffies); } static void wq_watchdog_set_thresh(unsigned long thresh) @@ -7690,10 +7656,12 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); + cpumask_copy(wq_online_cpumask, cpu_online_mask); cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); @@ -7704,8 +7672,8 @@ void __init workqueue_init_early(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); - wq_update_pod_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_pod_attrs_buf); + unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!unbound_wq_update_pwq_attrs_buf); /* * If nohz_full is enabled, set power efficient workqueue as unbound. @@ -7970,12 +7938,12 @@ void __init workqueue_init_topology(void) /* * Workqueues allocated earlier would have all CPUs sharing the default - * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU - * combinations to apply per-pod sharing. + * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue + * and CPU combinations to apply per-pod sharing. */ list_for_each_entry(wq, &workqueues, list) { for_each_online_cpu(cpu) - wq_update_pod(wq, cpu, cpu, true); + unbound_wq_update_pwq(wq, cpu); if (wq->flags & WQ_UNBOUND) { mutex_lock(&wq->mutex); wq_update_node_max_active(wq, -1); diff --git a/lib/Makefile b/lib/Makefile index 3b1769045651..30337431d10e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -426,3 +426,7 @@ $(obj)/$(TEST_FORTIFY_LOG): $(addprefix $(obj)/, $(TEST_FORTIFY_LOGS)) FORCE ifeq ($(CONFIG_FORTIFY_SOURCE),y) $(obj)/string.o: $(obj)/$(TEST_FORTIFY_LOG) endif + +# Some architectures define __NO_FORTIFY if __SANITIZE_ADDRESS__ is undefined. +# Pass CFLAGS_KASAN to avoid warnings. +$(foreach x, $(patsubst %.log,%.o,$(TEST_FORTIFY_LOGS)), $(eval KASAN_SANITIZE_$(x) := y)) diff --git a/lib/build_OID_registry b/lib/build_OID_registry index 56d8bafeb848..8267e8d71338 100755 --- a/lib/build_OID_registry +++ b/lib/build_OID_registry @@ -38,7 +38,9 @@ close IN_FILE || die; # open C_FILE, ">$ARGV[1]" or die; print C_FILE "/*\n"; -print C_FILE " * Automatically generated by ", $0 =~ s#^\Q$abs_srctree/\E##r, ". Do not edit\n"; +my $scriptname = $0; +$scriptname =~ s#^\Q$abs_srctree/\E##; +print C_FILE " * Automatically generated by ", $scriptname, ". 
Do not edit\n"; print C_FILE " */\n"; # diff --git a/lib/closure.c b/lib/closure.c index c971216d9d77..116afae2eed9 100644 --- a/lib/closure.c +++ b/lib/closure.c @@ -244,6 +244,9 @@ void closure_debug_destroy(struct closure *cl) { unsigned long flags; + if (cl->magic == CLOSURE_MAGIC_STACK) + return; + BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); cl->magic = CLOSURE_MAGIC_DEAD; diff --git a/lib/debugobjects.c b/lib/debugobjects.c index fb12a9bacd2f..7cea91e193a8 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -78,16 +78,17 @@ static bool obj_freeing; /* The number of objs on the global free list */ static int obj_nr_tofree; -static int debug_objects_maxchain __read_mostly; -static int __maybe_unused debug_objects_maxchecked __read_mostly; -static int debug_objects_fixups __read_mostly; -static int debug_objects_warnings __read_mostly; -static int debug_objects_enabled __read_mostly - = CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT; -static int debug_objects_pool_size __read_mostly - = ODEBUG_POOL_SIZE; -static int debug_objects_pool_min_level __read_mostly - = ODEBUG_POOL_MIN_LEVEL; +static int __data_racy debug_objects_maxchain __read_mostly; +static int __data_racy __maybe_unused debug_objects_maxchecked __read_mostly; +static int __data_racy debug_objects_fixups __read_mostly; +static int __data_racy debug_objects_warnings __read_mostly; +static int __data_racy debug_objects_enabled __read_mostly + = CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT; +static int __data_racy debug_objects_pool_size __read_mostly + = ODEBUG_POOL_SIZE; +static int __data_racy debug_objects_pool_min_level __read_mostly + = ODEBUG_POOL_MIN_LEVEL; + static const struct debug_obj_descr *descr_test __read_mostly; static struct kmem_cache *obj_cache __ro_after_init; diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 899850bd6f0b..c01eaafd8041 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -140,14 +140,14 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, do { /* - * Open coded to handle VDSO_CLOCKMODE_TIMENS. Time namespace - * enabled tasks have a special VVAR page installed which - * has vd->seq set to 1 and vd->clock_mode set to - * VDSO_CLOCKMODE_TIMENS. For non time namespace affected tasks - * this does not affect performance because if vd->seq is - * odd, i.e. a concurrent update is in progress the extra - * check for vd->clock_mode is just a few extra - * instructions while spin waiting for vd->seq to become + * Open coded function vdso_read_begin() to handle + * VDSO_CLOCKMODE_TIMENS. Time namespace enabled tasks have a + * special VVAR page installed which has vd->seq set to 1 and + * vd->clock_mode set to VDSO_CLOCKMODE_TIMENS. For non time + * namespace affected tasks this does not affect performance + * because if vd->seq is odd, i.e. a concurrent update is in + * progress the extra check for vd->clock_mode is just a few + * extra instructions while spin waiting for vd->seq to become * even again. */ while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) { @@ -223,8 +223,8 @@ static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, do { /* - * Open coded to handle VDSO_CLOCK_TIMENS. See comment in - * do_hres(). + * Open coded function vdso_read_begin() to handle + * VDSO_CLOCK_TIMENS. See comment in do_hres(). 
*/ while ((seq = READ_ONCE(vd->seq)) & 1) { if (IS_ENABLED(CONFIG_TIME_NS) && diff --git a/mm/damon/core.c b/mm/damon/core.c index 6392f1cc97a3..e66823d6b10b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1358,14 +1358,31 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, * access frequencies are similar. This is for minimizing the monitoring * overhead under the dynamically changeable access pattern. If a merge was * unnecessarily made, later 'kdamond_split_regions()' will revert it. + * + * The total number of regions could be higher than the user-defined limit, + * max_nr_regions for some cases. For example, the user can update + * max_nr_regions to a number that lower than the current number of regions + * while DAMON is running. For such a case, repeat merging until the limit is + * met while increasing @threshold up to possible maximum level. */ static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, unsigned long sz_limit) { struct damon_target *t; - - damon_for_each_target(t, c) - damon_merge_regions_of(t, threshold, sz_limit); + unsigned int nr_regions; + unsigned int max_thres; + + max_thres = c->attrs.aggr_interval / + (c->attrs.sample_interval ? c->attrs.sample_interval : 1); + do { + nr_regions = 0; + damon_for_each_target(t, c) { + damon_merge_regions_of(t, threshold, sz_limit); + nr_regions += damon_nr_regions(t); + } + threshold = max(1, threshold * 2); + } while (nr_regions > c->attrs.max_nr_regions && + threshold / 2 < max_thres); } /* diff --git a/mm/filemap.c b/mm/filemap.c index 876cc64aadd7..657bcd887fdb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1847,7 +1847,7 @@ repeat: if (!folio || xa_is_value(folio)) goto out; - if (!folio_try_get_rcu(folio)) + if (!folio_try_get(folio)) goto repeat; if (unlikely(folio != xas_reload(&xas))) { @@ -2001,7 +2001,7 @@ retry: if (!folio || xa_is_value(folio)) return folio; - if (!folio_try_get_rcu(folio)) + if (!folio_try_get(folio)) goto reset; if (unlikely(folio != xas_reload(xas))) { @@ -2181,7 +2181,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, if (xa_is_value(folio)) goto update_start; - if (!folio_try_get_rcu(folio)) + if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) @@ -2313,7 +2313,7 @@ static void filemap_get_read_batch(struct address_space *mapping, break; if (xa_is_sibling(folio)) break; - if (!folio_try_get_rcu(folio)) + if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) @@ -3124,7 +3124,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ - if (vm_flags & VM_HUGEPAGE) { + if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; @@ -3231,7 +3231,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; - ptep = pte_offset_map(vmf->pmd, vmf->address); + ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; @@ -3472,7 +3473,7 @@ static struct folio *next_uptodate_folio(struct xa_state *xas, continue; if (folio_test_locked(folio)) continue; - if (!folio_try_get_rcu(folio)) + if (!folio_try_get(folio)) continue; /* Has the page moved or been split? 
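The kdamond_merge_regions() change in the mm/damon/core.c hunk above repeats the merge pass with a doubled threshold until the region count fits under max_nr_regions or the threshold reaches the aggregation/sampling ratio ceiling. A hedged userspace model of just that control loop; fake_merge() merely halves the count and stands in for damon_merge_regions_of():

    #include <stdio.h>

    static unsigned int fake_merge(unsigned int nr_regions, unsigned int threshold)
    {
            (void)threshold;        /* a real merge pass would use it */
            return nr_regions > 1 ? nr_regions / 2 : 1;
    }

    int main(void)
    {
            unsigned int nr_regions = 1000, max_nr_regions = 100;
            unsigned int threshold = 10, max_thres = 100;   /* ~ aggr_interval / sample_interval */

            do {
                    nr_regions = fake_merge(nr_regions, threshold);
                    printf("threshold=%u -> %u regions\n", threshold, nr_regions);
                    threshold *= 2;         /* the kernel uses max(1, threshold * 2) */
            } while (nr_regions > max_nr_regions && threshold / 2 < max_thres);

            return 0;
    }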
*/ if (unlikely(folio != xas_reload(xas))) @@ -4248,6 +4249,9 @@ static void filemap_cachestat(struct address_space *mapping, XA_STATE(xas, &mapping->i_pages, first_index); struct folio *folio; + /* Flush stats (and potentially sleep) outside the RCU read section. */ + mem_cgroup_flush_stats_ratelimited(NULL); + rcu_read_lock(); xas_for_each(&xas, folio, last_index) { int order; @@ -4311,7 +4315,7 @@ static void filemap_cachestat(struct address_space *mapping, goto resched; } #endif - if (workingset_test_recent(shadow, true, &workingset)) + if (workingset_test_recent(shadow, true, &workingset, false)) cs->nr_recently_evicted += nr_pages; goto resched; @@ -76,7 +76,7 @@ retry: folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) return NULL; - if (unlikely(!folio_ref_try_add_rcu(folio, refs))) + if (unlikely(!folio_ref_try_add(folio, refs))) return NULL; /* @@ -97,95 +97,6 @@ retry: return folio; } -/** - * try_grab_folio() - Attempt to get or pin a folio. - * @page: pointer to page to be grabbed - * @refs: the value to (effectively) add to the folio's refcount - * @flags: gup flags: these are the FOLL_* flag values. - * - * "grab" names in this file mean, "look at flags to decide whether to use - * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. - * - * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the - * same time. (That's true throughout the get_user_pages*() and - * pin_user_pages*() APIs.) Cases: - * - * FOLL_GET: folio's refcount will be incremented by @refs. - * - * FOLL_PIN on large folios: folio's refcount will be incremented by - * @refs, and its pincount will be incremented by @refs. - * - * FOLL_PIN on single-page folios: folio's refcount will be incremented by - * @refs * GUP_PIN_COUNTING_BIAS. - * - * Return: The folio containing @page (with refcount appropriately - * incremented) for success, or NULL upon failure. If neither FOLL_GET - * nor FOLL_PIN was set, that's considered failure, and furthermore, - * a likely bug in the caller, so a warning is also emitted. - */ -struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) -{ - struct folio *folio; - - if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) - return NULL; - - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) - return NULL; - - if (flags & FOLL_GET) - return try_get_folio(page, refs); - - /* FOLL_PIN is set */ - - /* - * Don't take a pin on the zero page - it's not going anywhere - * and it is used in a *lot* of places. - */ - if (is_zero_page(page)) - return page_folio(page); - - folio = try_get_folio(page, refs); - if (!folio) - return NULL; - - /* - * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a - * right zone, so fail and let the caller fall back to the slow - * path. - */ - if (unlikely((flags & FOLL_LONGTERM) && - !folio_is_longterm_pinnable(folio))) { - if (!put_devmap_managed_folio_refs(folio, refs)) - folio_put_refs(folio, refs); - return NULL; - } - - /* - * When pinning a large folio, use an exact count to track it. - * - * However, be sure to *also* increment the normal folio - * refcount field at least once, so that the folio really - * is pinned. That's why the refcount from the earlier - * try_get_folio() is left intact. - */ - if (folio_test_large(folio)) - atomic_add(refs, &folio->_pincount); - else - folio_ref_add(folio, - refs * (GUP_PIN_COUNTING_BIAS - 1)); - /* - * Adjust the pincount before re-checking the PTE for changes. 
- * This is essentially a smp_mb() and is paired with a memory - * barrier in folio_try_share_anon_rmap_*(). - */ - smp_mb__after_atomic(); - - node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); - - return folio; -} - static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) { if (flags & FOLL_PIN) { @@ -203,58 +114,59 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) } /** - * try_grab_page() - elevate a page's refcount by a flag-dependent amount - * @page: pointer to page to be grabbed - * @flags: gup flags: these are the FOLL_* flag values. + * try_grab_folio() - add a folio's refcount by a flag-dependent amount + * @folio: pointer to folio to be grabbed + * @refs: the value to (effectively) add to the folio's refcount + * @flags: gup flags: these are the FOLL_* flag values * * This might not do anything at all, depending on the flags argument. * * "grab" names in this file mean, "look at flags to decide whether to use - * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same - * time. Cases: please see the try_grab_folio() documentation, with - * "refs=1". + * time. * * Return: 0 for success, or if no action was required (if neither FOLL_PIN * nor FOLL_GET was set, nothing is done). A negative error code for failure: * - * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not + * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not * be grabbed. + * + * It is called when we have a stable reference for the folio, typically in + * GUP slow path. */ -int __must_check try_grab_page(struct page *page, unsigned int flags) +int __must_check try_grab_folio(struct folio *folio, int refs, + unsigned int flags) { - struct folio *folio = page_folio(page); - if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) return -EREMOTEIO; if (flags & FOLL_GET) - folio_ref_inc(folio); + folio_ref_add(folio, refs); else if (flags & FOLL_PIN) { /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ - if (is_zero_page(page)) + if (is_zero_folio(folio)) return 0; /* - * Similar to try_grab_folio(): be sure to *also* - * increment the normal page refcount field at least once, + * Increment the normal page refcount field at least once, * so that the page really is pinned. */ if (folio_test_large(folio)) { - folio_ref_add(folio, 1); - atomic_add(1, &folio->_pincount); + folio_ref_add(folio, refs); + atomic_add(refs, &folio->_pincount); } else { - folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); + folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS); } - node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); } return 0; @@ -515,6 +427,102 @@ static int record_subpages(struct page *page, unsigned long sz, return nr; } + +/** + * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. + * @page: pointer to page to be grabbed + * @refs: the value to (effectively) add to the folio's refcount + * @flags: gup flags: these are the FOLL_* flag values. + * + * "grab" names in this file mean, "look at flags to decide whether to use + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. 
+ * + * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the + * same time. (That's true throughout the get_user_pages*() and + * pin_user_pages*() APIs.) Cases: + * + * FOLL_GET: folio's refcount will be incremented by @refs. + * + * FOLL_PIN on large folios: folio's refcount will be incremented by + * @refs, and its pincount will be incremented by @refs. + * + * FOLL_PIN on single-page folios: folio's refcount will be incremented by + * @refs * GUP_PIN_COUNTING_BIAS. + * + * Return: The folio containing @page (with refcount appropriately + * incremented) for success, or NULL upon failure. If neither FOLL_GET + * nor FOLL_PIN was set, that's considered failure, and furthermore, + * a likely bug in the caller, so a warning is also emitted. + * + * It uses add ref unless zero to elevate the folio refcount and must be called + * in fast path only. + */ +static struct folio *try_grab_folio_fast(struct page *page, int refs, + unsigned int flags) +{ + struct folio *folio; + + /* Raise warn if it is not called in fast GUP */ + VM_WARN_ON_ONCE(!irqs_disabled()); + + if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) + return NULL; + + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + return NULL; + + if (flags & FOLL_GET) + return try_get_folio(page, refs); + + /* FOLL_PIN is set */ + + /* + * Don't take a pin on the zero page - it's not going anywhere + * and it is used in a *lot* of places. + */ + if (is_zero_page(page)) + return page_folio(page); + + folio = try_get_folio(page, refs); + if (!folio) + return NULL; + + /* + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a + * right zone, so fail and let the caller fall back to the slow + * path. + */ + if (unlikely((flags & FOLL_LONGTERM) && + !folio_is_longterm_pinnable(folio))) { + if (!put_devmap_managed_folio_refs(folio, refs)) + folio_put_refs(folio, refs); + return NULL; + } + + /* + * When pinning a large folio, use an exact count to track it. + * + * However, be sure to *also* increment the normal folio + * refcount field at least once, so that the folio really + * is pinned. That's why the refcount from the earlier + * try_get_folio() is left intact. + */ + if (folio_test_large(folio)) + atomic_add(refs, &folio->_pincount); + else + folio_ref_add(folio, + refs * (GUP_PIN_COUNTING_BIAS - 1)); + /* + * Adjust the pincount before re-checking the PTE for changes. + * This is essentially a smp_mb() and is paired with a memory + * barrier in folio_try_share_anon_rmap_*(). 
+ */ + smp_mb__after_atomic(); + + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); + + return folio; +} #endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */ #ifdef CONFIG_ARCH_HAS_HUGEPD @@ -535,7 +543,7 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, */ static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, unsigned int flags, - struct page **pages, int *nr) + struct page **pages, int *nr, bool fast) { unsigned long pte_end; struct page *page; @@ -558,9 +566,15 @@ static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz page = pte_page(pte); refs = record_subpages(page, sz, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); - if (!folio) - return 0; + if (fast) { + folio = try_grab_folio_fast(page, refs, flags); + if (!folio) + return 0; + } else { + folio = page_folio(page); + if (try_grab_folio(folio, refs, flags)) + return 0; + } if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, refs, flags); @@ -588,7 +602,7 @@ static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, - struct page **pages, int *nr) + struct page **pages, int *nr, bool fast) { pte_t *ptep; unsigned long sz = 1UL << hugepd_shift(hugepd); @@ -598,7 +612,8 @@ static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, ptep = hugepte_offset(hugepd, addr, pdshift); do { next = hugepte_addr_end(addr, end, sz); - ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr); + ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr, + fast); if (ret != 1) return ret; } while (ptep++, addr = next, addr != end); @@ -625,7 +640,7 @@ static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, ptep = hugepte_offset(hugepd, addr, pdshift); ptl = huge_pte_lock(h, vma->vm_mm, ptep); ret = gup_hugepd(vma, hugepd, addr, pdshift, addr + PAGE_SIZE, - flags, &page, &nr); + flags, &page, &nr, false); spin_unlock(ptl); if (ret == 1) { @@ -642,7 +657,7 @@ static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, static inline int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, - struct page **pages, int *nr) + struct page **pages, int *nr, bool fast) { return 0; } @@ -729,7 +744,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); - ret = try_grab_page(page, flags); + ret = try_grab_folio(page_folio(page), 1, flags); if (ret) page = ERR_PTR(ret); else @@ -806,7 +821,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma, VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); - ret = try_grab_page(page, flags); + ret = try_grab_folio(page_folio(page), 1, flags); if (ret) return ERR_PTR(ret); @@ -968,8 +983,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); - /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ - ret = try_grab_page(page, flags); + /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. 
*/ + ret = try_grab_folio(page_folio(page), 1, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; @@ -1233,7 +1248,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, goto unmap; *page = pte_page(entry); } - ret = try_grab_page(*page, gup_flags); + ret = try_grab_folio(page_folio(*page), 1, gup_flags); if (unlikely(ret)) goto unmap; out: @@ -1636,20 +1651,19 @@ next_page: * pages. */ if (page_increm > 1) { - struct folio *folio; + struct folio *folio = page_folio(page); /* * Since we already hold refcount on the * large folio, this should never fail. */ - folio = try_grab_folio(page, page_increm - 1, - foll_flags); - if (WARN_ON_ONCE(!folio)) { + if (try_grab_folio(folio, page_increm - 1, + foll_flags)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ - gup_put_folio(page_folio(page), 1, + gup_put_folio(folio, 1, foll_flags); ret = -EFAULT; goto out; @@ -2797,7 +2811,6 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * This code is based heavily on the PowerPC implementation by Nick Piggin. */ #ifdef CONFIG_HAVE_GUP_FAST - /* * Used in the GUP-fast path to determine whether GUP is permitted to work on * a specific folio. @@ -2962,7 +2975,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - folio = try_grab_folio(page, 1, flags); + folio = try_grab_folio_fast(page, 1, flags); if (!folio) goto pte_unmap; @@ -3049,7 +3062,7 @@ static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr, break; } - folio = try_grab_folio(page, 1, flags); + folio = try_grab_folio_fast(page, 1, flags); if (!folio) { gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); break; @@ -3138,7 +3151,7 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, page = pmd_page(orig); refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); + folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; @@ -3182,7 +3195,7 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, page = pud_page(orig); refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); + folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; @@ -3222,7 +3235,7 @@ static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr, page = pgd_page(orig); refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr); - folio = try_grab_folio(page, refs, flags); + folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; @@ -3276,7 +3289,8 @@ static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, * pmd format and THP pmd format */ if (gup_hugepd(NULL, __hugepd(pmd_val(pmd)), addr, - PMD_SHIFT, next, flags, pages, nr) != 1) + PMD_SHIFT, next, flags, pages, nr, + true) != 1) return 0; } else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) @@ -3306,7 +3320,8 @@ static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { if (gup_hugepd(NULL, __hugepd(pud_val(pud)), addr, - PUD_SHIFT, next, flags, pages, nr) != 1) + PUD_SHIFT, next, flags, pages, nr, + true) != 1) return 0; } else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags, pages, nr)) @@ -3333,7 +3348,8 @@ static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, BUILD_BUG_ON(p4d_leaf(p4d)); if 
(unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { if (gup_hugepd(NULL, __hugepd(p4d_val(p4d)), addr, - P4D_SHIFT, next, flags, pages, nr) != 1) + P4D_SHIFT, next, flags, pages, nr, + true) != 1) return 0; } else if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) @@ -3362,7 +3378,8 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end, return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { if (gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr, - PGDIR_SHIFT, next, flags, pages, nr) != 1) + PGDIR_SHIFT, next, flags, pages, nr, + true) != 1) return; } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index db7946a0a28c..2120f7478e55 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1331,7 +1331,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - ret = try_grab_page(page, flags); + ret = try_grab_folio(page_folio(page), 1, flags); if (ret) page = ERR_PTR(ret); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f35abff8be60..43e1af868cfd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1625,13 +1625,10 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio, * folio appears as just a compound page. Otherwise, wait until after * allocating vmemmap to clear the flag. * - * A reference is held on the folio, except in the case of demote. - * * Must be called with hugetlb lock held. */ -static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, - bool adjust_surplus, - bool demote) +static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus) { int nid = folio_nid(folio); @@ -1645,6 +1642,7 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, list_del(&folio->lru); if (folio_test_hugetlb_freed(folio)) { + folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; } @@ -1661,33 +1659,13 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, if (!folio_test_hugetlb_vmemmap_optimized(folio)) __folio_clear_hugetlb(folio); - /* - * In the case of demote we do not ref count the page as it will soon - * be turned into a page of smaller size. - */ - if (!demote) - folio_ref_unfreeze(folio, 1); - h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } -static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, - bool adjust_surplus) -{ - __remove_hugetlb_folio(h, folio, adjust_surplus, false); -} - -static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, - bool adjust_surplus) -{ - __remove_hugetlb_folio(h, folio, adjust_surplus, true); -} - static void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { - int zeroed; int nid = folio_nid(folio); VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); @@ -1711,21 +1689,6 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, */ folio_set_hugetlb_vmemmap_optimized(folio); - /* - * This folio is about to be managed by the hugetlb allocator and - * should have no users. Drop our reference, and check for others - * just in case. - */ - zeroed = folio_put_testzero(folio); - if (unlikely(!zeroed)) - /* - * It is VERY unlikely soneone else has taken a ref - * on the folio. In this case, we simply return as - * free_huge_folio() will be called when this other ref - * is dropped. 
- */ - return; - arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); } @@ -1763,13 +1726,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, } /* - * Move PageHWPoison flag from head page to the raw error pages, - * which makes any healthy subpages reusable. - */ - if (unlikely(folio_test_hwpoison(folio))) - folio_clear_hugetlb_hwpoison(folio); - - /* * If vmemmap pages were allocated above, then we need to clear the * hugetlb flag under the hugetlb lock. */ @@ -1780,6 +1736,15 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, } /* + * Move PageHWPoison flag from head page to the raw error pages, + * which makes any healthy subpages reusable. + */ + if (unlikely(folio_test_hwpoison(folio))) + folio_clear_hugetlb_hwpoison(folio); + + folio_ref_unfreeze(folio, 1); + + /* * Non-gigantic pages demoted from CMA allocated gigantic pages * need to be given back to CMA in free_gigantic_folio. */ @@ -2197,6 +2162,9 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, nid = numa_mem_id(); retry: folio = __folio_alloc(gfp_mask, order, nid, nmask); + /* Ensure hugetlb folio won't have large_rmappable flag set. */ + if (folio) + folio_clear_large_rmappable(folio); if (folio && !folio_ref_freeze(folio, 1)) { folio_put(folio); @@ -3079,11 +3047,8 @@ retry: free_new: spin_unlock_irq(&hugetlb_lock); - if (new_folio) { - /* Folio has a zero ref count, but needs a ref to be freed */ - folio_ref_unfreeze(new_folio, 1); + if (new_folio) update_and_free_hugetlb_folio(h, new_folio, false); - } return ret; } @@ -3938,7 +3903,7 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); - remove_hugetlb_folio_for_demote(h, folio, false); + remove_hugetlb_folio(h, folio, false); spin_unlock_irq(&hugetlb_lock); /* @@ -3952,7 +3917,6 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) if (rc) { /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); - folio_ref_unfreeze(folio, 1); add_hugetlb_folio(h, folio, false); return rc; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index b9a55322e52c..8193906515c6 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -446,6 +446,8 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); + if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; @@ -481,6 +483,9 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { + /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ + synchronize_rcu(); + return __hugetlb_vmemmap_restore_folio(h, folio, 0); } @@ -505,6 +510,9 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, long restored = 0; long ret = 0; + /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ + synchronize_rcu(); + list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, @@ -550,6 +558,8 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); + if (!vmemmap_should_optimize_folio(h, folio)) return 
ret; @@ -601,6 +611,9 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); + /* avoid writes from page_ref_add_unless() while folding vmemmap */ + synchronize_rcu(); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } @@ -644,6 +657,9 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); + /* avoid writes from page_ref_add_unless() while folding vmemmap */ + synchronize_rcu(); + list_for_each_entry(folio, folio_list, lru) { int ret; diff --git a/mm/internal.h b/mm/internal.h index 6902b7dd8509..cc2c5e07fad3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1182,8 +1182,8 @@ int migrate_device_coherent_page(struct page *page); /* * mm/gup.c */ -struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); -int __must_check try_grab_page(struct page *page, unsigned int flags); +int __must_check try_grab_folio(struct folio *folio, int refs, + unsigned int flags); /* * mm/huge_memory.c diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 774a97e6e2da..aab471791bd9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2000,9 +2000,9 @@ out_unlock: if (!is_shmem) { filemap_nr_thps_inc(mapping); /* - * Paired with smp_mb() in do_dentry_open() to ensure - * i_writecount is up to date and the update to nr_thps is - * visible. Ensures the page cache will be truncated if the + * Paired with the fence in do_dentry_open() -> get_write_access() + * to ensure i_writecount is up to date and the update to nr_thps + * is visible. Ensures the page cache will be truncated if the * file is opened writable. */ smp_mb(); @@ -2190,8 +2190,8 @@ rollback: if (!is_shmem && result == SCAN_COPY_MC) { filemap_nr_thps_dec(mapping); /* - * Paired with smp_mb() in do_dentry_open() to - * ensure the update to nr_thps is visible. + * Paired with the fence in do_dentry_open() -> get_write_access() + * to ensure the update to nr_thps is visible. */ smp_mb(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 71fe2a95b8bd..8f2f1bb18c9c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7823,17 +7823,6 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) /* Transfer the charge and the css ref */ commit_charge(new, memcg); - /* - * If the old folio is a large folio and is in the split queue, it needs - * to be removed from the split queue now, in case getting an incorrect - * split queue in destroy_large_folio() after the memcg of the old folio - * is cleared. - * - * In addition, the old folio is about to be freed after migration, so - * removing from the split queue a bit earlier seems reasonable. 
- */ - if (folio_test_large(old) && folio_test_large_rmappable(old)) - folio_undo_large_rmappable(old); old->memcg_data = 0; } diff --git a/mm/migrate.c b/mm/migrate.c index 20cb9f5f7446..a8c6f466e33a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -415,6 +415,15 @@ int folio_migrate_mapping(struct address_space *mapping, if (folio_ref_count(folio) != expected_count) return -EAGAIN; + /* Take off deferred split queue while frozen and memcg set */ + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) { + if (!folio_ref_freeze(folio, expected_count)) + return -EAGAIN; + folio_undo_large_rmappable(folio); + folio_ref_unfreeze(folio, expected_count); + } + /* No turning back from here */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; @@ -433,6 +442,10 @@ int folio_migrate_mapping(struct address_space *mapping, return -EAGAIN; } + /* Take off deferred split queue while frozen and memcg set */ + if (folio_test_large(folio) && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + /* * Now we know that no one else is looking at the folio: * no turning back from here. diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 12c9297ed4a7..8a1c92090129 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -415,13 +415,20 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; - if (bg_thresh >= thresh) - bg_thresh = thresh / 2; tsk = current; if (rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } + /* + * Dirty throttling logic assumes the limits in page units fit into + * 32-bits. This gives 16TB dirty limits max which is hopefully enough. + */ + if (thresh > UINT_MAX) + thresh = UINT_MAX; + /* This makes sure bg_thresh is within 32-bits as well */ + if (bg_thresh >= thresh) + bg_thresh = thresh / 2; dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; @@ -471,7 +478,11 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) if (rt_task(tsk)) dirty += dirty / 4; - return dirty; + /* + * Dirty throttling logic assumes the limits in page units fit into + * 32-bits. This gives 16TB dirty limits max which is hopefully enough. + */ + return min_t(unsigned long, dirty, UINT_MAX); } /** @@ -508,10 +519,17 @@ static int dirty_background_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; + unsigned long old_bytes = dirty_background_bytes; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); - if (ret == 0 && write) + if (ret == 0 && write) { + if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) > + UINT_MAX) { + dirty_background_bytes = old_bytes; + return -ERANGE; + } dirty_background_ratio = 0; + } return ret; } @@ -537,6 +555,10 @@ static int dirty_bytes_handler(struct ctl_table *table, int write, ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) { + vm_dirty_bytes = old_bytes; + return -ERANGE; + } writeback_set_ratelimit(); vm_dirty_ratio = 0; } @@ -1660,7 +1682,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) */ dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh); dtc->wb_bg_thresh = dtc->thresh ? 
- div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; + div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need diff --git a/mm/readahead.c b/mm/readahead.c index c1b23989d9ca..817b2a352d78 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -503,11 +503,11 @@ void page_cache_ra_order(struct readahead_control *ractl, limit = min(limit, index + ra->size - 1); - if (new_order < MAX_PAGECACHE_ORDER) { + if (new_order < MAX_PAGECACHE_ORDER) new_order += 2; - new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); - new_order = min_t(unsigned int, new_order, ilog2(ra->size)); - } + + new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); diff --git a/mm/shmem.c b/mm/shmem.c index a8b181a63402..831b52dfd56e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -541,8 +541,9 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) +static bool __shmem_is_huge(struct inode *inode, pgoff_t index, + bool shmem_huge_force, struct mm_struct *mm, + unsigned long vm_flags) { loff_t i_size; @@ -573,6 +574,16 @@ bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, } } +bool shmem_is_huge(struct inode *inode, pgoff_t index, + bool shmem_huge_force, struct mm_struct *mm, + unsigned long vm_flags) +{ + if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) + return false; + + return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags); +} + #if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { @@ -3166,10 +3177,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, struct folio *folio; /* - * Good, the fallocate(2) manpage permits EINTR: we may have - * been interrupted because we are using up too much memory. + * Check for fatal signal so that we abort early in OOM + * situations. We don't want to abort in case of non-fatal + * signals as large fallocate can take noticeable time and + * e.g. periodic timers may result in fallocate constantly + * restarting. 
*/ - if (signal_pending(current)) + if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; @@ -3903,14 +3917,14 @@ static const struct constant_table shmem_param_enums_huge[] = { }; const struct fs_parameter_spec shmem_fs_parameters[] = { - fsparam_u32 ("gid", Opt_gid), + fsparam_gid ("gid", Opt_gid), fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), fsparam_u32oct("mode", Opt_mode), fsparam_string("mpol", Opt_mpol), fsparam_string("nr_blocks", Opt_nr_blocks), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), - fsparam_u32 ("uid", Opt_uid), + fsparam_uid ("uid", Opt_uid), fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), @@ -3970,9 +3984,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->mode = result.uint_32 & 07777; break; case Opt_uid: - kuid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(kuid)) - goto bad_value; + kuid = result.uid; /* * The requested uid must be representable in the @@ -3984,9 +3996,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->uid = kuid; break; case Opt_gid: - kgid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(kgid)) - goto bad_value; + kgid = result.gid; /* * The requested gid must be representable in the diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d0cbdd7c1e5b..e34ea860153f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2543,7 +2543,15 @@ static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); static struct xarray * addr_to_vb_xa(unsigned long addr) { - int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus(); + int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids; + + /* + * Please note, nr_cpu_ids points on a highest set + * possible bit, i.e. we never invoke cpumask_next() + * if an index points on it which is nr_cpu_ids - 1. + */ + if (!cpu_possible(index)) + index = cpumask_next(index, cpu_possible_mask); return &per_cpu(vmap_block_queue, index).vmap_blocks; } diff --git a/mm/workingset.c b/mm/workingset.c index c22adb93622a..a2b28e356e68 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -412,10 +412,12 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) * @file: whether the corresponding folio is from the file lru. * @workingset: where the workingset value unpacked from shadow should * be stored. + * @flush: whether to flush cgroup rstat. * * Return: true if the shadow is for a recently evicted folio; false otherwise. */ -bool workingset_test_recent(void *shadow, bool file, bool *workingset) +bool workingset_test_recent(void *shadow, bool file, bool *workingset, + bool flush) { struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; @@ -467,10 +469,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) /* * Flush stats (and potentially sleep) outside the RCU read section. + * + * Note that workingset_test_recent() itself might be called in RCU read + * section (for e.g, in cachestat) - these callers need to skip flushing + * stats (via the flush argument). + * * XXX: With per-memcg flushing and thresholding, is ratelimiting * still needed here? 
*/ - mem_cgroup_flush_stats_ratelimited(eviction_memcg); + if (flush) + mem_cgroup_flush_stats_ratelimited(eviction_memcg); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -558,7 +566,7 @@ void workingset_refault(struct folio *folio, void *shadow) mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); - if (!workingset_test_recent(shadow, file, &workingset)) + if (!workingset_test_recent(shadow, file, &workingset, true)) return; folio_set_active(folio); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 0c76dcde5361..080053a85b4d 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -899,8 +899,8 @@ static int hci_conn_hash_alloc_unset(struct hci_dev *hdev) U16_MAX, GFP_ATOMIC); } -struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, - u8 role, u16 handle) +static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, + u8 role, u16 handle) { struct hci_conn *conn; @@ -1041,7 +1041,16 @@ struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, if (unlikely(handle < 0)) return ERR_PTR(-ECONNREFUSED); - return hci_conn_add(hdev, type, dst, role, handle); + return __hci_conn_add(hdev, type, dst, role, handle); +} + +struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, + u8 role, u16 handle) +{ + if (handle > HCI_CONN_HANDLE_MAX) + return ERR_PTR(-EINVAL); + + return __hci_conn_add(hdev, type, dst, role, handle); } static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index dd3b0f501018..c644b30977bd 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -63,50 +63,6 @@ DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); -static int hci_scan_req(struct hci_request *req, unsigned long opt) -{ - __u8 scan = opt; - - BT_DBG("%s %x", req->hdev->name, scan); - - /* Inquiry and Page scans */ - hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); - return 0; -} - -static int hci_auth_req(struct hci_request *req, unsigned long opt) -{ - __u8 auth = opt; - - BT_DBG("%s %x", req->hdev->name, auth); - - /* Authentication */ - hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, 1, &auth); - return 0; -} - -static int hci_encrypt_req(struct hci_request *req, unsigned long opt) -{ - __u8 encrypt = opt; - - BT_DBG("%s %x", req->hdev->name, encrypt); - - /* Encryption */ - hci_req_add(req, HCI_OP_WRITE_ENCRYPT_MODE, 1, &encrypt); - return 0; -} - -static int hci_linkpol_req(struct hci_request *req, unsigned long opt) -{ - __le16 policy = cpu_to_le16(opt); - - BT_DBG("%s %x", req->hdev->name, policy); - - /* Default link policy */ - hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy); - return 0; -} - /* Get HCI device by index. * Device is held on return. 
*/ struct hci_dev *hci_dev_get(int index) @@ -735,6 +691,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) { struct hci_dev *hdev; struct hci_dev_req dr; + __le16 policy; int err = 0; if (copy_from_user(&dr, arg, sizeof(dr))) @@ -761,8 +718,8 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) switch (cmd) { case HCISETAUTH: - err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt, - HCI_INIT_TIMEOUT, NULL); + err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, + 1, &dr.dev_opt, HCI_CMD_TIMEOUT); break; case HCISETENCRYPT: @@ -773,19 +730,23 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) if (!test_bit(HCI_AUTH, &hdev->flags)) { /* Auth must be enabled first */ - err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt, - HCI_INIT_TIMEOUT, NULL); + err = __hci_cmd_sync_status(hdev, + HCI_OP_WRITE_AUTH_ENABLE, + 1, &dr.dev_opt, + HCI_CMD_TIMEOUT); if (err) break; } - err = hci_req_sync(hdev, hci_encrypt_req, dr.dev_opt, - HCI_INIT_TIMEOUT, NULL); + err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_ENCRYPT_MODE, + 1, &dr.dev_opt, + HCI_CMD_TIMEOUT); break; case HCISETSCAN: - err = hci_req_sync(hdev, hci_scan_req, dr.dev_opt, - HCI_INIT_TIMEOUT, NULL); + err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE, + 1, &dr.dev_opt, + HCI_CMD_TIMEOUT); /* Ensure that the connectable and discoverable states * get correctly modified as this was a non-mgmt change. @@ -795,8 +756,11 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) break; case HCISETLINKPOL: - err = hci_req_sync(hdev, hci_linkpol_req, dr.dev_opt, - HCI_INIT_TIMEOUT, NULL); + policy = cpu_to_le16(dr.dev_opt); + + err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, + 2, &policy, + HCI_CMD_TIMEOUT); break; case HCISETLINKMODE: @@ -2751,7 +2715,11 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); + cancel_work_sync(&hdev->rx_work); + cancel_work_sync(&hdev->cmd_work); + cancel_work_sync(&hdev->tx_work); cancel_work_sync(&hdev->power_on); + cancel_work_sync(&hdev->error_reset); hci_cmd_sync_clear(hdev); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index a487f9df8145..93f7ac905cec 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6311,6 +6311,13 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK; legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type); + + if (test_bit(HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, + &hdev->quirks)) { + info->primary_phy &= 0x1f; + info->secondary_phy &= 0x1f; + } + if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &info->bdaddr, info->bdaddr_type, NULL, 0, @@ -6660,6 +6667,7 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, struct bt_iso_qos *qos; bool pending = false; u16 handle = __le16_to_cpu(ev->handle); + u32 c_sdu_interval, p_sdu_interval; bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); @@ -6684,12 +6692,25 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, pending = test_and_clear_bit(HCI_CONN_CREATE_CIS, &conn->flags); - /* Convert ISO Interval (1.25 ms slots) to SDU Interval (us) */ - qos->ucast.in.interval = le16_to_cpu(ev->interval) * 1250; - qos->ucast.out.interval = qos->ucast.in.interval; + /* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 6, Part G + * page 3075: + * Transport_Latency_C_To_P = CIG_Sync_Delay + (FT_C_To_P) × + * ISO_Interval + SDU_Interval_C_To_P + * ... 
+ * SDU_Interval = (CIG_Sync_Delay + (FT) x ISO_Interval) - + * Transport_Latency + */ + c_sdu_interval = (get_unaligned_le24(ev->cig_sync_delay) + + (ev->c_ft * le16_to_cpu(ev->interval) * 1250)) - + get_unaligned_le24(ev->c_latency); + p_sdu_interval = (get_unaligned_le24(ev->cig_sync_delay) + + (ev->p_ft * le16_to_cpu(ev->interval) * 1250)) - + get_unaligned_le24(ev->p_latency); switch (conn->role) { case HCI_ROLE_SLAVE: + qos->ucast.in.interval = c_sdu_interval; + qos->ucast.out.interval = p_sdu_interval; /* Convert Transport Latency (us) to Latency (msec) */ qos->ucast.in.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->c_latency), @@ -6703,6 +6724,8 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, qos->ucast.out.phy = ev->p_phy; break; case HCI_ROLE_MASTER: + qos->ucast.in.interval = p_sdu_interval; + qos->ucast.out.interval = c_sdu_interval; /* Convert Transport Latency (us) to Latency (msec) */ qos->ucast.out.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->c_latency), @@ -6893,6 +6916,10 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bis = hci_conn_hash_lookup_handle(hdev, handle); if (!bis) { + if (handle > HCI_CONN_HANDLE_MAX) { + bt_dev_dbg(hdev, "ignore too large handle %u", handle); + continue; + } bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY, HCI_ROLE_SLAVE, handle); if (IS_ERR(bis)) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index a8a7d2b36870..eea34e6a236f 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -280,6 +280,19 @@ int __hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, } EXPORT_SYMBOL(__hci_cmd_sync_status); +int hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout) +{ + int err; + + hci_req_sync_lock(hdev); + err = __hci_cmd_sync_status(hdev, opcode, plen, param, timeout); + hci_req_sync_unlock(hdev); + + return err; +} +EXPORT_SYMBOL(hci_cmd_sync_status); + static void hci_cmd_sync_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index cc055b952ce6..398fb81f7a13 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1356,8 +1356,7 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); switch (sk->sk_state) { case BT_CONNECT2: - if (pi->conn->hcon && - test_bit(HCI_CONN_PA_SYNC, &pi->conn->hcon->flags)) { + if (test_bit(BT_SK_PA_SYNC, &pi->flags)) { iso_conn_big_sync(sk); sk->sk_state = BT_LISTEN; } else { diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index aed025734d04..c3c26bbb5dda 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6761,6 +6761,8 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, BT_DBG("chan %p, len %d", chan, skb->len); + l2cap_chan_lock(chan); + if (chan->state != BT_BOUND && chan->state != BT_CONNECTED) goto drop; @@ -6777,6 +6779,7 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, } drop: + l2cap_chan_unlock(chan); l2cap_chan_put(chan); free_skb: kfree_skb(skb); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 6db60946c627..ba437c6f6ee5 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1239,6 +1239,10 @@ static void l2cap_sock_kill(struct sock *sk) BT_DBG("sk %p state %s", sk, state_to_string(sk->sk_state)); + /* Sock is dead, so set chan data to NULL, avoid other task use 
invalid + * sock pointer. + */ + l2cap_pi(sk)->chan->data = NULL; /* Kill poor orphan */ l2cap_chan_put(l2cap_pi(sk)->chan); @@ -1481,12 +1485,16 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { - struct sock *sk = chan->data; - struct l2cap_pinfo *pi = l2cap_pi(sk); + struct sock *sk; + struct l2cap_pinfo *pi; int err; - lock_sock(sk); + sk = chan->data; + if (!sk) + return -ENXIO; + pi = l2cap_pi(sk); + lock_sock(sk); if (chan->mode == L2CAP_MODE_ERTM && !list_empty(&pi->rx_busy)) { err = -ENOMEM; goto done; diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 1daf95e17d67..3a5bd1cd1e99 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -429,7 +429,10 @@ static int is_out(const struct crush_map *map, /** * crush_choose_firstn - choose numrep distinct items of given type * @map: the crush_map + * @work: working space initialized by crush_init_workspace() * @bucket: the bucket we are choose an item from + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector * @x: crush input value * @numrep: the number of items to choose * @type: the type of item to choose @@ -445,6 +448,7 @@ static int is_out(const struct crush_map *map, * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) * @parent_r: r value passed from the parent + * @choose_args: weights and ids for each known bucket */ static int crush_choose_firstn(const struct crush_map *map, struct crush_work *work, @@ -636,9 +640,8 @@ reject: } -/** +/* * crush_choose_indep: alternative breadth-first positionally stable mapping - * */ static void crush_choose_indep(const struct crush_map *map, struct crush_work *work, diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index f263f7e91a21..ab66b599ac47 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1085,13 +1085,19 @@ static void delayed_work(struct work_struct *work) struct ceph_mon_client *monc = container_of(work, struct ceph_mon_client, delayed_work.work); - dout("monc delayed_work\n"); mutex_lock(&monc->mutex); + dout("%s mon%d\n", __func__, monc->cur_mon); + if (monc->cur_mon < 0) { + goto out; + } + if (monc->hunting) { dout("%s continuing hunt\n", __func__); reopen_session(monc); } else { int is_auth = ceph_auth_is_authenticated(monc->auth); + + dout("%s is_authed %d\n", __func__, is_auth); if (ceph_con_keepalive_expired(&monc->con, CEPH_MONC_PING_TIMEOUT)) { dout("monc keepalive timeout\n"); @@ -1116,6 +1122,8 @@ static void delayed_work(struct work_struct *work) } } __schedule_delayed(monc); + +out: mutex_unlock(&monc->mutex); } @@ -1232,13 +1240,15 @@ EXPORT_SYMBOL(ceph_monc_init); void ceph_monc_stop(struct ceph_mon_client *monc) { dout("stop\n"); - cancel_delayed_work_sync(&monc->delayed_work); mutex_lock(&monc->mutex); __close_session(monc); + monc->hunting = false; monc->cur_mon = -1; mutex_unlock(&monc->mutex); + cancel_delayed_work_sync(&monc->delayed_work); + /* * flush msgr queue before we destroy ourselves to ensure that: * - any work that references our embedded con is finished. 
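The ceph_monc_stop() reordering above (reset monc->cur_mon and monc->hunting under the mutex before cancel_delayed_work_sync(), with delayed_work() bailing out once cur_mon is negative) follows a common teardown pattern for periodic workers. Below is a minimal userspace sketch of the same idea in plain C with pthreads; all names here (monitor_client, monc_stop, delayed_work) are illustrative stand-ins, not the kernel API.

/*
 * Sketch only: mark the client as stopped while holding its mutex,
 * have the periodic worker re-check that state under the same mutex
 * and bail out, and only then wait for the worker outside the lock.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct monitor_client {
	pthread_mutex_t mutex;
	pthread_t worker;
	int cur_mon;		/* -1 means "no session, shutting down" */
	bool hunting;
};

static void *delayed_work(void *arg)
{
	struct monitor_client *monc = arg;

	for (;;) {
		pthread_mutex_lock(&monc->mutex);
		/* Mirrors the new "if (monc->cur_mon < 0) goto out" guard. */
		if (monc->cur_mon < 0) {
			pthread_mutex_unlock(&monc->mutex);
			return NULL;
		}
		printf("tick: mon%d hunting=%d\n", monc->cur_mon, monc->hunting);
		pthread_mutex_unlock(&monc->mutex);
		usleep(100 * 1000);	/* stand-in for the delayed requeue */
	}
}

static void monc_stop(struct monitor_client *monc)
{
	/* Close the session and clear state under the lock first ... */
	pthread_mutex_lock(&monc->mutex);
	monc->hunting = false;
	monc->cur_mon = -1;
	pthread_mutex_unlock(&monc->mutex);

	/*
	 * ... then wait for the worker outside the lock, so it cannot
	 * observe half-torn-down state or requeue itself afterwards.
	 */
	pthread_join(monc->worker, NULL);
}

int main(void)
{
	struct monitor_client monc = {
		.mutex = PTHREAD_MUTEX_INITIALIZER,
		.cur_mon = 0,
		.hunting = true,
	};

	pthread_create(&monc.worker, NULL, delayed_work, &monc);
	usleep(350 * 1000);
	monc_stop(&monc);
	return 0;
}

Waiting for the worker (pthread_join here, cancel_delayed_work_sync() in the kernel patch) only after dropping the mutex avoids deadlocking against a worker instance that is itself blocked on that mutex.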
diff --git a/net/core/datagram.c b/net/core/datagram.c index e614cfd8e14a..e72dd78471a6 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -416,15 +416,23 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - struct page *page = skb_frag_page(frag); - u8 *vaddr = kmap(page); + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; if (copy > len) copy = len; - n = INDIRECT_CALL_1(cb, simple_copy_to_iter, - vaddr + skb_frag_off(frag) + offset - start, - copy, data, to); - kunmap(page); + + n = 0; + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_local_page(p); + n += INDIRECT_CALL_1(cb, simple_copy_to_iter, + vaddr + p_off, p_len, data, to); + kunmap_local(vaddr); + } + offset += n; if (n != copy) goto short_copy; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index fd20aae30be2..bbf40b999713 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -434,7 +434,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, page = sg_page(sge); if (copied + copy > len) copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (copy) + copy = copy_page_to_iter(page, sge->offset, copy, iter); if (!copy) { copied = copied ? copied : -EFAULT; goto out; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index e645d751a5e8..223dcd25d88a 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1306,7 +1306,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE) return -EINVAL; - if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && + if (rxfh.input_xfrm != RXH_XFRM_NO_CHANGE && + (rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && !ops->cap_rss_sym_xor_supported) return -EOPNOTSUPP; diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index b2de2108b356..34d76e87847d 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -37,6 +37,8 @@ static int linkstate_get_sqi(struct net_device *dev) mutex_lock(&phydev->lock); if (!phydev->drv || !phydev->drv->get_sqi) ret = -EOPNOTSUPP; + else if (!phydev->link) + ret = -ENETDOWN; else ret = phydev->drv->get_sqi(phydev); mutex_unlock(&phydev->lock); @@ -55,6 +57,8 @@ static int linkstate_get_sqi_max(struct net_device *dev) mutex_lock(&phydev->lock); if (!phydev->drv || !phydev->drv->get_sqi_max) ret = -EOPNOTSUPP; + else if (!phydev->link) + ret = -ENETDOWN; else ret = phydev->drv->get_sqi_max(phydev); mutex_unlock(&phydev->lock); @@ -62,6 +66,17 @@ static int linkstate_get_sqi_max(struct net_device *dev) return ret; }; +static bool linkstate_sqi_critical_error(int sqi) +{ + return sqi < 0 && sqi != -EOPNOTSUPP && sqi != -ENETDOWN; +} + +static bool linkstate_sqi_valid(struct linkstate_reply_data *data) +{ + return data->sqi >= 0 && data->sqi_max >= 0 && + data->sqi <= data->sqi_max; +} + static int linkstate_get_link_ext_state(struct net_device *dev, struct linkstate_reply_data *data) { @@ -93,12 +108,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, data->link = __ethtool_get_link(dev); ret = linkstate_get_sqi(dev); - if (ret < 0 && ret != -EOPNOTSUPP) + if (linkstate_sqi_critical_error(ret)) goto out; data->sqi = ret; ret = linkstate_get_sqi_max(dev); - if (ret < 0 && ret != -EOPNOTSUPP) + if (linkstate_sqi_critical_error(ret)) goto out; data->sqi_max = ret; @@ -136,11 +151,10 @@ 
static int linkstate_reply_size(const struct ethnl_req_info *req_base, len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + 0; - if (data->sqi != -EOPNOTSUPP) - len += nla_total_size(sizeof(u32)); - - if (data->sqi_max != -EOPNOTSUPP) - len += nla_total_size(sizeof(u32)); + if (linkstate_sqi_valid(data)) { + len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI */ + len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI_MAX */ + } if (data->link_ext_state_provided) len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ @@ -164,13 +178,14 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) return -EMSGSIZE; - if (data->sqi != -EOPNOTSUPP && - nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) - return -EMSGSIZE; + if (linkstate_sqi_valid(data)) { + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) + return -EMSGSIZE; - if (data->sqi_max != -EOPNOTSUPP && - nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) - return -EMSGSIZE; + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, + data->sqi_max)) + return -EMSGSIZE; + } if (data->link_ext_state_provided) { if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE, diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 7adace541fe2..9712cdb8087c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1383,6 +1383,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); req.idiag_ext = rc->idiag_ext; + req.pad = 0; req.idiag_states = rc->idiag_states; req.id = rc->id; @@ -1398,6 +1399,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, req.sdiag_family = rc->idiag_family; req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type); req.idiag_ext = rc->idiag_ext; + req.pad = 0; req.idiag_states = rc->idiag_states; req.id = rc->id; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2e39cb881e20..38da23f991d6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2129,8 +2129,16 @@ void tcp_clear_retrans(struct tcp_sock *tp) static inline void tcp_init_undo(struct tcp_sock *tp) { tp->undo_marker = tp->snd_una; + /* Retransmission still in flight may cause DSACKs later. */ - tp->undo_retrans = tp->retrans_out ? : -1; + /* First, account for regular retransmits in flight: */ + tp->undo_retrans = tp->retrans_out; + /* Next, account for TLP retransmits in flight: */ + if (tp->tlp_high_seq && tp->tlp_retrans) + tp->undo_retrans++; + /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */ + if (!tp->undo_retrans) + tp->undo_retrans = -1; } static bool tcp_is_rack(const struct sock *sk) @@ -2209,6 +2217,7 @@ void tcp_enter_loss(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; + tp->tlp_high_seq = 0; tcp_ecn_queue_cwr(tp); /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous @@ -3077,7 +3086,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, return; if (tcp_try_undo_dsack(sk)) - tcp_try_keep_open(sk); + tcp_try_to_open(sk, flag); tcp_identify_packet_loss(sk, ack_flag); if (icsk->icsk_ca_state != TCP_CA_Recovery) { @@ -4224,6 +4233,13 @@ void tcp_parse_options(const struct net *net, */ break; #endif +#ifdef CONFIG_TCP_AO + case TCPOPT_AO: + /* TCP AO has already been checked + * (see tcp_inbound_ao_hash()). 
+ */ + break; +#endif case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index e93df98de3f4..b01eb6d94413 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -619,6 +619,7 @@ static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, + [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32, }, /* Following attributes are not received for GET/DEL, * we keep them for reference */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5bfd76a31af6..892c86657fbc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -483,15 +483,26 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, const struct sk_buff *skb, u32 rtx_delta) { + const struct inet_connection_sock *icsk = inet_csk(sk); + u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); const struct tcp_sock *tp = tcp_sk(sk); - const int timeout = TCP_RTO_MAX * 2; + int timeout = TCP_RTO_MAX * 2; s32 rcv_delta; + if (user_timeout) { + /* If user application specified a TCP_USER_TIMEOUT, + * it does not want win 0 packets to 'reset the timer' + * while retransmits are not making progress. + */ + if (rtx_delta > user_timeout) + return true; + timeout = min_t(u32, timeout, msecs_to_jiffies(user_timeout)); + } /* Note: timer interrupt might have been delayed by at least one jiffy, * and tp->rcv_tstamp might very well have been written recently. * rcv_delta can thus be negative. */ - rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; + rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; @@ -536,8 +547,6 @@ void tcp_retransmit_timer(struct sock *sk) if (WARN_ON_ONCE(!skb)) return; - tp->tlp_high_seq = 0; - if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { /* Receiver dastardly shrinks window. 
Our retransmits diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 189c9113fe9a..578668878a85 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -326,6 +326,8 @@ found: goto fail_unlock; } + sock_set_flag(sk, SOCK_RCU_FREE); + sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -342,7 +344,7 @@ found: hslot2->count++; spin_unlock(&hslot2->lock); } - sock_set_flag(sk, SOCK_RCU_FREE); + error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 1132dea0e290..0965ad11ec74 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -423,6 +423,7 @@ u64 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata) BSS_CHANGED_ERP_SLOT; } +/* context: requires softirqs disabled */ void ieee80211_handle_queued_frames(struct ieee80211_local *local) { struct sk_buff *skb; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 963ed75deb76..771c05640aa3 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1567,7 +1567,9 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, void ieee80211_stop_device(struct ieee80211_local *local) { + local_bh_disable(); ieee80211_handle_queued_frames(local); + local_bh_enable(); ieee80211_led_radio(local, false); ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO); diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 9ab7396668d2..21b7c3b280b4 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -161,8 +161,10 @@ void ieee802154_configure_durations(struct wpan_phy *phy, } phy->symbol_duration = duration; - phy->lifs_period = (IEEE802154_LIFS_PERIOD * phy->symbol_duration) / NSEC_PER_SEC; - phy->sifs_period = (IEEE802154_SIFS_PERIOD * phy->symbol_duration) / NSEC_PER_SEC; + phy->lifs_period = + (IEEE802154_LIFS_PERIOD * phy->symbol_duration) / NSEC_PER_USEC; + phy->sifs_period = + (IEEE802154_SIFS_PERIOD * phy->symbol_duration) / NSEC_PER_USEC; } EXPORT_SYMBOL(ieee802154_configure_durations); @@ -184,10 +186,10 @@ static void ieee802154_setup_wpan_phy_pib(struct wpan_phy *wpan_phy) * Should be done when all drivers sets this value. 
*/ - wpan_phy->lifs_period = - (IEEE802154_LIFS_PERIOD * wpan_phy->symbol_duration) / 1000; - wpan_phy->sifs_period = - (IEEE802154_SIFS_PERIOD * wpan_phy->symbol_duration) / 1000; + wpan_phy->lifs_period = (IEEE802154_LIFS_PERIOD * + wpan_phy->symbol_duration) / NSEC_PER_USEC; + wpan_phy->sifs_period = (IEEE802154_SIFS_PERIOD * + wpan_phy->symbol_duration) / NSEC_PER_USEC; } int ieee802154_register_hw(struct ieee802154_hw *hw) diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 2a6f1ed763c9..6fbed5bb5c3e 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -34,8 +34,8 @@ void ieee802154_xmit_sync_worker(struct work_struct *work) if (res) goto err_tx; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; + DEV_STATS_INC(dev, tx_packets); + DEV_STATS_ADD(dev, tx_bytes, skb->len); ieee802154_xmit_complete(&local->hw, skb, false); @@ -90,8 +90,8 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) if (ret) goto err_wake_netif_queue; - dev->stats.tx_packets++; - dev->stats.tx_bytes += len; + DEV_STATS_INC(dev, tx_packets); + DEV_STATS_ADD(dev, tx_bytes, len); } else { local->tx_skb = skb; queue_work(local->workqueue, &local->sync_tx_work); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e8dcf41d360d..91cc3a81ba8f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3823,6 +3823,15 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *r nf_tables_rule_destroy(ctx, rule); } +/** nft_chain_validate - loop detection and hook validation + * + * @ctx: context containing call depth and base chain + * @chain: chain to validate + * + * Walk through the rules of the given chain and chase all jumps/gotos + * and set lookups until either the jump limit is hit or all reachable + * chains have been validated. + */ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) { struct nft_expr *expr, *last; @@ -3844,6 +3853,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) if (!expr->ops->validate) continue; + /* This may call nft_chain_validate() recursively, + * callers that do so must increment ctx->level. + */ err = expr->ops->validate(ctx, expr, &data); if (err < 0) return err; @@ -10809,150 +10821,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain, } EXPORT_SYMBOL_GPL(nft_chain_validate_hooks); -/* - * Loop detection - walk through the ruleset beginning at the destination chain - * of a new jump until either the source chain is reached (loop) or all - * reachable chains have been traversed. - * - * The loop check is performed whenever a new jump verdict is added to an - * expression or verdict map or a verdict map is bound to a new chain. 
- */ - -static int nf_tables_check_loops(const struct nft_ctx *ctx, - const struct nft_chain *chain); - -static int nft_check_loops(const struct nft_ctx *ctx, - const struct nft_set_ext *ext) -{ - const struct nft_data *data; - int ret; - - data = nft_set_ext_data(ext); - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - ret = nf_tables_check_loops(ctx, data->verdict.chain); - break; - default: - ret = 0; - break; - } - - return ret; -} - -static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter *iter, - struct nft_elem_priv *elem_priv) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); - - if (!nft_set_elem_active(ext, iter->genmask)) - return 0; - - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) - return 0; - - return nft_check_loops(ctx, ext); -} - -static int nft_set_catchall_loops(const struct nft_ctx *ctx, - struct nft_set *set) -{ - u8 genmask = nft_genmask_next(ctx->net); - struct nft_set_elem_catchall *catchall; - struct nft_set_ext *ext; - int ret = 0; - - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { - ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) - continue; - - ret = nft_check_loops(ctx, ext); - if (ret < 0) - return ret; - } - - return ret; -} - -static int nf_tables_check_loops(const struct nft_ctx *ctx, - const struct nft_chain *chain) -{ - const struct nft_rule *rule; - const struct nft_expr *expr, *last; - struct nft_set *set; - struct nft_set_binding *binding; - struct nft_set_iter iter; - - if (ctx->chain == chain) - return -ELOOP; - - if (fatal_signal_pending(current)) - return -EINTR; - - list_for_each_entry(rule, &chain->rules, list) { - nft_rule_for_each_expr(expr, last, rule) { - struct nft_immediate_expr *priv; - const struct nft_data *data; - int err; - - if (strcmp(expr->ops->type->name, "immediate")) - continue; - - priv = nft_expr_priv(expr); - if (priv->dreg != NFT_REG_VERDICT) - continue; - - data = &priv->data; - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - err = nf_tables_check_loops(ctx, - data->verdict.chain); - if (err < 0) - return err; - break; - default: - break; - } - } - } - - list_for_each_entry(set, &ctx->table->sets, list) { - if (!nft_is_active_next(ctx->net, set)) - continue; - if (!(set->flags & NFT_SET_MAP) || - set->dtype != NFT_DATA_VERDICT) - continue; - - list_for_each_entry(binding, &set->bindings, list) { - if (!(binding->flags & NFT_SET_MAP) || - binding->chain != chain) - continue; - - iter.genmask = nft_genmask_next(ctx->net); - iter.type = NFT_ITER_UPDATE; - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nf_tables_loop_check_setelem; - - set->ops->walk(ctx, set, &iter); - if (!iter.err) - iter.err = nft_set_catchall_loops(ctx, set); - - if (iter.err < 0) - return iter.err; - } - } - - return 0; -} - /** * nft_parse_u32_check - fetch u32 attribute and check for maximum value * @@ -11065,7 +10933,7 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, if (data != NULL && (data->verdict.code == NFT_GOTO || data->verdict.code == NFT_JUMP)) { - err = nf_tables_check_loops(ctx, data->verdict.chain); + err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; } @@ -11483,8 +11351,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, gc_seq = nft_gc_seq_begin(nft_net); - if (!list_empty(&nf_tables_destroy_list)) - 
nf_tables_trans_destroy_flush_work(); + nf_tables_trans_destroy_flush_work(); again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f1c31757e496..55e28e1da66e 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -325,7 +325,7 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) hooks = nf_hook_entries_head(net, pf, entry->state.hook); i = entry->hook_index; - if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) { + if (!hooks || i >= hooks->num_hook_entries) { kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); nf_queue_entry_free(entry); return; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 2a96d9c1db65..6fa3cca87d34 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1077,6 +1077,14 @@ do_nat: */ if (nf_conntrack_confirm(skb) != NF_ACCEPT) goto drop; + + /* The ct may be dropped if a clash has been resolved, + * so it's necessary to retrieve it from skb again to + * prevent UAF. + */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + skip_add = true; } if (!skip_add) diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index c2ef9dcf91d2..cc6051d4f2ef 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -91,7 +91,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, true, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, true); @@ -121,7 +121,7 @@ static void ingress_destroy(struct Qdisc *sch) tcf_block_put_ext(q->block, sch, &q->block_info); if (entry) { - tcx_miniq_set_active(entry, false); + tcx_miniq_dec(entry); if (!tcx_entry_is_active(entry)) { tcx_entry_update(dev, NULL, true); tcx_entry_free(entry); @@ -257,7 +257,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, true, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, true); @@ -276,7 +276,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, false, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, false); @@ -302,7 +302,7 @@ static void clsact_destroy(struct Qdisc *sch) tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); if (ingress_entry) { - tcx_miniq_set_active(ingress_entry, false); + tcx_miniq_dec(ingress_entry); if (!tcx_entry_is_active(ingress_entry)) { tcx_entry_update(dev, NULL, true); tcx_entry_free(ingress_entry); @@ -310,7 +310,7 @@ static void clsact_destroy(struct Qdisc *sch) } if (egress_entry) { - tcx_miniq_set_active(egress_entry, false); + tcx_miniq_dec(egress_entry); if (!tcx_entry_is_active(egress_entry)) { tcx_entry_update(dev, NULL, false); tcx_entry_free(egress_entry); diff --git a/net/socket.c b/net/socket.c index e416920e9399..fcbdd5bc47ac 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1822,6 +1822,20 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, return __sys_socketpair(family, type, protocol, 
usockvec); } +int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, + int addrlen) +{ + int err; + + err = security_socket_bind(sock, (struct sockaddr *)address, + addrlen); + if (!err) + err = READ_ONCE(sock->ops)->bind(sock, + (struct sockaddr *)address, + addrlen); + return err; +} + /* * Bind a name to a socket. Nothing much to do here since it's * the protocol's responsibility to handle the local address. @@ -1839,15 +1853,8 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { err = move_addr_to_kernel(umyaddr, addrlen, &address); - if (!err) { - err = security_socket_bind(sock, - (struct sockaddr *)&address, - addrlen); - if (!err) - err = READ_ONCE(sock->ops)->bind(sock, - (struct sockaddr *) - &address, addrlen); - } + if (!err) + err = __sys_bind_socket(sock, &address, addrlen); fput_light(sock->file, fput_needed); } return err; @@ -1863,23 +1870,28 @@ SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) * necessary for a listen, and if that works, we mark the socket as * ready for listening. */ +int __sys_listen_socket(struct socket *sock, int backlog) +{ + int somaxconn, err; + + somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); + if ((unsigned int)backlog > somaxconn) + backlog = somaxconn; + + err = security_socket_listen(sock, backlog); + if (!err) + err = READ_ONCE(sock->ops)->listen(sock, backlog); + return err; +} int __sys_listen(int fd, int backlog) { struct socket *sock; int err, fput_needed; - int somaxconn; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { - somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); - if ((unsigned int)backlog > somaxconn) - backlog = somaxconn; - - err = security_socket_listen(sock, backlog); - if (!err) - err = READ_ONCE(sock->ops)->listen(sock, backlog); - + err = __sys_listen_socket(sock, backlog); fput_light(sock->file, fput_needed); } return err; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index dfc353eea8ed..0e1691316f42 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2441,6 +2441,13 @@ static void xs_tcp_setup_socket(struct work_struct *work) transport->srcport = 0; status = -EAGAIN; break; + case -EPERM: + /* Happens, for instance, if a BPF program is preventing + * the connect. Remap the error so upper layers can better + * deal with it. + */ + status = -ECONNREFUSED; + fallthrough; case -EINVAL: /* Happens, for instance, if the user specified a link * local IPv6 address without a scope-id. diff --git a/net/unix/garbage.c b/net/unix/garbage.c index dfe94a90ece4..23efb78fe9ef 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -476,6 +476,7 @@ prev_vertex: } if (vertex->index == vertex->scc_index) { + struct unix_vertex *v; struct list_head scc; bool scc_dead = true; @@ -486,15 +487,15 @@ prev_vertex: */ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); - list_for_each_entry_reverse(vertex, &scc, scc_entry) { + list_for_each_entry_reverse(v, &scc, scc_entry) { /* Don't restart DFS from this vertex in unix_walk_scc(). */ - list_move_tail(&vertex->entry, &unix_visited_vertices); + list_move_tail(&v->entry, &unix_visited_vertices); /* Mark vertex as off-stack. 
*/ - vertex->index = unix_vertex_grouped_index; + v->index = unix_vertex_grouped_index; if (scc_dead) - scc_dead = unix_vertex_dead(vertex); + scc_dead = unix_vertex_dead(v); } if (scc_dead) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3c0bca4238d3..72c7bf558581 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -468,6 +468,10 @@ static const struct netlink_range_validation nl80211_punct_bitmap_range = { .max = 0xffff, }; +static const struct netlink_range_validation q_range = { + .max = INT_MAX, +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -754,7 +758,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, - [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, + [NL80211_ATTR_TXQ_QUANTUM] = NLA_POLICY_FULL_RANGE(NLA_U32, &q_range), [NL80211_ATTR_HE_CAPABILITY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_he_capa, NL80211_HE_MAX_CAPABILITY_LEN), diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index ddb5644d4fd9..6deee85a29c8 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -7,6 +7,9 @@ */ #include <kunit/test.h> +#include <linux/blk_types.h> +#include <linux/blk-mq.h> +#include <linux/blkdev.h> #include <linux/errname.h> #include <linux/ethtool.h> #include <linux/jiffies.h> @@ -20,8 +23,10 @@ /* `bindgen` gets confused at certain things. */ const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN; +const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE; const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC; const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL; const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT; const gfp_t RUST_CONST_HELPER_GFP_NOWAIT = GFP_NOWAIT; const gfp_t RUST_CONST_HELPER___GFP_ZERO = __GFP_ZERO; +const blk_features_t RUST_CONST_HELPER_BLK_FEAT_ROTATIONAL = BLK_FEAT_ROTATIONAL; diff --git a/rust/helpers.c b/rust/helpers.c index 2c37a0f5d7a8..3df5217fb2ff 100644 --- a/rust/helpers.c +++ b/rust/helpers.c @@ -186,3 +186,19 @@ static_assert( __alignof__(size_t) == __alignof__(uintptr_t), "Rust code expects C `size_t` to match Rust `usize`" ); + +// This will soon be moved to a separate file, so no need to merge with above. +#include <linux/blk-mq.h> +#include <linux/blkdev.h> + +void *rust_helper_blk_mq_rq_to_pdu(struct request *rq) +{ + return blk_mq_rq_to_pdu(rq); +} +EXPORT_SYMBOL_GPL(rust_helper_blk_mq_rq_to_pdu); + +struct request *rust_helper_blk_mq_rq_from_pdu(void *pdu) +{ + return blk_mq_rq_from_pdu(pdu); +} +EXPORT_SYMBOL_GPL(rust_helper_blk_mq_rq_from_pdu); diff --git a/rust/kernel/block.rs b/rust/kernel/block.rs new file mode 100644 index 000000000000..150f710efe5b --- /dev/null +++ b/rust/kernel/block.rs @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Types for working with the block layer. + +pub mod mq; diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs new file mode 100644 index 000000000000..fb0f393c1cea --- /dev/null +++ b/rust/kernel/block/mq.rs @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! This module provides types for implementing block drivers that interface the +//! blk-mq subsystem. +//! +//! To implement a block device driver, a Rust module must do the following: +//! +//! - Implement [`Operations`] for a type `T`. +//! 
- Create a [`TagSet<T>`].
+//! - Create a [`GenDisk<T>`], via the [`GenDiskBuilder`].
+//! - Add the disk to the system by calling [`GenDiskBuilder::build`] passing in
+//!   the `TagSet` reference.
+//!
+//! The types available in this module that have direct C counterparts are:
+//!
+//! - The [`TagSet`] type that abstracts the C type `struct blk_mq_tag_set`.
+//! - The [`GenDisk`] type that abstracts the C type `struct gendisk`.
+//! - The [`Request`] type that abstracts the C type `struct request`.
+//!
+//! The kernel will interface with the block device driver by calling the method
+//! implementations of the `Operations` trait.
+//!
+//! IO requests are passed to the driver as [`kernel::types::ARef<Request>`]
+//! instances. The `Request` type is a wrapper around the C `struct request`.
+//! The driver must mark end of processing by calling one of the
+//! `Request::end` methods. Failure to do so can lead to deadlock or timeout
+//! errors. Please note that the C function `blk_mq_start_request` is implicitly
+//! called when the request is queued with the driver.
+//!
+//! The `TagSet` is responsible for creating and maintaining a mapping between
+//! `Request`s and integer ids as well as carrying a pointer to the vtable
+//! generated by `Operations`. This mapping is useful for associating
+//! completions from hardware with the correct `Request` instance. The `TagSet`
+//! determines the maximum queue depth by setting the number of `Request`
+//! instances available to the driver, and it determines the number of queues to
+//! instantiate for the driver. If possible, a driver should allocate one queue
+//! per core, to keep queue data local to a core.
+//!
+//! One `TagSet` instance can be shared between multiple `GenDisk` instances.
+//! This can be useful when implementing drivers where one piece of hardware
+//! with one set of IO resources is represented to the user as multiple disks.
+//!
+//! One significant difference between block device drivers implemented with
+//! these Rust abstractions and drivers implemented in C is that the Rust
+//! drivers have to own a reference count on the `Request` type when the IO is
+//! in flight. This is to ensure that the C `struct request` instances backing
+//! the Rust `Request` instances are live while the Rust driver holds a
+//! reference to the `Request`. In addition, the conversion of an integer tag to
+//! a `Request` via the `TagSet` would not be sound without this bookkeeping.
+//!
+//! [`GenDisk`]: gen_disk::GenDisk
+//! [`GenDisk<T>`]: gen_disk::GenDisk
+//! [`GenDiskBuilder`]: gen_disk::GenDiskBuilder
+//! [`GenDiskBuilder::build`]: gen_disk::GenDiskBuilder::build
+//!
+//! # Example
+//!
+//! ```rust
+//! use kernel::{
+//!     alloc::flags,
+//!     block::mq::*,
+//!     new_mutex,
+//!     prelude::*,
+//!     sync::{Arc, Mutex},
+//!     types::{ARef, ForeignOwnable},
+//! };
+//!
+//! struct MyBlkDevice;
+//!
+//! #[vtable]
+//! impl Operations for MyBlkDevice {
+//!
+//!     fn queue_rq(rq: ARef<Request<Self>>, _is_last: bool) -> Result {
+//!         Request::end_ok(rq);
+//!         Ok(())
+//!     }
+//!
+//!     fn commit_rqs() {}
+//! }
+//!
+//! let tagset: Arc<TagSet<MyBlkDevice>> =
+//!     Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;
+//! let mut disk = gen_disk::GenDiskBuilder::new()
+//!     .capacity_sectors(4096)
+//!     .build(format_args!("myblk"), tagset)?;
+//!
+//! # Ok::<(), kernel::error::Error>(())
+//! 
``` + +pub mod gen_disk; +mod operations; +mod raw_writer; +mod request; +mod tag_set; + +pub use operations::Operations; +pub use request::Request; +pub use tag_set::TagSet; diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs new file mode 100644 index 000000000000..f548a6199847 --- /dev/null +++ b/rust/kernel/block/mq/gen_disk.rs @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Generic disk abstraction. +//! +//! C header: [`include/linux/blkdev.h`](srctree/include/linux/blkdev.h) +//! C header: [`include/linux/blk_mq.h`](srctree/include/linux/blk_mq.h) + +use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet}; +use crate::error; +use crate::{bindings, error::from_err_ptr, error::Result, sync::Arc}; +use core::fmt::{self, Write}; + +/// A builder for [`GenDisk`]. +/// +/// Use this struct to configure and add new [`GenDisk`] to the VFS. +pub struct GenDiskBuilder { + rotational: bool, + logical_block_size: u32, + physical_block_size: u32, + capacity_sectors: u64, +} + +impl Default for GenDiskBuilder { + fn default() -> Self { + Self { + rotational: false, + logical_block_size: bindings::PAGE_SIZE as u32, + physical_block_size: bindings::PAGE_SIZE as u32, + capacity_sectors: 0, + } + } +} + +impl GenDiskBuilder { + /// Create a new instance. + pub fn new() -> Self { + Self::default() + } + + /// Set the rotational media attribute for the device to be built. + pub fn rotational(mut self, rotational: bool) -> Self { + self.rotational = rotational; + self + } + + /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`, + /// and that it is a power of two. + fn validate_block_size(size: u32) -> Result<()> { + if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() { + Err(error::code::EINVAL) + } else { + Ok(()) + } + } + + /// Set the logical block size of the device to be built. + /// + /// This method will check that block size is a power of two and between 512 + /// and 4096. If not, an error is returned and the block size is not set. + /// + /// This is the smallest unit the storage device can address. It is + /// typically 4096 bytes. + pub fn logical_block_size(mut self, block_size: u32) -> Result<Self> { + Self::validate_block_size(block_size)?; + self.logical_block_size = block_size; + Ok(self) + } + + /// Set the physical block size of the device to be built. + /// + /// This method will check that block size is a power of two and between 512 + /// and 4096. If not, an error is returned and the block size is not set. + /// + /// This is the smallest unit a physical storage device can write + /// atomically. It is usually the same as the logical block size but may be + /// bigger. One example is SATA drives with 4096 byte physical block size + /// that expose a 512 byte logical block size to the operating system. + pub fn physical_block_size(mut self, block_size: u32) -> Result<Self> { + Self::validate_block_size(block_size)?; + self.physical_block_size = block_size; + Ok(self) + } + + /// Set the capacity of the device to be built, in sectors (512 bytes). + pub fn capacity_sectors(mut self, capacity: u64) -> Self { + self.capacity_sectors = capacity; + self + } + + /// Build a new `GenDisk` and add it to the VFS. + pub fn build<T: Operations>( + self, + name: fmt::Arguments<'_>, + tagset: Arc<TagSet<T>>, + ) -> Result<GenDisk<T>> { + let lock_class_key = crate::sync::LockClassKey::new(); + + // SAFETY: `bindings::queue_limits` contain only fields that are valid when zeroed. 
+ let mut lim: bindings::queue_limits = unsafe { core::mem::zeroed() }; + + lim.logical_block_size = self.logical_block_size; + lim.physical_block_size = self.physical_block_size; + if self.rotational { + lim.features = bindings::BLK_FEAT_ROTATIONAL; + } + + // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set + let gendisk = from_err_ptr(unsafe { + bindings::__blk_mq_alloc_disk( + tagset.raw_tag_set(), + &mut lim, + core::ptr::null_mut(), + lock_class_key.as_ptr(), + ) + })?; + + const TABLE: bindings::block_device_operations = bindings::block_device_operations { + submit_bio: None, + open: None, + release: None, + ioctl: None, + compat_ioctl: None, + check_events: None, + unlock_native_capacity: None, + getgeo: None, + set_read_only: None, + swap_slot_free_notify: None, + report_zones: None, + devnode: None, + alternative_gpt_sector: None, + get_unique_id: None, + // TODO: Set to THIS_MODULE. Waiting for const_refs_to_static feature to + // be merged (unstable in rustc 1.78 which is staged for linux 6.10) + // https://github.com/rust-lang/rust/issues/119618 + owner: core::ptr::null_mut(), + pr_ops: core::ptr::null_mut(), + free_disk: None, + poll_bio: None, + }; + + // SAFETY: `gendisk` is a valid pointer as we initialized it above + unsafe { (*gendisk).fops = &TABLE }; + + let mut raw_writer = RawWriter::from_array( + // SAFETY: `gendisk` points to a valid and initialized instance. We + // have exclusive access, since the disk is not added to the VFS + // yet. + unsafe { &mut (*gendisk).disk_name }, + )?; + raw_writer.write_fmt(name)?; + raw_writer.write_char('\0')?; + + // SAFETY: `gendisk` points to a valid and initialized instance of + // `struct gendisk`. `set_capacity` takes a lock to synchronize this + // operation, so we will not race. + unsafe { bindings::set_capacity(gendisk, self.capacity_sectors) }; + + crate::error::to_result( + // SAFETY: `gendisk` points to a valid and initialized instance of + // `struct gendisk`. + unsafe { + bindings::device_add_disk(core::ptr::null_mut(), gendisk, core::ptr::null_mut()) + }, + )?; + + // INVARIANT: `gendisk` was initialized above. + // INVARIANT: `gendisk` was added to the VFS via `device_add_disk` above. + Ok(GenDisk { + _tagset: tagset, + gendisk, + }) + } +} + +/// A generic block device. +/// +/// # Invariants +/// +/// - `gendisk` must always point to an initialized and valid `struct gendisk`. +/// - `gendisk` was added to the VFS through a call to +/// `bindings::device_add_disk`. +pub struct GenDisk<T: Operations> { + _tagset: Arc<TagSet<T>>, + gendisk: *mut bindings::gendisk, +} + +// SAFETY: `GenDisk` is an owned pointer to a `struct gendisk` and an `Arc` to a +// `TagSet` It is safe to send this to other threads as long as T is Send. +unsafe impl<T: Operations + Send> Send for GenDisk<T> {} + +impl<T: Operations> Drop for GenDisk<T> { + fn drop(&mut self) { + // SAFETY: By type invariant, `self.gendisk` points to a valid and + // initialized instance of `struct gendisk`, and it was previously added + // to the VFS. + unsafe { bindings::del_gendisk(self.gendisk) }; + } +} diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs new file mode 100644 index 000000000000..9ba7fdfeb4b2 --- /dev/null +++ b/rust/kernel/block/mq/operations.rs @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! This module provides an interface for blk-mq drivers to implement. +//! +//! 
C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)
+
+use crate::{
+    bindings,
+    block::mq::request::RequestDataWrapper,
+    block::mq::Request,
+    error::{from_result, Result},
+    types::ARef,
+};
+use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering};
+
+/// Implement this trait to interface blk-mq as block devices.
+///
+/// To implement a block device driver, implement this trait as described in the
+/// [module level documentation]. The kernel will use the implementation of the
+/// functions defined in this trait to interface a block device driver. Note:
+/// There is no need for an exit_request() implementation, because the `drop`
+/// implementation of the [`Request`] type will be invoked automatically by
+/// the C/Rust glue logic.
+///
+/// [module level documentation]: kernel::block::mq
+#[macros::vtable]
+pub trait Operations: Sized {
+    /// Called by the kernel to queue a request with the driver. If `is_last` is
+    /// `false`, the driver is allowed to defer committing the request.
+    fn queue_rq(rq: ARef<Request<Self>>, is_last: bool) -> Result;
+
+    /// Called by the kernel to indicate that queued requests should be submitted.
+    fn commit_rqs();
+
+    /// Called by the kernel to poll the device for completed requests. Only
+    /// used for poll queues.
+    fn poll() -> bool {
+        crate::build_error(crate::error::VTABLE_DEFAULT_ERROR)
+    }
+}
+
+/// A vtable for blk-mq to interact with a block device driver.
+///
+/// A `bindings::blk_mq_ops` vtable is constructed from pointers to the `extern
+/// "C"` functions of this struct, exposed through the `OperationsVTable::VTABLE`.
+///
+/// For general documentation of these methods, see the kernel source
+/// documentation related to `struct blk_mq_ops` in
+/// [`include/linux/blk-mq.h`].
+///
+/// [`include/linux/blk-mq.h`]: srctree/include/linux/blk-mq.h
+pub(crate) struct OperationsVTable<T: Operations>(PhantomData<T>);
+
+impl<T: Operations> OperationsVTable<T> {
+    /// This function is called by the C kernel. A pointer to this function is
+    /// installed in the `blk_mq_ops` vtable for the driver.
+    ///
+    /// # Safety
+    ///
+    /// - The caller of this function must ensure that the pointee of `bd` is
+    ///   valid for reads for the duration of this function.
+    /// - This function must be called for an initialized and live `hctx`. That
+    ///   is, `Self::init_hctx_callback` was called and
+    ///   `Self::exit_hctx_callback()` was not yet called.
+    /// - `(*bd).rq` must point to an initialized and live `bindings::request`.
+    ///   That is, `Self::init_request_callback` was called but
+    ///   `Self::exit_request_callback` was not yet called for the request.
+    /// - `(*bd).rq` must be owned by the driver. That is, the block layer must
+    ///   promise to not access the request until the driver calls
+    ///   `bindings::blk_mq_end_request` for the request.
+    unsafe extern "C" fn queue_rq_callback(
+        _hctx: *mut bindings::blk_mq_hw_ctx,
+        bd: *const bindings::blk_mq_queue_data,
+    ) -> bindings::blk_status_t {
+        // SAFETY: `bd.rq` is valid as required by the safety requirement for
+        // this function.
+        let request = unsafe { &*(*bd).rq.cast::<Request<T>>() };
+
+        // One refcount for the ARef, one for being in flight
+        request.wrapper_ref().refcount().store(2, Ordering::Relaxed);
+
+        // SAFETY:
+        // - We own a refcount that we took above. We pass that to `ARef`.
+        // - By the safety requirements of this function, `request` is a valid
+        //   `struct request` and the private data is properly initialized.
+ // - `rq` will be alive until `blk_mq_end_request` is called and is + // reference counted by `ARef` until then. + let rq = unsafe { Request::aref_from_raw((*bd).rq) }; + + // SAFETY: We have exclusive access and we just set the refcount above. + unsafe { Request::start_unchecked(&rq) }; + + let ret = T::queue_rq( + rq, + // SAFETY: `bd` is valid as required by the safety requirement for + // this function. + unsafe { (*bd).last }, + ); + + if let Err(e) = ret { + e.to_blk_status() + } else { + bindings::BLK_STS_OK as _ + } + } + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// This function may only be called by blk-mq C infrastructure. + unsafe extern "C" fn commit_rqs_callback(_hctx: *mut bindings::blk_mq_hw_ctx) { + T::commit_rqs() + } + + /// This function is called by the C kernel. It is not currently + /// implemented, and there is no way to exercise this code path. + /// + /// # Safety + /// + /// This function may only be called by blk-mq C infrastructure. + unsafe extern "C" fn complete_callback(_rq: *mut bindings::request) {} + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// This function may only be called by blk-mq C infrastructure. + unsafe extern "C" fn poll_callback( + _hctx: *mut bindings::blk_mq_hw_ctx, + _iob: *mut bindings::io_comp_batch, + ) -> core::ffi::c_int { + T::poll().into() + } + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// This function may only be called by blk-mq C infrastructure. This + /// function may only be called once before `exit_hctx_callback` is called + /// for the same context. + unsafe extern "C" fn init_hctx_callback( + _hctx: *mut bindings::blk_mq_hw_ctx, + _tagset_data: *mut core::ffi::c_void, + _hctx_idx: core::ffi::c_uint, + ) -> core::ffi::c_int { + from_result(|| Ok(0)) + } + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// This function may only be called by blk-mq C infrastructure. + unsafe extern "C" fn exit_hctx_callback( + _hctx: *mut bindings::blk_mq_hw_ctx, + _hctx_idx: core::ffi::c_uint, + ) { + } + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// - This function may only be called by blk-mq C infrastructure. + /// - `_set` must point to an initialized `TagSet<T>`. + /// - `rq` must point to an initialized `bindings::request`. + /// - The allocation pointed to by `rq` must be at the size of `Request` + /// plus the size of `RequestDataWrapper`. + unsafe extern "C" fn init_request_callback( + _set: *mut bindings::blk_mq_tag_set, + rq: *mut bindings::request, + _hctx_idx: core::ffi::c_uint, + _numa_node: core::ffi::c_uint, + ) -> core::ffi::c_int { + from_result(|| { + // SAFETY: By the safety requirements of this function, `rq` points + // to a valid allocation. + let pdu = unsafe { Request::wrapper_ptr(rq.cast::<Request<T>>()) }; + + // SAFETY: The refcount field is allocated but not initialized, so + // it is valid for writes. 
+ unsafe { RequestDataWrapper::refcount_ptr(pdu.as_ptr()).write(AtomicU64::new(0)) }; + + Ok(0) + }) + } + + /// This function is called by the C kernel. A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. + /// + /// # Safety + /// + /// - This function may only be called by blk-mq C infrastructure. + /// - `_set` must point to an initialized `TagSet<T>`. + /// - `rq` must point to an initialized and valid `Request`. + unsafe extern "C" fn exit_request_callback( + _set: *mut bindings::blk_mq_tag_set, + rq: *mut bindings::request, + _hctx_idx: core::ffi::c_uint, + ) { + // SAFETY: The tagset invariants guarantee that all requests are allocated with extra memory + // for the request data. + let pdu = unsafe { bindings::blk_mq_rq_to_pdu(rq) }.cast::<RequestDataWrapper>(); + + // SAFETY: `pdu` is valid for read and write and is properly initialised. + unsafe { core::ptr::drop_in_place(pdu) }; + } + + const VTABLE: bindings::blk_mq_ops = bindings::blk_mq_ops { + queue_rq: Some(Self::queue_rq_callback), + queue_rqs: None, + commit_rqs: Some(Self::commit_rqs_callback), + get_budget: None, + put_budget: None, + set_rq_budget_token: None, + get_rq_budget_token: None, + timeout: None, + poll: if T::HAS_POLL { + Some(Self::poll_callback) + } else { + None + }, + complete: Some(Self::complete_callback), + init_hctx: Some(Self::init_hctx_callback), + exit_hctx: Some(Self::exit_hctx_callback), + init_request: Some(Self::init_request_callback), + exit_request: Some(Self::exit_request_callback), + cleanup_rq: None, + busy: None, + map_queues: None, + #[cfg(CONFIG_BLK_DEBUG_FS)] + show_rq: None, + }; + + pub(crate) const fn build() -> &'static bindings::blk_mq_ops { + &Self::VTABLE + } +} diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs new file mode 100644 index 000000000000..9222465d670b --- /dev/null +++ b/rust/kernel/block/mq/raw_writer.rs @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +use core::fmt::{self, Write}; + +use crate::error::Result; +use crate::prelude::EINVAL; + +/// A mutable reference to a byte buffer where a string can be written into. +/// +/// # Invariants +/// +/// `buffer` is always null terminated. +pub(crate) struct RawWriter<'a> { + buffer: &'a mut [u8], + pos: usize, +} + +impl<'a> RawWriter<'a> { + /// Create a new `RawWriter` instance. + fn new(buffer: &'a mut [u8]) -> Result<RawWriter<'a>> { + *(buffer.last_mut().ok_or(EINVAL)?) = 0; + + // INVARIANT: We null terminated the buffer above. + Ok(Self { buffer, pos: 0 }) + } + + pub(crate) fn from_array<const N: usize>( + a: &'a mut [core::ffi::c_char; N], + ) -> Result<RawWriter<'a>> { + Self::new( + // SAFETY: the buffer of `a` is valid for read and write as `u8` for + // at least `N` bytes. + unsafe { core::slice::from_raw_parts_mut(a.as_mut_ptr().cast::<u8>(), N) }, + ) + } +} + +impl Write for RawWriter<'_> { + fn write_str(&mut self, s: &str) -> fmt::Result { + let bytes = s.as_bytes(); + let len = bytes.len(); + + // We do not want to overwrite our null terminator + if self.pos + len > self.buffer.len() - 1 { + return Err(fmt::Error); + } + + // INVARIANT: We are not overwriting the last byte + self.buffer[self.pos..self.pos + len].copy_from_slice(bytes); + + self.pos += len; + + Ok(()) + } +} diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs new file mode 100644 index 000000000000..a0e22827f3f4 --- /dev/null +++ b/rust/kernel/block/mq/request.rs @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! 
This module provides a wrapper for the C `struct request` type. +//! +//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h) + +use crate::{ + bindings, + block::mq::Operations, + error::Result, + types::{ARef, AlwaysRefCounted, Opaque}, +}; +use core::{ + marker::PhantomData, + ptr::{addr_of_mut, NonNull}, + sync::atomic::{AtomicU64, Ordering}, +}; + +/// A wrapper around a blk-mq `struct request`. This represents an IO request. +/// +/// # Implementation details +/// +/// There are four states for a request that the Rust bindings care about: +/// +/// A) Request is owned by block layer (refcount 0) +/// B) Request is owned by driver but with zero `ARef`s in existence +/// (refcount 1) +/// C) Request is owned by driver with exactly one `ARef` in existence +/// (refcount 2) +/// D) Request is owned by driver with more than one `ARef` in existence +/// (refcount > 2) +/// +/// +/// We need to track A and B to ensure we fail tag to request conversions for +/// requests that are not owned by the driver. +/// +/// We need to track C and D to ensure that it is safe to end the request and hand +/// back ownership to the block layer. +/// +/// The states are tracked through the private `refcount` field of +/// `RequestDataWrapper`. This structure lives in the private data area of the C +/// `struct request`. +/// +/// # Invariants +/// +/// * `self.0` is a valid `struct request` created by the C portion of the kernel. +/// * The private data area associated with this request must be an initialized +/// and valid `RequestDataWrapper<T>`. +/// * `self` is reference counted by atomic modification of +/// self.wrapper_ref().refcount(). +/// +#[repr(transparent)] +pub struct Request<T: Operations>(Opaque<bindings::request>, PhantomData<T>); + +impl<T: Operations> Request<T> { + /// Create an `ARef<Request>` from a `struct request` pointer. + /// + /// # Safety + /// + /// * The caller must own a refcount on `ptr` that is transferred to the + /// returned `ARef`. + /// * The type invariants for `Request` must hold for the pointee of `ptr`. + pub(crate) unsafe fn aref_from_raw(ptr: *mut bindings::request) -> ARef<Self> { + // INVARIANT: By the safety requirements of this function, invariants are upheld. + // SAFETY: By the safety requirement of this function, we own a + // reference count that we can pass to `ARef`. + unsafe { ARef::from_raw(NonNull::new_unchecked(ptr as *const Self as *mut Self)) } + } + + /// Notify the block layer that a request is going to be processed now. + /// + /// The block layer uses this hook to do proper initializations such as + /// starting the timeout timer. It is a requirement that block device + /// drivers call this function when starting to process a request. + /// + /// # Safety + /// + /// The caller must have exclusive ownership of `self`, that is + /// `self.wrapper_ref().refcount() == 2`. + pub(crate) unsafe fn start_unchecked(this: &ARef<Self>) { + // SAFETY: By type invariant, `self.0` is a valid `struct request` and + // we have exclusive access. + unsafe { bindings::blk_mq_start_request(this.0.get()) }; + } + + /// Try to take exclusive ownership of `this` by dropping the refcount to 0. + /// This fails if `this` is not the only `ARef` pointing to the underlying + /// `Request`. + /// + /// If the operation is successful, `Ok` is returned with a pointer to the + /// C `struct request`. If the operation fails, `this` is returned in the + /// `Err` variant. 
+ fn try_set_end(this: ARef<Self>) -> Result<*mut bindings::request, ARef<Self>> { + // We can race with `TagSet::tag_to_rq` + if let Err(_old) = this.wrapper_ref().refcount().compare_exchange( + 2, + 0, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + return Err(this); + } + + let request_ptr = this.0.get(); + core::mem::forget(this); + + Ok(request_ptr) + } + + /// Notify the block layer that the request has been completed without errors. + /// + /// This function will return `Err` if `this` is not the only `ARef` + /// referencing the request. + pub fn end_ok(this: ARef<Self>) -> Result<(), ARef<Self>> { + let request_ptr = Self::try_set_end(this)?; + + // SAFETY: By type invariant, `this.0` was a valid `struct request`. The + // success of the call to `try_set_end` guarantees that there are no + // `ARef`s pointing to this request. Therefore it is safe to hand it + // back to the block layer. + unsafe { bindings::blk_mq_end_request(request_ptr, bindings::BLK_STS_OK as _) }; + + Ok(()) + } + + /// Return a pointer to the `RequestDataWrapper` stored in the private area + /// of the request structure. + /// + /// # Safety + /// + /// - `this` must point to a valid allocation of size at least size of + /// `Self` plus size of `RequestDataWrapper`. + pub(crate) unsafe fn wrapper_ptr(this: *mut Self) -> NonNull<RequestDataWrapper> { + let request_ptr = this.cast::<bindings::request>(); + // SAFETY: By safety requirements for this function, `this` is a + // valid allocation. + let wrapper_ptr = + unsafe { bindings::blk_mq_rq_to_pdu(request_ptr).cast::<RequestDataWrapper>() }; + // SAFETY: By C API contract, wrapper_ptr points to a valid allocation + // and is not null. + unsafe { NonNull::new_unchecked(wrapper_ptr) } + } + + /// Return a reference to the `RequestDataWrapper` stored in the private + /// area of the request structure. + pub(crate) fn wrapper_ref(&self) -> &RequestDataWrapper { + // SAFETY: By type invariant, `self.0` is a valid allocation. Further, + // the private data associated with this request is initialized and + // valid. The existence of `&self` guarantees that the private data is + // valid as a shared reference. + unsafe { Self::wrapper_ptr(self as *const Self as *mut Self).as_ref() } + } +} + +/// A wrapper around data stored in the private area of the C `struct request`. +pub(crate) struct RequestDataWrapper { + /// The Rust request refcount has the following states: + /// + /// - 0: The request is owned by C block layer. + /// - 1: The request is owned by Rust abstractions but there are no ARef references to it. + /// - 2+: There are `ARef` references to the request. + refcount: AtomicU64, +} + +impl RequestDataWrapper { + /// Return a reference to the refcount of the request that is embedding + /// `self`. + pub(crate) fn refcount(&self) -> &AtomicU64 { + &self.refcount + } + + /// Return a pointer to the refcount of the request that is embedding the + /// pointee of `this`. + /// + /// # Safety + /// + /// - `this` must point to a live allocation of at least the size of `Self`. + pub(crate) unsafe fn refcount_ptr(this: *mut Self) -> *mut AtomicU64 { + // SAFETY: Because of the safety requirements of this function, the + // field projection is safe. + unsafe { addr_of_mut!((*this).refcount) } + } +} + +// SAFETY: Exclusive access is thread-safe for `Request`. `Request` has no `&mut +// self` methods and `&self` methods that mutate `self` are internally +// synchronized. 
+unsafe impl<T: Operations> Send for Request<T> {}
+
+// SAFETY: Shared access is thread-safe for `Request`. `&self` methods that
+// mutate `self` are internally synchronized.
+unsafe impl<T: Operations> Sync for Request<T> {}
+
+/// Store the result of `op(target.load())` in `target`, returning the new
+/// value of `target`.
+fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 {
+    let old = target.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| Some(op(x)));
+
+    // SAFETY: Because the operation passed to `fetch_update` above always
+    // returns `Some`, `old` will always be `Ok`.
+    let old = unsafe { old.unwrap_unchecked() };
+
+    op(old)
+}
+
+/// Store the result of `op(target.load())` in `target` if `target.load() !=
+/// pred`, returning `true` if the target was updated.
+fn atomic_relaxed_op_unless(target: &AtomicU64, op: impl Fn(u64) -> u64, pred: u64) -> bool {
+    target
+        .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| {
+            if x == pred {
+                None
+            } else {
+                Some(op(x))
+            }
+        })
+        .is_ok()
+}
+
+// SAFETY: All instances of `Request<T>` are reference counted. This
+// implementation of `AlwaysRefCounted` ensures that increments to the ref count
+// keep the object alive in memory at least until a matching reference count
+// decrement is executed.
+unsafe impl<T: Operations> AlwaysRefCounted for Request<T> {
+    fn inc_ref(&self) {
+        let refcount = &self.wrapper_ref().refcount();
+
+        #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))]
+        let updated = atomic_relaxed_op_unless(refcount, |x| x + 1, 0);
+
+        #[cfg(CONFIG_DEBUG_MISC)]
+        if !updated {
+            panic!("Request refcount zero on clone")
+        }
+    }
+
+    unsafe fn dec_ref(obj: core::ptr::NonNull<Self>) {
+        // SAFETY: The type invariants of `ARef` guarantee that `obj` is valid
+        // for read.
+        let wrapper_ptr = unsafe { Self::wrapper_ptr(obj.as_ptr()).as_ptr() };
+        // SAFETY: The type invariant of `Request` guarantees that the private
+        // data area is initialized and valid.
+        let refcount = unsafe { &*RequestDataWrapper::refcount_ptr(wrapper_ptr) };
+
+        #[cfg_attr(not(CONFIG_DEBUG_MISC), allow(unused_variables))]
+        let new_refcount = atomic_relaxed_op_return(refcount, |x| x - 1);
+
+        #[cfg(CONFIG_DEBUG_MISC)]
+        if new_refcount == 0 {
+            panic!("Request reached refcount zero in Rust abstractions");
+        }
+    }
+}
diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs
new file mode 100644
index 000000000000..f9a1ca655a35
--- /dev/null
+++ b/rust/kernel/block/mq/tag_set.rs
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! This module provides the `TagSet` struct to wrap the C `struct blk_mq_tag_set`.
+//!
+//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)
+
+use core::pin::Pin;
+
+use crate::{
+    bindings,
+    block::mq::{operations::OperationsVTable, request::RequestDataWrapper, Operations},
+    error,
+    prelude::PinInit,
+    try_pin_init,
+    types::Opaque,
+};
+use core::{convert::TryInto, marker::PhantomData};
+use macros::{pin_data, pinned_drop};
+
+/// A wrapper for the C `struct blk_mq_tag_set`.
+///
+/// `struct blk_mq_tag_set` contains a `struct list_head` and so must be pinned.
+///
+/// # Invariants
+///
+/// - `inner` is initialized and valid.
+#[pin_data(PinnedDrop)] +#[repr(transparent)] +pub struct TagSet<T: Operations> { + #[pin] + inner: Opaque<bindings::blk_mq_tag_set>, + _p: PhantomData<T>, +} + +impl<T: Operations> TagSet<T> { + /// Try to create a new tag set + pub fn new( + nr_hw_queues: u32, + num_tags: u32, + num_maps: u32, + ) -> impl PinInit<Self, error::Error> { + // SAFETY: `blk_mq_tag_set` only contains integers and pointers, which + // all are allowed to be 0. + let tag_set: bindings::blk_mq_tag_set = unsafe { core::mem::zeroed() }; + let tag_set = core::mem::size_of::<RequestDataWrapper>() + .try_into() + .map(|cmd_size| { + bindings::blk_mq_tag_set { + ops: OperationsVTable::<T>::build(), + nr_hw_queues, + timeout: 0, // 0 means default which is 30Hz in C + numa_node: bindings::NUMA_NO_NODE, + queue_depth: num_tags, + cmd_size, + flags: bindings::BLK_MQ_F_SHOULD_MERGE, + driver_data: core::ptr::null_mut::<core::ffi::c_void>(), + nr_maps: num_maps, + ..tag_set + } + }); + + try_pin_init!(TagSet { + inner <- PinInit::<_, error::Error>::pin_chain(Opaque::new(tag_set?), |tag_set| { + // SAFETY: we do not move out of `tag_set`. + let tag_set = unsafe { Pin::get_unchecked_mut(tag_set) }; + // SAFETY: `tag_set` is a reference to an initialized `blk_mq_tag_set`. + error::to_result( unsafe { bindings::blk_mq_alloc_tag_set(tag_set.get())}) + }), + _p: PhantomData, + }) + } + + /// Return the pointer to the wrapped `struct blk_mq_tag_set` + pub(crate) fn raw_tag_set(&self) -> *mut bindings::blk_mq_tag_set { + self.inner.get() + } +} + +#[pinned_drop] +impl<T: Operations> PinnedDrop for TagSet<T> { + fn drop(self: Pin<&mut Self>) { + // SAFETY: By type invariant `inner` is valid and has been properly + // initialized during construction. + unsafe { bindings::blk_mq_free_tag_set(self.inner.get()) }; + } +} diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs index 55280ae9fe40..145f5c397009 100644 --- a/rust/kernel/error.rs +++ b/rust/kernel/error.rs @@ -126,6 +126,12 @@ impl Error { self.0 } + #[cfg(CONFIG_BLOCK)] + pub(crate) fn to_blk_status(self) -> bindings::blk_status_t { + // SAFETY: `self.0` is a valid error due to its invariant. + unsafe { bindings::errno_to_blk_status(self.0) } + } + /// Returns the error encoded as a pointer. #[allow(dead_code)] pub(crate) fn to_ptr<T>(self) -> *mut T { diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index fbd91a48ff8b..2cf7c6b6f66b 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -27,6 +27,8 @@ compile_error!("Missing kernel configuration for conditional compilation"); extern crate self as kernel; pub mod alloc; +#[cfg(CONFIG_BLOCK)] +pub mod block; mod build_assert; pub mod error; pub mod init; diff --git a/scripts/ld-version.sh b/scripts/ld-version.sh index a78b804b680c..b9513d224476 100755 --- a/scripts/ld-version.sh +++ b/scripts/ld-version.sh @@ -57,9 +57,11 @@ else fi fi -# Some distributions append a package release number, as in 2.34-4.fc32 -# Trim the hyphen and any characters that follow. -version=${version%-*} +# There may be something after the version, such as a distribution's package +# release number (like Fedora's "2.34-4.fc32") or punctuation (like LLD briefly +# added before the "compatible with GNU linkers" string), so remove everything +# after just numbers and periods. 
+version=${version%%[!0-9.]*} cversion=$(get_canonical_version $version) min_cversion=$(get_canonical_version $min_version) diff --git a/scripts/package/kernel.spec b/scripts/package/kernel.spec index fffc8af8deb1..c52d517b9364 100644 --- a/scripts/package/kernel.spec +++ b/scripts/package/kernel.spec @@ -83,7 +83,6 @@ ln -fns /usr/src/kernels/%{KERNELRELEASE} %{buildroot}/lib/modules/%{KERNELRELEA done if [ -d "%{buildroot}/lib/modules/%{KERNELRELEASE}/dtb" ];then - echo "/lib/modules/%{KERNELRELEASE}/dtb" find "%{buildroot}/lib/modules/%{KERNELRELEASE}/dtb" -printf "%%%ghost /boot/dtb-%{KERNELRELEASE}/%%P\n" fi diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index abdd22007ed8..e4a79a9b2d58 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -427,8 +427,6 @@ static void __init remove_securityfs_measurement_lists(struct dentry **lists) kfree(lists); } - - securityfs_measurement_list_count = 0; } static int __init create_securityfs_measurement_lists(void) @@ -625,6 +623,7 @@ out: securityfs_remove(binary_runtime_measurements); remove_securityfs_measurement_lists(ascii_securityfs_measurement_lists); remove_securityfs_measurement_lists(binary_securityfs_measurement_lists); + securityfs_measurement_list_count = 0; securityfs_remove(ima_symlink); securityfs_remove(ima_dir); diff --git a/sound/pci/hda/cs35l41_hda_property.c b/sound/pci/hda/cs35l41_hda_property.c index 51998d1c72ff..80c816922f78 100644 --- a/sound/pci/hda/cs35l41_hda_property.c +++ b/sound/pci/hda/cs35l41_hda_property.c @@ -128,8 +128,8 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "17AA38B5", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38B6", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, { "17AA38B7", 2, EXTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, 0, 0 }, 0, 1, -1, 0, 0, 0 }, - { "17AA38C7", 4, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT }, 0, 2, -1, 1000, 4500, 24 }, - { "17AA38C8", 4, INTERNAL, { CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT }, 0, 2, -1, 1000, 4500, 24 }, + { "17AA38C7", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, + { "17AA38C8", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, { "17AA38F9", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, { "17AA38FA", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, {} diff --git a/sound/pci/hda/hda_controller.c b/sound/pci/hda/hda_controller.c index 766734dc5be2..5d86e5a9c814 100644 --- a/sound/pci/hda/hda_controller.c +++ b/sound/pci/hda/hda_controller.c @@ -463,7 +463,8 @@ static int azx_get_sync_time(ktime_t *device, *device = ktime_add_ns(*device, (wallclk_cycles * NSEC_PER_SEC) / ((HDA_MAX_CYCLE_VALUE + 1) * runtime->rate)); - *system = convert_art_to_tsc(tsc_counter); + system->cycles = tsc_counter; + system->cs_id = CSID_X86_ART; return 0; } diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 811e82474200..766f0b1d3e9d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10053,6 +10053,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x83b9, "HP Spectre x360", ALC269_FIXUP_HP_MUTE_LED_MIC3), SND_PCI_QUIRK(0x103c, 0x841c, "HP Pavilion 15-CK0xx", ALC269_FIXUP_HP_MUTE_LED_MIC3), SND_PCI_QUIRK(0x103c, 0x8497, "HP Envy x360", 
ALC269_FIXUP_HP_MUTE_LED_MIC3), + SND_PCI_QUIRK(0x103c, 0x84a6, "HP 250 G7 Notebook PC", ALC269_FIXUP_HP_LINE1_MIC1_LED), SND_PCI_QUIRK(0x103c, 0x84ae, "HP 15-db0403ng", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x84da, "HP OMEN dc0019-ur", ALC295_FIXUP_HP_OMEN), SND_PCI_QUIRK(0x103c, 0x84e7, "HP Pavilion 15", ALC269_FIXUP_HP_MUTE_LED_MIC3), @@ -10383,6 +10384,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x10cf, 0x1845, "Lifebook U904", ALC269_FIXUP_LIFEBOOK_EXTMIC), SND_PCI_QUIRK(0x10ec, 0x10f2, "Intel Reference board", ALC700_FIXUP_INTEL_REFERENCE), SND_PCI_QUIRK(0x10ec, 0x118c, "Medion EE4254 MD62100", ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE), + SND_PCI_QUIRK(0x10ec, 0x11bc, "VAIO VJFE-IL", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x10ec, 0x1230, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x10ec, 0x124c, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x10ec, 0x1252, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), @@ -10480,6 +10482,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0xa600, "Clevo NL50NU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xa650, "Clevo NP[567]0SN[CD]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xa671, "Clevo NP70SN[CDE]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0xa763, "Clevo V54x_6x_TU", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xb018, "Clevo NP50D[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xb019, "Clevo NH77D[BE]Q", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xb022, "Clevo NH77D[DC][QW]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), @@ -10655,6 +10658,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1945, "Redmi G", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1947, "RedmiBook Air", ALC255_FIXUP_XIAOMI_HEADSET_MIC), + SND_PCI_QUIRK(0x2782, 0x0214, "VAIO VJFE-CL", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x2782, 0x0232, "CHUWI CoreBook XPro", ALC269VB_FIXUP_CHUWI_COREBOOK_XPRO), SND_PCI_QUIRK(0x2782, 0x1707, "Vaio VJFE-ADL", ALC298_FIXUP_SPK_VOLUME), SND_PCI_QUIRK(0x8086, 0x2074, "Intel NUC 8", ALC233_FIXUP_INTEL_NUC8_DMIC), diff --git a/sound/soc/codecs/rt711-sdw.c b/sound/soc/codecs/rt711-sdw.c index 8ca8bcd177ab..dfda6bb5c6f8 100644 --- a/sound/soc/codecs/rt711-sdw.c +++ b/sound/soc/codecs/rt711-sdw.c @@ -38,7 +38,9 @@ static bool rt711_readable_register(struct device *dev, unsigned int reg) case 0x8300 ... 0x83ff: case 0x9c00 ... 0x9cff: case 0xb900 ... 
0xb9ff: + case 0x752008: case 0x752009: + case 0x75200b: case 0x752011: case 0x75201a: case 0x752045: diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c index c61d298ea6b3..1c823f9eea57 100644 --- a/sound/soc/sof/intel/hda-dai.c +++ b/sound/soc/sof/intel/hda-dai.c @@ -617,12 +617,6 @@ static int hda_dai_suspend(struct hdac_bus *bus) sdai = swidget->private; ops = sdai->platform_private; - ret = hda_link_dma_cleanup(hext_stream->link_substream, - hext_stream, - cpu_dai); - if (ret < 0) - return ret; - /* for consistency with TRIGGER_SUSPEND */ if (ops->post_trigger) { ret = ops->post_trigger(sdev, cpu_dai, @@ -631,6 +625,12 @@ static int hda_dai_suspend(struct hdac_bus *bus) if (ret < 0) return ret; } + + ret = hda_link_dma_cleanup(hext_stream->link_substream, + hext_stream, + cpu_dai); + if (ret < 0) + return ret; } } diff --git a/sound/soc/sof/intel/hda-pcm.c b/sound/soc/sof/intel/hda-pcm.c index 9fb8521b896b..f6e24edd7adb 100644 --- a/sound/soc/sof/intel/hda-pcm.c +++ b/sound/soc/sof/intel/hda-pcm.c @@ -258,6 +258,12 @@ int hda_dsp_pcm_open(struct snd_sof_dev *sdev, snd_pcm_hw_constraint_integer(substream->runtime, SNDRV_PCM_HW_PARAM_PERIODS); + /* Limit the maximum number of periods to not exceed the BDL entries count */ + if (runtime->hw.periods_max > HDA_DSP_MAX_BDL_ENTRIES) + snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_PERIODS, + runtime->hw.periods_min, + HDA_DSP_MAX_BDL_ENTRIES); + /* Only S16 and S32 supported by HDA hardware when used without DSP */ if (sdev->dspless_mode_selected) snd_pcm_hw_constraint_mask64(substream->runtime, SNDRV_PCM_HW_PARAM_FORMAT, diff --git a/tools/include/nolibc/stdint.h b/tools/include/nolibc/stdint.h index 6665e272e213..cd79ddd6170e 100644 --- a/tools/include/nolibc/stdint.h +++ b/tools/include/nolibc/stdint.h @@ -96,6 +96,10 @@ typedef uint64_t uintmax_t; #define UINT_FAST32_MAX SIZE_MAX #define UINT_FAST64_MAX UINT64_MAX +#define INTMAX_MIN INT64_MIN +#define INTMAX_MAX INT64_MAX +#define UINTMAX_MAX UINT64_MAX + #ifndef INT_MIN #define INT_MIN (-__INT_MAX__ - 1) #endif @@ -110,4 +114,19 @@ typedef uint64_t uintmax_t; #define LONG_MAX __LONG_MAX__ #endif +#ifndef ULONG_MAX +#define ULONG_MAX ((unsigned long)(__LONG_MAX__) * 2 + 1) +#endif + +#ifndef LLONG_MIN +#define LLONG_MIN (-__LONG_LONG_MAX__ - 1) +#endif +#ifndef LLONG_MAX +#define LLONG_MAX __LONG_LONG_MAX__ +#endif + +#ifndef ULLONG_MAX +#define ULLONG_MAX ((unsigned long long)(__LONG_LONG_MAX__) * 2 + 1) +#endif + #endif /* _NOLIBC_STDINT_H */ diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 16cd4d807251..c968dbbc4ef8 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -376,6 +376,16 @@ int setvbuf(FILE *stream __attribute__((unused)), return 0; } +static __attribute__((unused)) +const char *strerror(int errno) +{ + static char buf[18] = "errno="; + + i64toa_r(errno, &buf[6]); + + return buf; +} + /* make sure to include all global symbols */ #include "nolibc.h" diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 5be9d3c7435a..75aa273c23a6 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -438,6 +438,115 @@ char *u64toa(uint64_t in) return itoa_buffer; } +static __attribute__((unused)) +uintmax_t __strtox(const char *nptr, char **endptr, int base, intmax_t lower_limit, uintmax_t upper_limit) +{ + const char signed_ = lower_limit != 0; + unsigned char neg = 0, overflow = 0; + uintmax_t val = 0, limit, old_val; + char c; + + if (base < 0 
|| base > 36) { + SET_ERRNO(EINVAL); + goto out; + } + + while (isspace(*nptr)) + nptr++; + + if (*nptr == '+') { + nptr++; + } else if (*nptr == '-') { + neg = 1; + nptr++; + } + + if (signed_ && neg) + limit = -(uintmax_t)lower_limit; + else + limit = upper_limit; + + if ((base == 0 || base == 16) && + (strncmp(nptr, "0x", 2) == 0 || strncmp(nptr, "0X", 2) == 0)) { + base = 16; + nptr += 2; + } else if (base == 0 && strncmp(nptr, "0", 1) == 0) { + base = 8; + nptr += 1; + } else if (base == 0) { + base = 10; + } + + while (*nptr) { + c = *nptr; + + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' && c <= 'z') + c = c - 'a' + 10; + else if (c >= 'A' && c <= 'Z') + c = c - 'A' + 10; + else + goto out; + + if (c >= base) + goto out; + + nptr++; + old_val = val; + val *= base; + val += c; + + if (val > limit || val < old_val) + overflow = 1; + } + +out: + if (overflow) { + SET_ERRNO(ERANGE); + val = limit; + } + if (endptr) + *endptr = (char *)nptr; + return neg ? -val : val; +} + +static __attribute__((unused)) +long strtol(const char *nptr, char **endptr, int base) +{ + return __strtox(nptr, endptr, base, LONG_MIN, LONG_MAX); +} + +static __attribute__((unused)) +unsigned long strtoul(const char *nptr, char **endptr, int base) +{ + return __strtox(nptr, endptr, base, 0, ULONG_MAX); +} + +static __attribute__((unused)) +long long strtoll(const char *nptr, char **endptr, int base) +{ + return __strtox(nptr, endptr, base, LLONG_MIN, LLONG_MAX); +} + +static __attribute__((unused)) +unsigned long long strtoull(const char *nptr, char **endptr, int base) +{ + return __strtox(nptr, endptr, base, 0, ULLONG_MAX); +} + +static __attribute__((unused)) +intmax_t strtoimax(const char *nptr, char **endptr, int base) +{ + return __strtox(nptr, endptr, base, INTMAX_MIN, INTMAX_MAX); +} + +static __attribute__((unused)) +uintmax_t strtoumax(const char *nptr, char **endptr, int base) +{ + return __strtox(nptr, endptr, base, 0, UINTMAX_MAX); +} + /* make sure to include all global symbols */ #include "nolibc.h" diff --git a/tools/memory-model/Documentation/README b/tools/memory-model/Documentation/README index db90a26dbdf4..304162743a5b 100644 --- a/tools/memory-model/Documentation/README +++ b/tools/memory-model/Documentation/README @@ -47,6 +47,10 @@ DESCRIPTION OF FILES README This file. +access-marking.txt + Guidelines for marking intentionally concurrent accesses to + shared memory. + cheatsheet.txt Quick-reference guide to the Linux-kernel memory model. diff --git a/tools/memory-model/Documentation/access-marking.txt b/tools/memory-model/Documentation/access-marking.txt index 65778222183e..3fbe77fd564a 100644 --- a/tools/memory-model/Documentation/access-marking.txt +++ b/tools/memory-model/Documentation/access-marking.txt @@ -6,7 +6,8 @@ normal accesses to shared memory, that is "normal" as in accesses that do not use read-modify-write atomic operations. It also describes how to document these accesses, both with comments and with special assertions processed by the Kernel Concurrency Sanitizer (KCSAN). This discussion -builds on an earlier LWN article [1]. +builds on an earlier LWN article [1] and Linux Foundation mentorship +session [2]. ACCESS-MARKING OPTIONS @@ -24,6 +25,11 @@ The Linux kernel provides the following access-marking options: 4. WRITE_ONCE(), for example, "WRITE_ONCE(a, b);" The various forms of atomic_set() also fit in here. +5. __data_racy, for example "int __data_racy a;" + +6. 
KCSAN's negative-marking assertions, ASSERT_EXCLUSIVE_ACCESS() + and ASSERT_EXCLUSIVE_WRITER(), are described in the + "ACCESS-DOCUMENTATION OPTIONS" section below. These may be used in combination, as shown in this admittedly improbable example: @@ -31,7 +37,7 @@ example: WRITE_ONCE(a, b + data_race(c + d) + READ_ONCE(e)); Neither plain C-language accesses nor data_race() (#1 and #2 above) place -any sort of constraint on the compiler's choice of optimizations [2]. +any sort of constraint on the compiler's choice of optimizations [3]. In contrast, READ_ONCE() and WRITE_ONCE() (#3 and #4 above) restrict the compiler's use of code-motion and common-subexpression optimizations. Therefore, if a given access is involved in an intentional data race, @@ -205,6 +211,23 @@ because doing otherwise prevents KCSAN from detecting violations of your code's synchronization rules. +Use of __data_racy +------------------ + +Adding the __data_racy type qualifier to the declaration of a variable +causes KCSAN to treat all accesses to that variable as if they were +enclosed by data_race(). However, __data_racy does not affect the +compiler, though one could imagine hardened kernel builds treating the +__data_racy type qualifier as if it was the volatile keyword. + +Note well that __data_racy is subject to the same pointer-declaration +rules as are other type qualifiers such as const and volatile. +For example: + + int __data_racy *p; // Pointer to data-racy data. + int *__data_racy p; // Data-racy pointer to non-data-racy data. + + ACCESS-DOCUMENTATION OPTIONS ============================ @@ -342,7 +365,7 @@ as follows: Because foo is read locklessly, all accesses are marked. The purpose of the ASSERT_EXCLUSIVE_WRITER() is to allow KCSAN to check for a buggy -concurrent lockless write. +concurrent write, whether marked or not. Lock-Protected Writes With Heuristic Lockless Reads @@ -594,5 +617,8 @@ REFERENCES [1] "Concurrency bugs should fear the big bad data-race detector (part 2)" https://lwn.net/Articles/816854/ -[2] "Who's afraid of a big bad optimizing compiler?" +[2] "The Kernel Concurrency Sanitizer" + https://www.linuxfoundation.org/webinars/the-kernel-concurrency-sanitizer + +[3] "Who's afraid of a big bad optimizing compiler?" https://lwn.net/Articles/793253/ diff --git a/tools/memory-model/lock.cat b/tools/memory-model/lock.cat index 53b5a492739d..03c12efed66a 100644 --- a/tools/memory-model/lock.cat +++ b/tools/memory-model/lock.cat @@ -54,6 +54,12 @@ flag ~empty LKR \ domain(lk-rmw) as unpaired-LKR *) empty ([LKW] ; po-loc ; [LKR]) \ (po-loc ; [UL] ; po-loc) as lock-nest +(* + * In the same way, spin_is_locked() inside a critical section must always + * return True (no RU events can be in a critical section for the same lock). + *) +empty ([LKW] ; po-loc ; [RU]) \ (po-loc ; [UL] ; po-loc) as nested-is-locked + (* The final value of a spinlock should not be tested *) flag ~empty [FW] ; loc ; [ALL-LOCKS] as lock-final @@ -79,42 +85,50 @@ empty ([UNMATCHED-LKW] ; loc ; [UNMATCHED-LKW]) \ id as unmatched-locks (* rfi for LF events: link each LKW to the LF events in its critical section *) let rfi-lf = ([LKW] ; po-loc ; [LF]) \ ([LKW] ; po-loc ; [UL] ; po-loc) -(* rfe for LF events *) +(* Utility macro to convert a single pair to a single-edge relation *) +let pair-to-relation p = p ++ 0 + +(* + * If a given LF event e is outside a critical section, it cannot read + * internally but it may read from an LKW event in another thread. + * Compute the relation containing these possible edges. 
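 * For example, e may be the read generated by a spin_trylock() that fails
 * because another CPU currently holds the lock; such a read can observe the
 * LKW event of the lock holder's acquisition.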
+ *) +let possible-rfe-noncrit-lf e = (LKW * {e}) & loc & ext + +(* Compute set of sets of possible rfe edges for LF events *) let all-possible-rfe-lf = (* - * Given an LF event r, compute the possible rfe edges for that event - * (all those starting from LKW events in other threads), - * and then convert that relation to a set of single-edge relations. + * Convert the possible-rfe-noncrit-lf relation for e + * to a set of single edges *) - let possible-rfe-lf r = - let pair-to-relation p = p ++ 0 - in map pair-to-relation ((LKW * {r}) & loc & ext) - (* Do this for each LF event r that isn't in rfi-lf *) - in map possible-rfe-lf (LF \ range(rfi-lf)) + let set-of-singleton-rfe-lf e = + map pair-to-relation (possible-rfe-noncrit-lf e) + (* Do this for each LF event e that isn't in rfi-lf *) + in map set-of-singleton-rfe-lf (LF \ range(rfi-lf)) (* Generate all rf relations for LF events *) with rfe-lf from cross(all-possible-rfe-lf) let rf-lf = rfe-lf | rfi-lf (* - * RU, i.e., spin_is_locked() returning False, is slightly different. - * We rely on the memory model to rule out cases where spin_is_locked() - * within one of the lock's critical sections returns False. + * A given RU event e may read internally from the last po-previous UL, + * or it may read from a UL event in another thread or the initial write. + * Compute the relation containing these possible edges. *) - -(* rfi for RU events: an RU may read from the last po-previous UL *) -let rfi-ru = ([UL] ; po-loc ; [RU]) \ ([UL] ; po-loc ; [LKW] ; po-loc) - -(* rfe for RU events: an RU may read from an external UL or the initial write *) -let all-possible-rfe-ru = - let possible-rfe-ru r = - let pair-to-relation p = p ++ 0 - in map pair-to-relation (((UL | IW) * {r}) & loc & ext) - in map possible-rfe-ru RU +let possible-rf-ru e = (((UL * {e}) & po-loc) \ + ([UL] ; po-loc ; [UL] ; po-loc)) | + (((UL | IW) * {e}) & loc & ext) + +(* Compute set of sets of possible rf edges for RU events *) +let all-possible-rf-ru = + (* Convert the possible-rf-ru relation for e to a set of single edges *) + let set-of-singleton-rf-ru e = + map pair-to-relation (possible-rf-ru e) + (* Do this for each RU event e *) + in map set-of-singleton-rf-ru RU (* Generate all rf relations for RU events *) -with rfe-ru from cross(all-possible-rfe-ru) -let rf-ru = rfe-ru | rfi-ru +with rf-ru from cross(all-possible-rf-ru) (* Final rf relation *) let rf = rf | rf-lf | rf-ru diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c index 233f2b6edf52..49b79cf0c5cc 100644 --- a/tools/perf/util/comm.c +++ b/tools/perf/util/comm.c @@ -86,14 +86,6 @@ static struct comm_str *comm_str__new(const char *str) return result; } -static int comm_str__cmp(const void *_lhs, const void *_rhs) -{ - const struct comm_str *lhs = *(const struct comm_str * const *)_lhs; - const struct comm_str *rhs = *(const struct comm_str * const *)_rhs; - - return strcmp(comm_str__str(lhs), comm_str__str(rhs)); -} - static int comm_str__search(const void *_key, const void *_member) { const char *key = _key; @@ -169,9 +161,24 @@ static struct comm_str *comm_strs__findnew(const char *str) } result = comm_str__new(str); if (result) { - comm_strs->strs[comm_strs->num_strs++] = result; - qsort(comm_strs->strs, comm_strs->num_strs, sizeof(struct comm_str *), - comm_str__cmp); + int low = 0, high = comm_strs->num_strs - 1; + int insert = comm_strs->num_strs; /* Default to inserting at the end. 
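		 * The loop below is a leftmost binary search: on exit, 'insert'
		 * is the first index whose entry compares >= str (or num_strs
		 * if no such entry exists), so the array stays sorted without
		 * needing the removed qsort() pass.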
*/ + + while (low <= high) { + int mid = low + (high - low) / 2; + int cmp = strcmp(comm_str__str(comm_strs->strs[mid]), str); + + if (cmp < 0) { + low = mid + 1; + } else { + high = mid - 1; + insert = mid; + } + } + memmove(&comm_strs->strs[insert + 1], &comm_strs->strs[insert], + (comm_strs->num_strs - insert) * sizeof(struct comm_str *)); + comm_strs->num_strs++; + comm_strs->strs[insert] = result; } } up_write(&comm_strs->lock); diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c index ab3d0c01dd63..a69a9c661200 100644 --- a/tools/perf/util/dsos.c +++ b/tools/perf/util/dsos.c @@ -203,11 +203,27 @@ int __dsos__add(struct dsos *dsos, struct dso *dso) dsos->dsos = temp; dsos->allocated = to_allocate; } - dsos->dsos[dsos->cnt++] = dso__get(dso); - if (dsos->cnt >= 2 && dsos->sorted) { - dsos->sorted = dsos__cmp_long_name_id_short_name(&dsos->dsos[dsos->cnt - 2], - &dsos->dsos[dsos->cnt - 1]) - <= 0; + if (!dsos->sorted) { + dsos->dsos[dsos->cnt++] = dso__get(dso); + } else { + int low = 0, high = dsos->cnt - 1; + int insert = dsos->cnt; /* Default to inserting at the end. */ + + while (low <= high) { + int mid = low + (high - low) / 2; + int cmp = dsos__cmp_long_name_id_short_name(&dsos->dsos[mid], &dso); + + if (cmp < 0) { + low = mid + 1; + } else { + high = mid - 1; + insert = mid; + } + } + memmove(&dsos->dsos[insert + 1], &dsos->dsos[insert], + (dsos->cnt - insert) * sizeof(struct dso *)); + dsos->cnt++; + dsos->dsos[insert] = dso__get(dso); } dso__set_dsos(dso, dsos); return 0; diff --git a/tools/rcu/rcu-updaters.sh b/tools/rcu/rcu-updaters.sh new file mode 100755 index 000000000000..4ef1397927bb --- /dev/null +++ b/tools/rcu/rcu-updaters.sh @@ -0,0 +1,52 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Run bpftrace to obtain a histogram of the types of primitives used to +# initiate RCU grace periods. The count associated with rcu_gp_init() +# is the number of normal (non-expedited) grace periods. +# +# Usage: rcu-updaters.sh [ duration-in-seconds ] +# +# Note that not all kernel builds have all of these functions. In those +# that do not, this script will issue a diagnostic for each that is not +# found, but continue normally for the rest of the functions. + +duration=${1} +if test -n "${duration}" +then + exitclause='interval:s:'"${duration}"' { exit(); }' +else + echo 'Hit control-C to end sample and print results.' 
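	# With no duration argument the bpftrace command below runs until
	# interrupted; otherwise ${exitclause} appends an interval probe that
	# calls exit() after the requested number of seconds.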
+fi +bpftrace -e 'kprobe:kvfree_call_rcu, + kprobe:call_rcu, + kprobe:call_rcu_tasks, + kprobe:call_rcu_tasks_rude, + kprobe:call_rcu_tasks_trace, + kprobe:call_srcu, + kprobe:rcu_barrier, + kprobe:rcu_barrier_tasks, + kprobe:rcu_barrier_tasks_rude, + kprobe:rcu_barrier_tasks_trace, + kprobe:srcu_barrier, + kprobe:synchronize_rcu, + kprobe:synchronize_rcu_expedited, + kprobe:synchronize_rcu_tasks, + kprobe:synchronize_rcu_tasks_rude, + kprobe:synchronize_rcu_tasks_trace, + kprobe:synchronize_srcu, + kprobe:synchronize_srcu_expedited, + kprobe:get_state_synchronize_rcu, + kprobe:get_state_synchronize_rcu_full, + kprobe:start_poll_synchronize_rcu, + kprobe:start_poll_synchronize_rcu_expedited, + kprobe:start_poll_synchronize_rcu_full, + kprobe:start_poll_synchronize_rcu_expedited_full, + kprobe:poll_state_synchronize_rcu, + kprobe:poll_state_synchronize_rcu_full, + kprobe:cond_synchronize_rcu, + kprobe:cond_synchronize_rcu_full, + kprobe:start_poll_synchronize_srcu, + kprobe:poll_state_synchronize_srcu, + kprobe:rcu_gp_init + { @counts[func] = count(); } '"${exitclause}" diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 3482248aa344..90d5afd52dd0 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -630,11 +630,15 @@ static struct cxl_hdm *mock_cxl_setup_hdm(struct cxl_port *port, struct cxl_endpoint_dvsec_info *info) { struct cxl_hdm *cxlhdm = devm_kzalloc(&port->dev, sizeof(*cxlhdm), GFP_KERNEL); + struct device *dev = &port->dev; if (!cxlhdm) return ERR_PTR(-ENOMEM); cxlhdm->port = port; + cxlhdm->interleave_mask = ~0U; + cxlhdm->iw_cap_mask = ~0UL; + dev_set_drvdata(dev, cxlhdm); return cxlhdm; } diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index eeabd798bc3a..98b6b6a886ce 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -58,9 +58,12 @@ CONFIG_MPLS=y CONFIG_MPLS_IPTUNNEL=y CONFIG_MPLS_ROUTING=y CONFIG_MPTCP=y +CONFIG_NET_ACT_SKBMOD=y +CONFIG_NET_CLS=y CONFIG_NET_CLS_ACT=y CONFIG_NET_CLS_BPF=y CONFIG_NET_CLS_FLOWER=y +CONFIG_NET_CLS_MATCHALL=y CONFIG_NET_FOU=y CONFIG_NET_FOU_IP_TUNNELS=y CONFIG_NET_IPGRE=y diff --git a/tools/testing/selftests/bpf/prog_tests/tc_links.c b/tools/testing/selftests/bpf/prog_tests/tc_links.c index bc9841144685..1af9ec1149aa 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_links.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_links.c @@ -9,6 +9,8 @@ #define ping_cmd "ping -q -c1 -w1 127.0.0.1 > /dev/null" #include "test_tc_link.skel.h" + +#include "netlink_helpers.h" #include "tc_helpers.h" void serial_test_tc_links_basic(void) @@ -1787,6 +1789,65 @@ void serial_test_tc_links_ingress(void) test_tc_links_ingress(BPF_TCX_INGRESS, false, false); } +struct qdisc_req { + struct nlmsghdr n; + struct tcmsg t; + char buf[1024]; +}; + +static int qdisc_replace(int ifindex, const char *kind, bool block) +{ + struct rtnl_handle rth = { .fd = -1 }; + struct qdisc_req req; + int err; + + err = rtnl_open(&rth, 0); + if (!ASSERT_OK(err, "open_rtnetlink")) + return err; + + memset(&req, 0, sizeof(req)); + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.n.nlmsg_flags = NLM_F_CREATE | NLM_F_REPLACE | NLM_F_REQUEST; + req.n.nlmsg_type = RTM_NEWQDISC; + req.t.tcm_family = AF_UNSPEC; + req.t.tcm_ifindex = ifindex; + req.t.tcm_parent = 0xfffffff1; + + addattr_l(&req.n, sizeof(req), TCA_KIND, kind, strlen(kind) + 1); + if (block) + addattr32(&req.n, sizeof(req), TCA_INGRESS_BLOCK, 1); + + err = rtnl_talk(&rth, &req.n, NULL); + 
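	/*
	 * tcm_parent above is 0xfffffff1, i.e. TC_H_INGRESS (TC_H_CLSACT), so
	 * this request creates or replaces the device's ingress/clsact qdisc,
	 * optionally bound to shared ingress block 1.
	 */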
ASSERT_OK(err, "talk_rtnetlink"); + rtnl_close(&rth); + return err; +} + +void serial_test_tc_links_dev_chain0(void) +{ + int err, ifindex; + + ASSERT_OK(system("ip link add dev foo type veth peer name bar"), "add veth"); + ifindex = if_nametoindex("foo"); + ASSERT_NEQ(ifindex, 0, "non_zero_ifindex"); + err = qdisc_replace(ifindex, "ingress", true); + if (!ASSERT_OK(err, "attaching ingress")) + goto cleanup; + ASSERT_OK(system("tc filter add block 1 matchall action skbmod swap mac"), "add block"); + err = qdisc_replace(ifindex, "clsact", false); + if (!ASSERT_OK(err, "attaching clsact")) + goto cleanup; + /* Heuristic: kern_sync_rcu() alone does not work; a wait-time of ~5s + * triggered the issue without the fix reliably 100% of the time. + */ + sleep(5); + ASSERT_OK(system("tc filter add dev foo ingress matchall action skbmod swap mac"), "add filter"); +cleanup: + ASSERT_OK(system("ip link del dev foo"), "del veth"); + ASSERT_EQ(if_nametoindex("foo"), 0, "foo removed"); + ASSERT_EQ(if_nametoindex("bar"), 0, "bar removed"); +} + static void test_tc_links_dev_mixed(int target) { LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); diff --git a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c new file mode 100644 index 000000000000..871d16cb95cf --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <sched.h> +#include <test_progs.h> +#include <pthread.h> +#include <network_helpers.h> + +#include "timer_lockup.skel.h" + +static long cpu; +static int *timer1_err; +static int *timer2_err; +static bool skip; + +volatile int k = 0; + +static void *timer_lockup_thread(void *arg) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1000, + ); + int i, prog_fd = *(int *)arg; + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(__sync_fetch_and_add(&cpu, 1), &cpuset); + ASSERT_OK(pthread_setaffinity_np(pthread_self(), sizeof(cpuset), + &cpuset), + "cpu affinity"); + + for (i = 0; !READ_ONCE(*timer1_err) && !READ_ONCE(*timer2_err); i++) { + bpf_prog_test_run_opts(prog_fd, &opts); + /* Skip the test if we can't reproduce the race in a reasonable + * amount of time. 
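		 * Each iteration already runs the program 1000 times
		 * (opts.repeat above), so 50 iterations bounds the attempts at
		 * roughly 50,000 runs per thread.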
+ */ + if (i > 50) { + WRITE_ONCE(skip, true); + break; + } + } + + return NULL; +} + +void test_timer_lockup(void) +{ + int timer1_prog, timer2_prog; + struct timer_lockup *skel; + pthread_t thrds[2]; + void *ret; + + skel = timer_lockup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load")) + return; + + timer1_prog = bpf_program__fd(skel->progs.timer1_prog); + timer2_prog = bpf_program__fd(skel->progs.timer2_prog); + + timer1_err = &skel->bss->timer1_err; + timer2_err = &skel->bss->timer2_err; + + if (!ASSERT_OK(pthread_create(&thrds[0], NULL, timer_lockup_thread, + &timer1_prog), + "pthread_create thread1")) + goto out; + if (!ASSERT_OK(pthread_create(&thrds[1], NULL, timer_lockup_thread, + &timer2_prog), + "pthread_create thread2")) { + pthread_exit(&thrds[0]); + goto out; + } + + pthread_join(thrds[1], &ret); + pthread_join(thrds[0], &ret); + + if (skip) { + test__skip(); + goto out; + } + + if (*timer1_err != -EDEADLK && *timer1_err != 0) + ASSERT_FAIL("timer1_err bad value"); + if (*timer2_err != -EDEADLK && *timer2_err != 0) + ASSERT_FAIL("timer2_err bad value"); +out: + timer_lockup__destroy(skel); + return; +} diff --git a/tools/testing/selftests/bpf/progs/timer_lockup.c b/tools/testing/selftests/bpf/progs/timer_lockup.c new file mode 100644 index 000000000000..3e520133281e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_lockup.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <time.h> +#include <errno.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_timer t; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} timer1_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} timer2_map SEC(".maps"); + +int timer1_err; +int timer2_err; + +static int timer_cb1(void *map, int *k, struct elem *v) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer2_map, &key); + if (timer) + timer2_err = bpf_timer_cancel(timer); + + return 0; +} + +static int timer_cb2(void *map, int *k, struct elem *v) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer1_map, &key); + if (timer) + timer1_err = bpf_timer_cancel(timer); + + return 0; +} + +SEC("tc") +int timer1_prog(void *ctx) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer1_map, &key); + if (timer) { + bpf_timer_init(timer, &timer1_map, CLOCK_BOOTTIME); + bpf_timer_set_callback(timer, timer_cb1); + bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN); + } + + return 0; +} + +SEC("tc") +int timer2_prog(void *ctx) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer2_map, &key); + if (timer) { + bpf_timer_init(timer, &timer2_map, CLOCK_BOOTTIME); + bpf_timer_set_callback(timer, timer_cb2); + bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN); + } + + return 0; +} diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index 2732e0b29271..952e4448bf07 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -1,11 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only -test_memcontrol test_core -test_freezer -test_kmem -test_kill test_cpu test_cpuset -test_zswap +test_freezer test_hugetlb_memcg +test_kill 
+test_kmem +test_memcontrol +test_pids +test_zswap wait_inotify diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 16461dc0ffdf..1b897152bab6 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -6,26 +6,29 @@ all: ${HELPER_PROGS} TEST_FILES := with_stress.sh TEST_PROGS := test_stress.sh test_cpuset_prs.sh test_cpuset_v1_hp.sh TEST_GEN_FILES := wait_inotify -TEST_GEN_PROGS = test_memcontrol -TEST_GEN_PROGS += test_kmem -TEST_GEN_PROGS += test_core -TEST_GEN_PROGS += test_freezer -TEST_GEN_PROGS += test_kill +# Keep the lists lexicographically sorted +TEST_GEN_PROGS = test_core TEST_GEN_PROGS += test_cpu TEST_GEN_PROGS += test_cpuset -TEST_GEN_PROGS += test_zswap +TEST_GEN_PROGS += test_freezer TEST_GEN_PROGS += test_hugetlb_memcg +TEST_GEN_PROGS += test_kill +TEST_GEN_PROGS += test_kmem +TEST_GEN_PROGS += test_memcontrol +TEST_GEN_PROGS += test_pids +TEST_GEN_PROGS += test_zswap LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h include ../lib.mk -$(OUTPUT)/test_memcontrol: cgroup_util.c -$(OUTPUT)/test_kmem: cgroup_util.c $(OUTPUT)/test_core: cgroup_util.c -$(OUTPUT)/test_freezer: cgroup_util.c -$(OUTPUT)/test_kill: cgroup_util.c $(OUTPUT)/test_cpu: cgroup_util.c $(OUTPUT)/test_cpuset: cgroup_util.c -$(OUTPUT)/test_zswap: cgroup_util.c +$(OUTPUT)/test_freezer: cgroup_util.c $(OUTPUT)/test_hugetlb_memcg: cgroup_util.c +$(OUTPUT)/test_kill: cgroup_util.c +$(OUTPUT)/test_kmem: cgroup_util.c +$(OUTPUT)/test_memcontrol: cgroup_util.c +$(OUTPUT)/test_pids: cgroup_util.c +$(OUTPUT)/test_zswap: cgroup_util.c diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index b5eb1be2248c..7c08cc153367 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -28,6 +28,14 @@ CPULIST=$(cat $CGROUP2/cpuset.cpus.effective) NR_CPUS=$(lscpu | grep "^CPU(s):" | sed -e "s/.*:[[:space:]]*//") [[ $NR_CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!" +# Check to see if /dev/console exists and is writable +if [[ -c /dev/console && -w /dev/console ]] +then + CONSOLE=/dev/console +else + CONSOLE=/dev/null +fi + # Set verbose flag and delay factor PROG=$1 VERBOSE=0 @@ -103,8 +111,8 @@ console_msg() { MSG=$1 echo "$MSG" - echo "" > /dev/console - echo "$MSG" > /dev/console + echo "" > $CONSOLE + echo "$MSG" > $CONSOLE pause 0.01 } @@ -161,6 +169,14 @@ test_add_proc() # T = put a task into cgroup # O<c>=<v> = Write <v> to CPU online file of <c> # +# ECPUs - effective CPUs of cpusets +# Pstate - partition root state +# ISOLCPUS - isolated CPUs (<icpus>[,<icpus2>]) +# +# Note that if there are 2 fields in ISOLCPUS, the first one is for +# sched-debug matching which includes offline CPUs and single-CPU partitions +# while the second one is for matching cpuset.cpus.isolated. +# SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1" TEST_MATRIX=( # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS @@ -220,23 +236,29 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3:P2 . . 0 A1:0-1,A2:2-3,A3:2-3 A1:P0,A2:P2 2-3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X3:P2 . . 0 A1:0-2,A2:3,A3:3 A1:P0,A2:P2 3" " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3" - " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-2,A2:1-2,A3:3 A1:P0,A3:P2 3" + " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 
0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3" " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-3,A2:1-3,A3:2-3,B1:2-3 A1:P0,A3:P0,B1:P-2" " C0-3:S+ C1-3:S+ C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3,B1:4 A3:P2,B1:P2 2-4" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3,B1:4 A3:P2,B1:P2 2-4" " C0-3:S+ C1-3:S+ C2-3 C4 X1-3 X1-3:P2 P2 . 0 A2:1,A3:2-3 A2:P2,A3:P2 1-3" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2:C4-5 0 A3:2-3,B1:4-5 A3:P2,B1:P2 2-5" + " C4:X0-3:S+ X1-3:S+ X2-3 . . P2 . . 0 A1:4,A2:1-3,A3:1-3 A2:P2 1-3" + " C4:X0-3:S+ X1-3:S+ X2-3 . . . P2 . 0 A1:4,A2:4,A3:2-3 A3:P2 2-3" # Nested remote/local partition tests " C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4-5 \ A1:P0,A2:P1,A3:P2,B1:P1 2-3" " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:,A3:2-3,B1:4 \ A1:P0,A2:P1,A3:P2,B1:P1 2-4,2-3" + " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 . P1 0 A1:0-1,A2:2-3,A3:2-3,B1:4 \ + A1:P0,A2:P1,A3:P0,B1:P1" " C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1,A2:2,A3:3,B1:4 \ A1:P0,A2:P1,A3:P2,B1:P1 2-4,3" " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1,A2:2-3,A3:4 \ A1:P0,A2:P2,A3:P1 2-4,2-3" + " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X3-4:P1 . 0 A1:0-1,A2:2,A3:3-4 \ + A1:P0,A2:P2,A3:P1 2" " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \ . . X5 . . 0 A1:0-4,A2:1-4,A3:2-4 \ A1:P0,A2:P-2,A3:P-1" @@ -262,8 +284,8 @@ TEST_MATRIX=( . . X2-3 P2 . . 0 A1:0-2,A2:3,XA2:3 A2:P2 3" # Invalid to valid local partition direct transition tests - " C1-3:S+:P2 C2-3:X1:P2 . . . . . . 0 A1:1-3,XA1:1-3,A2:2-3:XA2: A1:P2,A2:P-2 1-3" - " C1-3:S+:P2 C2-3:X1:P2 . . . X3:P2 . . 0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3" + " C1-3:S+:P2 X4:P2 . . . . . . 0 A1:1-3,XA1:1-3,A2:1-3:XA2: A1:P2,A2:P-2 1-3" + " C1-3:S+:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3" " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4,B1:4-6 A1:P-2,B1:P0" " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3,B1:4-6 A1:P2,B1:P0 0-3" " C0-3:P2 . . C3-5:C4-5 . . . . 0 A1:0-3,B1:4-5 A1:P2,B1:P0 0-3" @@ -274,32 +296,26 @@ TEST_MATRIX=( " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ . . X4 . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3" " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ - . . C4 . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3" + . . C4:X . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3" # Local partition CPU change tests " C0-5:S+:P2 C4-5:S+:P1 . . . C3-5 . . 0 A1:0-2,A2:3-5 A1:P2,A2:P1 0-2" " C0-5:S+:P2 C4-5:S+:P1 . . C1-5 . . . 0 A1:1-3,A2:4-5 A1:P2,A2:P1 1-3" # cpus_allowed/exclusive_cpus update tests " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ - . C4 . P2 . 0 A1:4,A2:4,XA2:,XA3:,A3:4 \ + . X:C4 . P2 . 0 A1:4,A2:4,XA2:,XA3:,A3:4 \ A1:P0,A3:P-2" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ . X1 . P2 . 0 A1:0-3,A2:1-3,XA1:1,XA2:,XA3:,A3:2-3 \ A1:P0,A3:P-2" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ - . . C3 P2 . 0 A1:0-2,A2:0-2,XA2:3,XA3:3,A3:3 \ - A1:P0,A3:P2 3" - " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ . . X3 P2 . 0 A1:0-2,A2:1-2,XA2:3,XA3:3,A3:3 \ A1:P0,A3:P2 3" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ . . X3 . . 0 A1:0-3,A2:1-3,XA2:3,XA3:3,A3:2-3 \ A1:P0,A3:P-2" " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ - . . C3 . . 0 A1:0-3,A2:3,XA2:3,XA3:3,A3:3 \ - A1:P0,A3:P-2" - " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \ - . C4 . . . 0 A1:4,A2:4,A3:4,XA1:,XA2:,XA3 \ + . X4 . . . 
0 A1:0-3,A2:1-3,A3:2-3,XA1:4,XA2:,XA3 \ A1:P0,A3:P-2" # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS @@ -346,6 +362,9 @@ TEST_MATRIX=( " C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P-1,B1:P-1" " C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2,B1:2-3 A1:P0,B1:P-1" + # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it + " C0-3 . . C4-5 X5 . . . 0 A1:0-3,B1:4-5" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: @@ -355,6 +374,9 @@ TEST_MATRIX=( # Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected " C0-3 . . C4-5 X0-3 . . X3-5 1 A1:0-3,B1:4-5" + + # cpuset.cpus cannot be a subset of sibling cpuset.cpus.exclusive + " C0-3 . . C4-5 X3-5 . . . 1 A1:0-3,B1:4-5" ) # @@ -556,14 +578,15 @@ check_cgroup_states() do set -- $(echo $CHK | sed -e "s/:/ /g") CGRP=$1 + CGRP_DIR=$CGRP STATE=$2 FILE= EVAL=$(expr substr $STATE 2 2) - [[ $CGRP = A2 ]] && CGRP=A1/A2 - [[ $CGRP = A3 ]] && CGRP=A1/A2/A3 + [[ $CGRP = A2 ]] && CGRP_DIR=A1/A2 + [[ $CGRP = A3 ]] && CGRP_DIR=A1/A2/A3 case $STATE in - P*) FILE=$CGRP/cpuset.cpus.partition + P*) FILE=$CGRP_DIR/cpuset.cpus.partition ;; *) echo "Unknown state: $STATE!" exit 1 @@ -587,6 +610,16 @@ check_cgroup_states() ;; esac [[ $EVAL != $VAL ]] && return 1 + + # + # For root partition, dump sched-domains info to console if + # verbose mode set for manual comparison with sched debug info. + # + [[ $VAL -eq 1 && $VERBOSE -gt 0 ]] && { + DOMS=$(cat $CGRP_DIR/cpuset.cpus.effective) + [[ -n "$DOMS" ]] && + echo " [$CGRP] sched-domain: $DOMS" > $CONSOLE + } done return 0 } @@ -694,9 +727,9 @@ null_isolcpus_check() [[ $VERBOSE -gt 0 ]] || return 0 # Retry a few times before printing error RETRY=0 - while [[ $RETRY -lt 5 ]] + while [[ $RETRY -lt 8 ]] do - pause 0.01 + pause 0.02 check_isolcpus "." [[ $? -eq 0 ]] && return 0 ((RETRY++)) @@ -726,7 +759,7 @@ run_state_test() while [[ $I -lt $CNT ]] do - echo "Running test $I ..." > /dev/console + echo "Running test $I ..." > $CONSOLE [[ $VERBOSE -gt 1 ]] && { echo "" eval echo \${$TEST[$I]} @@ -783,7 +816,7 @@ run_state_test() while [[ $NEWLIST != $CPULIST && $RETRY -lt 8 ]] do # Wait a bit longer & recheck a few times - pause 0.01 + pause 0.02 ((RETRY++)) NEWLIST=$(cat cpuset.cpus.effective) done diff --git a/tools/testing/selftests/cgroup/test_pids.c b/tools/testing/selftests/cgroup/test_pids.c new file mode 100644 index 000000000000..9ecb83c6cc5c --- /dev/null +++ b/tools/testing/selftests/cgroup/test_pids.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <errno.h> +#include <linux/limits.h> +#include <signal.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include "../kselftest.h" +#include "cgroup_util.h" + +static int run_success(const char *cgroup, void *arg) +{ + return 0; +} + +static int run_pause(const char *cgroup, void *arg) +{ + return pause(); +} + +/* + * This test checks that pids.max prevents forking new children above the + * specified limit in the cgroup. 
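 * With pids.max set to 2, the test process that entered the cgroup plus the
 * paused child already fill the quota, so a further fork attempt must fail
 * with EAGAIN.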
+ */ +static int test_pids_max(const char *root) +{ + int ret = KSFT_FAIL; + char *cg_pids; + int pid; + + cg_pids = cg_name(root, "pids_test"); + if (!cg_pids) + goto cleanup; + + if (cg_create(cg_pids)) + goto cleanup; + + if (cg_read_strcmp(cg_pids, "pids.max", "max\n")) + goto cleanup; + + if (cg_write(cg_pids, "pids.max", "2")) + goto cleanup; + + if (cg_enter_current(cg_pids)) + goto cleanup; + + pid = cg_run_nowait(cg_pids, run_pause, NULL); + if (pid < 0) + goto cleanup; + + if (cg_run_nowait(cg_pids, run_success, NULL) != -1 || errno != EAGAIN) + goto cleanup; + + if (kill(pid, SIGINT)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + cg_destroy(cg_pids); + free(cg_pids); + + return ret; +} + +/* + * This test checks that pids.events are counted in cgroup associated with pids.max + */ +static int test_pids_events(const char *root) +{ + int ret = KSFT_FAIL; + char *cg_parent = NULL, *cg_child = NULL; + int pid; + + cg_parent = cg_name(root, "pids_parent"); + cg_child = cg_name(cg_parent, "pids_child"); + if (!cg_parent || !cg_child) + goto cleanup; + + if (cg_create(cg_parent)) + goto cleanup; + if (cg_write(cg_parent, "cgroup.subtree_control", "+pids")) + goto cleanup; + if (cg_create(cg_child)) + goto cleanup; + + if (cg_write(cg_parent, "pids.max", "2")) + goto cleanup; + + if (cg_read_strcmp(cg_child, "pids.max", "max\n")) + goto cleanup; + + if (cg_enter_current(cg_child)) + goto cleanup; + + pid = cg_run_nowait(cg_child, run_pause, NULL); + if (pid < 0) + goto cleanup; + + if (cg_run_nowait(cg_child, run_success, NULL) != -1 || errno != EAGAIN) + goto cleanup; + + if (kill(pid, SIGINT)) + goto cleanup; + + if (cg_read_key_long(cg_child, "pids.events", "max ") != 0) + goto cleanup; + if (cg_read_key_long(cg_parent, "pids.events", "max ") != 1) + goto cleanup; + + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + if (cg_child) + cg_destroy(cg_child); + if (cg_parent) + cg_destroy(cg_parent); + free(cg_child); + free(cg_parent); + + return ret; +} + + + +#define T(x) { x, #x } +struct pids_test { + int (*fn)(const char *root); + const char *name; +} tests[] = { + T(test_pids_max), + T(test_pids_events), +}; +#undef T + +int main(int argc, char **argv) +{ + char root[PATH_MAX]; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); + if (cg_find_unified_root(root, sizeof(root), NULL)) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + + /* + * Check that pids controller is available: + * pids is listed in cgroup.controllers + */ + if (cg_read_strstr(root, "cgroup.controllers", "pids")) + ksft_exit_skip("pids controller isn't available\n"); + + if (cg_read_strstr(root, "cgroup.subtree_control", "pids")) + if (cg_write(root, "cgroup.subtree_control", "+pids")) + ksft_exit_skip("Failed to set pids controller\n"); + + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + switch (tests[i].fn(root)) { + case KSFT_PASS: + ksft_test_result_pass("%s\n", tests[i].name); + break; + case KSFT_SKIP: + ksft_test_result_skip("%s\n", tests[i].name); + break; + default: + ksft_test_result_fail("%s\n", tests[i].name); + break; + } + } + + ksft_finished(); +} diff --git a/tools/testing/selftests/filesystems/statmount/Makefile b/tools/testing/selftests/filesystems/statmount/Makefile index 07a0d5b545ca..3af3136e35a4 100644 --- a/tools/testing/selftests/filesystems/statmount/Makefile +++ b/tools/testing/selftests/filesystems/statmount/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) -TEST_GEN_PROGS := 
statmount_test +TEST_GEN_PROGS := statmount_test statmount_test_ns include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h new file mode 100644 index 000000000000..f4294bab9d73 --- /dev/null +++ b/tools/testing/selftests/filesystems/statmount/statmount.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __STATMOUNT_H +#define __STATMOUNT_H + +#include <stdint.h> +#include <linux/mount.h> +#include <asm/unistd.h> + +static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask, + struct statmount *buf, size_t bufsize, + unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER0, + .mnt_id = mnt_id, + .param = mask, + }; + + if (mnt_ns_id) { + req.size = MNT_ID_REQ_SIZE_VER1; + req.mnt_ns_id = mnt_ns_id; + } + + return syscall(__NR_statmount, &req, buf, bufsize, flags); +} + +static ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id, + uint64_t last_mnt_id, uint64_t list[], size_t num, + unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER0, + .mnt_id = mnt_id, + .param = last_mnt_id, + }; + + if (mnt_ns_id) { + req.size = MNT_ID_REQ_SIZE_VER1; + req.mnt_ns_id = mnt_ns_id; + } + + return syscall(__NR_listmount, &req, list, num, flags); +} + +#endif /* __STATMOUNT_H */ diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index e8c019d72cbf..c773334bbcc9 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -4,17 +4,15 @@ #include <assert.h> #include <stddef.h> -#include <stdint.h> #include <sched.h> #include <fcntl.h> #include <sys/param.h> #include <sys/mount.h> #include <sys/stat.h> #include <sys/statfs.h> -#include <linux/mount.h> #include <linux/stat.h> -#include <asm/unistd.h> +#include "statmount.h" #include "../../kselftest.h" static const char *const known_fs[] = { @@ -36,18 +34,6 @@ static const char *const known_fs[] = { "ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL }; -static int statmount(uint64_t mnt_id, uint64_t mask, struct statmount *buf, - size_t bufsize, unsigned int flags) -{ - struct mnt_id_req req = { - .size = MNT_ID_REQ_SIZE_VER0, - .mnt_id = mnt_id, - .param = mask, - }; - - return syscall(__NR_statmount, &req, buf, bufsize, flags); -} - static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags) { size_t bufsize = 1 << 15; @@ -56,7 +42,7 @@ static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigne int ret; for (;;) { - ret = statmount(mnt_id, mask, tmp, bufsize, flags); + ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags); if (ret != -1) break; if (tofree) @@ -121,7 +107,7 @@ static char root_mntpoint[] = "/tmp/statmount_test_root.XXXXXX"; static int orig_root; static uint64_t root_id, parent_id; static uint32_t old_root_id, old_parent_id; - +static FILE *f_mountinfo; static void cleanup_namespace(void) { @@ -146,7 +132,7 @@ static void setup_namespace(void) uid_t uid = getuid(); gid_t gid = getgid(); - ret = unshare(CLONE_NEWNS|CLONE_NEWUSER); + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); if (ret == -1) ksft_exit_fail_msg("unsharing mountns and userns: %s\n", strerror(errno)); @@ -157,6 +143,11 @@ static void setup_namespace(void) sprintf(buf, "0 %d 1", gid); write_file("/proc/self/gid_map", buf); + 
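	/*
	 * Opened here so that test_statmount_mnt_opts() can later compare
	 * statmount()'s mnt_opts string against the options recorded for the
	 * same mount in /proc/self/mountinfo.
	 */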
f_mountinfo = fopen("/proc/self/mountinfo", "re"); + if (!f_mountinfo) + ksft_exit_fail_msg("failed to open mountinfo: %s\n", + strerror(errno)); + ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL); if (ret == -1) ksft_exit_fail_msg("making mount tree private: %s\n", @@ -216,25 +207,13 @@ static int setup_mount_tree(int log2_num) return 0; } -static ssize_t listmount(uint64_t mnt_id, uint64_t last_mnt_id, - uint64_t list[], size_t num, unsigned int flags) -{ - struct mnt_id_req req = { - .size = MNT_ID_REQ_SIZE_VER0, - .mnt_id = mnt_id, - .param = last_mnt_id, - }; - - return syscall(__NR_listmount, &req, list, num, flags); -} - static void test_listmount_empty_root(void) { ssize_t res; const unsigned int size = 32; uint64_t list[size]; - res = listmount(LSMT_ROOT, 0, list, size, 0); + res = listmount(LSMT_ROOT, 0, 0, list, size, 0); if (res == -1) { ksft_test_result_fail("listmount: %s\n", strerror(errno)); return; @@ -259,7 +238,7 @@ static void test_statmount_zero_mask(void) struct statmount sm; int ret; - ret = statmount(root_id, 0, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount zero mask: %s\n", strerror(errno)); @@ -285,7 +264,7 @@ static void test_statmount_mnt_basic(void) int ret; uint64_t mask = STATMOUNT_MNT_BASIC; - ret = statmount(root_id, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount mnt basic: %s\n", strerror(errno)); @@ -345,7 +324,7 @@ static void test_statmount_sb_basic(void) struct statx sx; struct statfs sf; - ret = statmount(root_id, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount sb basic: %s\n", strerror(errno)); @@ -470,6 +449,88 @@ static void test_statmount_fs_type(void) free(sm); } +static void test_statmount_mnt_opts(void) +{ + struct statmount *sm; + const char *statmount_opts; + char *line = NULL; + size_t len = 0; + + sm = statmount_alloc(root_id, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS, + 0); + if (!sm) { + ksft_test_result_fail("statmount mnt opts: %s\n", + strerror(errno)); + return; + } + + while (getline(&line, &len, f_mountinfo) != -1) { + int i; + char *p, *p2; + unsigned int old_mnt_id; + + old_mnt_id = atoi(line); + if (old_mnt_id != sm->mnt_id_old) + continue; + + for (p = line, i = 0; p && i < 5; i++) + p = strchr(p + 1, ' '); + if (!p) + continue; + + p2 = strchr(p + 1, ' '); + if (!p2) + continue; + *p2 = '\0'; + p = strchr(p2 + 1, '-'); + if (!p) + continue; + for (p++, i = 0; p && i < 2; i++) + p = strchr(p + 1, ' '); + if (!p) + continue; + p++; + + /* skip generic superblock options */ + if (strncmp(p, "ro", 2) == 0) + p += 2; + else if (strncmp(p, "rw", 2) == 0) + p += 2; + if (*p == ',') + p++; + if (strncmp(p, "sync", 4) == 0) + p += 4; + if (*p == ',') + p++; + if (strncmp(p, "dirsync", 7) == 0) + p += 7; + if (*p == ',') + p++; + if (strncmp(p, "lazytime", 8) == 0) + p += 8; + if (*p == ',') + p++; + p2 = strrchr(p, '\n'); + if (p2) + *p2 = '\0'; + + statmount_opts = sm->str + sm->mnt_opts; + if (strcmp(statmount_opts, p) != 0) + ksft_test_result_fail( + "unexpected mount options: '%s' != '%s'\n", + statmount_opts, p); + else + ksft_test_result_pass("statmount mount options\n"); + free(sm); + free(line); + return; + } + + ksft_test_result_fail("didnt't find mount entry\n"); + free(sm); + free(line); +} + static void test_statmount_string(uint64_t mask, size_t off, const char *name) { 
struct statmount *sm; @@ -506,14 +567,14 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name) exactsize = sm->size; shortsize = sizeof(*sm) + i; - ret = statmount(root_id, mask, sm, exactsize, 0); + ret = statmount(root_id, 0, mask, sm, exactsize, 0); if (ret == -1) { ksft_test_result_fail("statmount exact size: %s\n", strerror(errno)); goto out; } errno = 0; - ret = statmount(root_id, mask, sm, shortsize, 0); + ret = statmount(root_id, 0, mask, sm, shortsize, 0); if (ret != -1 || errno != EOVERFLOW) { ksft_test_result_fail("should have failed with EOVERFLOW: %s\n", strerror(errno)); @@ -541,7 +602,7 @@ static void test_listmount_tree(void) if (res == -1) return; - num = res = listmount(LSMT_ROOT, 0, list, size, 0); + num = res = listmount(LSMT_ROOT, 0, 0, list, size, 0); if (res == -1) { ksft_test_result_fail("listmount: %s\n", strerror(errno)); return; @@ -553,7 +614,7 @@ static void test_listmount_tree(void) } for (i = 0; i < size - step;) { - res = listmount(LSMT_ROOT, i ? list2[i - 1] : 0, list2 + i, step, 0); + res = listmount(LSMT_ROOT, 0, i ? list2[i - 1] : 0, list2 + i, step, 0); if (res == -1) ksft_test_result_fail("short listmount: %s\n", strerror(errno)); @@ -585,18 +646,18 @@ int main(void) int ret; uint64_t all_mask = STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC | STATMOUNT_PROPAGATE_FROM | STATMOUNT_MNT_ROOT | - STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE; + STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE | STATMOUNT_MNT_NS_ID; ksft_print_header(); - ret = statmount(0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, NULL, 0, 0); assert(ret == -1); if (errno == ENOSYS) ksft_exit_skip("statmount() syscall not supported\n"); setup_namespace(); - ksft_set_plan(14); + ksft_set_plan(15); test_listmount_empty_root(); test_statmount_zero_mask(); test_statmount_mnt_basic(); @@ -604,6 +665,7 @@ int main(void) test_statmount_mnt_root(); test_statmount_mnt_point(); test_statmount_fs_type(); + test_statmount_mnt_opts(); test_statmount_string(STATMOUNT_MNT_ROOT, str_off(mnt_root), "mount root"); test_statmount_string(STATMOUNT_MNT_POINT, str_off(mnt_point), "mount point"); test_statmount_string(STATMOUNT_FS_TYPE, str_off(fs_type), "fs type"); diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c new file mode 100644 index 000000000000..e044f5fc57fd --- /dev/null +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE + +#include <assert.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdlib.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <linux/nsfs.h> +#include <linux/stat.h> + +#include "statmount.h" +#include "../../kselftest.h" + +#define NSID_PASS 0 +#define NSID_FAIL 1 +#define NSID_SKIP 2 +#define NSID_ERROR 3 + +static void handle_result(int ret, const char *testname) +{ + if (ret == NSID_PASS) + ksft_test_result_pass("%s\n", testname); + else if (ret == NSID_FAIL) + ksft_test_result_fail("%s\n", testname); + else if (ret == NSID_ERROR) + ksft_exit_fail_msg("%s\n", testname); + else + ksft_test_result_skip("%s\n", testname); +} + +static inline int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + ksft_print_msg("waitpid returned -1, errno=%d\n", errno); + return -1; + } + + if (!WIFEXITED(status)) { + ksft_print_msg( + 
"waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n", + WIFSIGNALED(status), WTERMSIG(status)); + return -1; + } + + ret = WEXITSTATUS(status); + return ret; +} + +static int get_mnt_ns_id(const char *mnt_ns, uint64_t *mnt_ns_id) +{ + int fd = open(mnt_ns, O_RDONLY); + + if (fd < 0) { + ksft_print_msg("failed to open for ns %s: %s\n", + mnt_ns, strerror(errno)); + sleep(60); + return NSID_ERROR; + } + + if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) { + ksft_print_msg("failed to get the nsid for ns %s: %s\n", + mnt_ns, strerror(errno)); + return NSID_ERROR; + } + close(fd); + return NSID_PASS; +} + +static int get_mnt_id(const char *path, uint64_t *mnt_id) +{ + struct statx sx; + int ret; + + ret = statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx); + if (ret == -1) { + ksft_print_msg("retrieving unique mount ID for %s: %s\n", path, + strerror(errno)); + return NSID_ERROR; + } + + if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE)) { + ksft_print_msg("no unique mount ID available for %s\n", path); + return NSID_ERROR; + } + + *mnt_id = sx.stx_mnt_id; + return NSID_PASS; +} + +static int write_file(const char *path, const char *val) +{ + int fd = open(path, O_WRONLY); + size_t len = strlen(val); + int ret; + + if (fd == -1) { + ksft_print_msg("opening %s for write: %s\n", path, strerror(errno)); + return NSID_ERROR; + } + + ret = write(fd, val, len); + if (ret == -1) { + ksft_print_msg("writing to %s: %s\n", path, strerror(errno)); + return NSID_ERROR; + } + if (ret != len) { + ksft_print_msg("short write to %s\n", path); + return NSID_ERROR; + } + + ret = close(fd); + if (ret == -1) { + ksft_print_msg("closing %s\n", path); + return NSID_ERROR; + } + + return NSID_PASS; +} + +static int setup_namespace(void) +{ + int ret; + char buf[32]; + uid_t uid = getuid(); + gid_t gid = getgid(); + + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); + if (ret == -1) + ksft_exit_fail_msg("unsharing mountns and userns: %s\n", + strerror(errno)); + + sprintf(buf, "0 %d 1", uid); + ret = write_file("/proc/self/uid_map", buf); + if (ret != NSID_PASS) + return ret; + ret = write_file("/proc/self/setgroups", "deny"); + if (ret != NSID_PASS) + return ret; + sprintf(buf, "0 %d 1", gid); + ret = write_file("/proc/self/gid_map", buf); + if (ret != NSID_PASS) + return ret; + + ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL); + if (ret == -1) { + ksft_print_msg("making mount tree private: %s\n", + strerror(errno)); + return NSID_ERROR; + } + + return NSID_PASS; +} + +static int _test_statmount_mnt_ns_id(void) +{ + struct statmount sm; + uint64_t mnt_ns_id; + uint64_t root_id; + int ret; + + ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id); + if (ret != NSID_PASS) + return ret; + + ret = get_mnt_id("/", &root_id); + if (ret != NSID_PASS) + return ret; + + ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); + if (ret == -1) { + ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); + return NSID_ERROR; + } + + if (sm.size != sizeof(sm)) { + ksft_print_msg("unexpected size: %u != %u\n", sm.size, + (uint32_t)sizeof(sm)); + return NSID_FAIL; + } + if (sm.mask != STATMOUNT_MNT_NS_ID) { + ksft_print_msg("statmount mnt ns id unavailable\n"); + return NSID_SKIP; + } + + if (sm.mnt_ns_id != mnt_ns_id) { + ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n", + (unsigned long long)sm.mnt_ns_id, + (unsigned long long)mnt_ns_id); + return NSID_FAIL; + } + + return NSID_PASS; +} + +static void test_statmount_mnt_ns_id(void) +{ + pid_t pid; + int ret; + + pid = fork(); + if (pid < 0) + 
ksft_exit_fail_msg("failed to fork: %s\n", strerror(errno)); + + /* We're the original pid, wait for the result. */ + if (pid != 0) { + ret = wait_for_pid(pid); + handle_result(ret, "test statmount ns id"); + return; + } + + ret = setup_namespace(); + if (ret != NSID_PASS) + exit(ret); + ret = _test_statmount_mnt_ns_id(); + exit(ret); +} + +static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts) +{ + uint64_t list[256]; + uint64_t mnt_ns_id; + uint64_t nr_mounts; + char buf[256]; + int ret; + + /* Get the mount ns id for our child. */ + snprintf(buf, sizeof(buf), "/proc/%lu/ns/mnt", (unsigned long)pid); + ret = get_mnt_ns_id(buf, &mnt_ns_id); + + nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0); + if (nr_mounts == (uint64_t)-1) { + ksft_print_msg("listmount: %s\n", strerror(errno)); + return NSID_ERROR; + } + + if (nr_mounts != child_nr_mounts) { + ksft_print_msg("listmount results is %zi != %zi\n", nr_mounts, + child_nr_mounts); + return NSID_FAIL; + } + + /* Validate that all of our entries match our mnt_ns_id. */ + for (int i = 0; i < nr_mounts; i++) { + struct statmount sm; + + ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm, + sizeof(sm), 0); + if (ret < 0) { + ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); + return NSID_ERROR; + } + + if (sm.mask != STATMOUNT_MNT_NS_ID) { + ksft_print_msg("statmount mnt ns id unavailable\n"); + return NSID_SKIP; + } + + if (sm.mnt_ns_id != mnt_ns_id) { + ksft_print_msg("listmount gave us the wrong ns id: 0x%llx != 0x%llx\n", + (unsigned long long)sm.mnt_ns_id, + (unsigned long long)mnt_ns_id); + return NSID_FAIL; + } + } + + return NSID_PASS; +} + +static void test_listmount_ns(void) +{ + uint64_t nr_mounts; + char pval; + int child_ready_pipe[2]; + int parent_ready_pipe[2]; + pid_t pid; + int ret, child_ret; + + if (pipe(child_ready_pipe) < 0) + ksft_exit_fail_msg("failed to create the child pipe: %s\n", + strerror(errno)); + if (pipe(parent_ready_pipe) < 0) + ksft_exit_fail_msg("failed to create the parent pipe: %s\n", + strerror(errno)); + + pid = fork(); + if (pid < 0) + ksft_exit_fail_msg("failed to fork: %s\n", strerror(errno)); + + if (pid == 0) { + char cval; + uint64_t list[256]; + + close(child_ready_pipe[0]); + close(parent_ready_pipe[1]); + + ret = setup_namespace(); + if (ret != NSID_PASS) + exit(ret); + + nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 256, 0); + if (nr_mounts == (uint64_t)-1) { + ksft_print_msg("listmount: %s\n", strerror(errno)); + exit(NSID_FAIL); + } + + /* + * Tell our parent how many mounts we have, and then wait for it + * to tell us we're done. + */ + write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts)); + read(parent_ready_pipe[0], &cval, sizeof(cval)); + exit(NSID_PASS); + } + + close(child_ready_pipe[1]); + close(parent_ready_pipe[0]); + + /* Wait until the child has created everything. 
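	 * The child reports the number of mounts in its new namespace through
	 * child_ready_pipe; the parent validates that namespace externally via
	 * listmount()/statmount() and then releases the child by writing to
	 * parent_ready_pipe.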
*/ + if (read(child_ready_pipe[0], &nr_mounts, sizeof(nr_mounts)) != + sizeof(nr_mounts)) + ret = NSID_ERROR; + + ret = validate_external_listmount(pid, nr_mounts); + + if (write(parent_ready_pipe[1], &pval, sizeof(pval)) != sizeof(pval)) + ret = NSID_ERROR; + + child_ret = wait_for_pid(pid); + if (child_ret != NSID_PASS) + ret = child_ret; + handle_result(ret, "test listmount ns id"); +} + +int main(void) +{ + int ret; + + ksft_print_header(); + ret = statmount(0, 0, 0, NULL, 0, 0); + assert(ret == -1); + if (errno == ENOSYS) + ksft_exit_skip("statmount() syscall not supported\n"); + + ksft_set_plan(2); + test_statmount_mnt_ns_id(); + test_listmount_ns(); + + if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) + ksft_exit_fail(); + else + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 76c2a6945d3e..b8967b6e29d5 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -168,15 +168,7 @@ static inline __printf(1, 2) void ksft_print_msg(const char *msg, ...) static inline void ksft_perror(const char *msg) { -#ifndef NOLIBC ksft_print_msg("%s: %s (%d)\n", msg, strerror(errno), errno); -#else - /* - * nolibc doesn't provide strerror() and it seems - * inappropriate to add one, just print the errno. - */ - ksft_print_msg("%s: %d)\n", msg, errno); -#endif } static inline __printf(1, 2) void ksft_test_result_pass(const char *msg, ...) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index b634969cbb6f..40723a6a083f 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -66,8 +66,6 @@ #include <sys/wait.h> #include <unistd.h> #include <setjmp.h> -#include <syscall.h> -#include <linux/sched.h> #include "kselftest.h" @@ -82,17 +80,6 @@ # define TH_LOG_ENABLED 1 #endif -/* Wait for the child process to end but without sharing memory mapping. */ -static inline pid_t clone3_vfork(void) -{ - struct clone_args args = { - .flags = CLONE_VFORK, - .exit_signal = SIGCHLD, - }; - - return syscall(__NR_clone3, &args, sizeof(args)); -} - /** * TH_LOG() * @@ -437,7 +424,7 @@ static inline pid_t clone3_vfork(void) } \ if (setjmp(_metadata->env) == 0) { \ /* _metadata and potentially self are shared with all forks. */ \ - child = clone3_vfork(); \ + child = fork(); \ if (child == 0) { \ fixture_name##_setup(_metadata, self, variant->data); \ /* Let setup failure terminate early. */ \ @@ -1016,7 +1003,14 @@ void __wait_for_test(struct __test_metadata *t) .sa_flags = SA_SIGINFO, }; struct sigaction saved_action; - int status; + /* + * Sets status so that WIFEXITED(status) returns true and + * WEXITSTATUS(status) returns KSFT_FAIL. This safe default value + * should never be evaluated because of the waitpid(2) check and + * SIGALRM handling. 
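	 * The shift follows the wait(2) status layout, in which a normal exit
	 * code occupies bits 8-15: WIFEXITED(KSFT_FAIL << 8) is true and
	 * WEXITSTATUS() yields KSFT_FAIL.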
+ */ + int status = KSFT_FAIL << 8; + int child; if (sigaction(SIGALRM, &action, &saved_action)) { t->exit_code = KSFT_FAIL; @@ -1028,7 +1022,15 @@ void __wait_for_test(struct __test_metadata *t) __active_test = t; t->timed_out = false; alarm(t->timeout); - waitpid(t->pid, &status, 0); + child = waitpid(t->pid, &status, 0); + if (child == -1 && errno != EINTR) { + t->exit_code = KSFT_FAIL; + fprintf(TH_LOG_STREAM, + "# %s: Failed to wait for PID %d (errno: %d)\n", + t->name, t->pid, errno); + return; + } + alarm(0); if (sigaction(SIGALRM, &saved_action, NULL)) { t->exit_code = KSFT_FAIL; @@ -1083,6 +1085,7 @@ void __wait_for_test(struct __test_metadata *t) WTERMSIG(status)); } } else { + t->exit_code = KSFT_FAIL; fprintf(TH_LOG_STREAM, "# %s: Test ended in some other way [%u]\n", t->name, @@ -1218,6 +1221,7 @@ void __run_test(struct __fixture_metadata *f, struct __test_xfail *xfail; char test_name[1024]; const char *diagnostic; + int child; /* reset test struct */ t->exit_code = KSFT_PASS; @@ -1236,15 +1240,16 @@ void __run_test(struct __fixture_metadata *f, fflush(stdout); fflush(stderr); - t->pid = clone3_vfork(); - if (t->pid < 0) { + child = fork(); + if (child < 0) { ksft_print_msg("ERROR SPAWNING TEST CHILD\n"); t->exit_code = KSFT_FAIL; - } else if (t->pid == 0) { + } else if (child == 0) { setpgrp(); t->fn(t, variant); _exit(t->exit_code); } else { + t->pid = child; __wait_for_test(t); } ksft_print_msg(" %4s %s\n", diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c index 2bfed46e0b19..d66336256580 100644 --- a/tools/testing/selftests/net/af_unix/scm_rights.c +++ b/tools/testing/selftests/net/af_unix/scm_rights.c @@ -14,12 +14,12 @@ FIXTURE(scm_rights) { - int fd[16]; + int fd[32]; }; FIXTURE_VARIANT(scm_rights) { - char name[16]; + char name[32]; int type; int flags; bool test_listener; @@ -172,6 +172,8 @@ static void __create_sockets(struct __test_metadata *_metadata, const FIXTURE_VARIANT(scm_rights) *variant, int n) { + ASSERT_LE(n * 2, sizeof(self->fd) / sizeof(self->fd[0])); + if (variant->test_listener) create_listeners(_metadata, self, n); else @@ -283,4 +285,23 @@ TEST_F(scm_rights, cross_edge) close_sockets(8); } +TEST_F(scm_rights, backtrack_from_scc) +{ + create_sockets(10); + + send_fd(0, 1); + send_fd(0, 4); + send_fd(1, 2); + send_fd(2, 3); + send_fd(3, 1); + + send_fd(5, 6); + send_fd(5, 9); + send_fd(6, 7); + send_fd(7, 8); + send_fd(8, 6); + + close_sockets(10); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index bdc03a2097e8..7ea5fb28c93d 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -85,6 +85,7 @@ static bool cfg_rx; static int cfg_runtime_ms = 4200; static int cfg_verbose; static int cfg_waittime_ms = 500; +static int cfg_notification_limit = 32; static bool cfg_zerocopy; static socklen_t cfg_alen; @@ -95,6 +96,7 @@ static char payload[IP_MAXPACKET]; static long packets, bytes, completions, expected_completions; static int zerocopied = -1; static uint32_t next_completion; +static uint32_t sends_since_notify; static unsigned long gettimeofday_ms(void) { @@ -208,6 +210,7 @@ static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain) error(1, errno, "send"); if (cfg_verbose && ret != len) fprintf(stderr, "send: ret=%u != %u\n", ret, len); + sends_since_notify++; if (len) { packets++; @@ -435,7 +438,7 @@ static bool do_recv_completion(int fd, int 
domain) /* Detect notification gaps. These should not happen often, if at all. * Gaps can occur due to drops, reordering and retransmissions. */ - if (lo != next_completion) + if (cfg_verbose && lo != next_completion) fprintf(stderr, "gap: %u..%u does not append to %u\n", lo, hi, next_completion); next_completion = hi + 1; @@ -460,6 +463,7 @@ static bool do_recv_completion(int fd, int domain) static void do_recv_completions(int fd, int domain) { while (do_recv_completion(fd, domain)) {} + sends_since_notify = 0; } /* Wait for all remaining completions on the errqueue */ @@ -549,6 +553,9 @@ static void do_tx(int domain, int type, int protocol) else do_sendmsg(fd, &msg, cfg_zerocopy, domain); + if (cfg_zerocopy && sends_since_notify >= cfg_notification_limit) + do_recv_completions(fd, domain); + while (!do_poll(fd, POLLOUT)) { if (cfg_zerocopy) do_recv_completions(fd, domain); @@ -708,7 +715,7 @@ static void parse_opts(int argc, char **argv) cfg_payload_len = max_payload_len; - while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) { + while ((c = getopt(argc, argv, "46c:C:D:i:l:mp:rs:S:t:vz")) != -1) { switch (c) { case '4': if (cfg_family != PF_UNSPEC) @@ -736,6 +743,9 @@ static void parse_opts(int argc, char **argv) if (cfg_ifindex == 0) error(1, errno, "invalid iface: %s", optarg); break; + case 'l': + cfg_notification_limit = strtoul(optarg, NULL, 0); + break; case 'm': cfg_cork_mixed = true; break; diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 40dd95228051..3fbabab46958 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -152,7 +152,7 @@ CFLAGS_mips32be = -EB -mabi=32 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all)) CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra \ $(call cc-option,-fno-stack-protector) \ - $(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) + $(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) $(CFLAGS_EXTRA) LDFLAGS := REPORT ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++; print;} /\[SKIPPED\][\r]*$$/{s++} \ diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 94bb6e11c16f..093d0512f4c5 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -64,6 +64,14 @@ static const char *argv0; /* will be used by constructor tests */ static int constructor_test_value; +static const int is_nolibc = +#ifdef NOLIBC + 1 +#else + 0 +#endif +; + /* definition of a series of tests */ struct test { const char *name; /* test name */ @@ -607,7 +615,7 @@ int expect_strne(const char *expr, int llen, const char *cmp) static __attribute__((unused)) int expect_str_buf_eq(size_t expr, const char *buf, size_t val, int llen, const char *cmp) { - llen += printf(" = %lu <%s> ", expr, buf); + llen += printf(" = %lu <%s> ", (unsigned long)expr, buf); if (strcmp(buf, cmp) != 0) { result(llen, FAIL); return 1; @@ -621,6 +629,51 @@ int expect_str_buf_eq(size_t expr, const char *buf, size_t val, int llen, const return 0; } +#define EXPECT_STRTOX(cond, func, input, base, expected, chars, expected_errno) \ + do { if (!(cond)) result(llen, SKIPPED); else ret += expect_strtox(llen, func, input, base, expected, chars, expected_errno); } while (0) + +static __attribute__((unused)) +int expect_strtox(int llen, void *func, const char *input, int base, intmax_t expected, int 
expected_chars, int expected_errno) +{ + char *endptr; + int actual_errno, actual_chars; + intmax_t r; + + errno = 0; + if (func == strtol) { + r = strtol(input, &endptr, base); + } else if (func == strtoul) { + r = strtoul(input, &endptr, base); + } else { + result(llen, FAIL); + return 1; + } + actual_errno = errno; + actual_chars = endptr - input; + + llen += printf(" %lld = %lld", (long long)expected, (long long)r); + if (r != expected) { + result(llen, FAIL); + return 1; + } + if (expected_chars == -1) { + if (*endptr != '\0') { + result(llen, FAIL); + return 1; + } + } else if (expected_chars != actual_chars) { + result(llen, FAIL); + return 1; + } + if (actual_errno != expected_errno) { + result(llen, FAIL); + return 1; + } + + result(llen, OK); + return 0; +} + /* declare tests based on line numbers. There must be exactly one test per line. */ #define CASE_TEST(name) \ case __LINE__: llen += printf("%d %s", test, #name); @@ -942,6 +995,7 @@ int run_syscall(int min, int max) int ret = 0; void *p1, *p2; int has_gettid = 1; + int has_brk; /* <proc> indicates whether or not /proc is mounted */ proc = stat("/proc", &stat_buf) == 0; @@ -954,6 +1008,9 @@ int run_syscall(int min, int max) has_gettid = __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 30); #endif + /* on musl setting brk()/sbrk() always fails */ + has_brk = brk(0) == 0; + for (test = min; test >= 0 && test <= max; test++) { int llen = 0; /* line length */ @@ -969,9 +1026,9 @@ int run_syscall(int min, int max) CASE_TEST(kill_0); EXPECT_SYSZR(1, kill(getpid(), 0)); break; CASE_TEST(kill_CONT); EXPECT_SYSZR(1, kill(getpid(), 0)); break; CASE_TEST(kill_BADPID); EXPECT_SYSER(1, kill(INT_MAX, 0), -1, ESRCH); break; - CASE_TEST(sbrk_0); EXPECT_PTRNE(1, sbrk(0), (void *)-1); break; - CASE_TEST(sbrk); if ((p1 = p2 = sbrk(4096)) != (void *)-1) p2 = sbrk(-4096); EXPECT_SYSZR(1, (p2 == (void *)-1) || p2 == p1); break; - CASE_TEST(brk); EXPECT_SYSZR(1, brk(sbrk(0))); break; + CASE_TEST(sbrk_0); EXPECT_PTRNE(has_brk, sbrk(0), (void *)-1); break; + CASE_TEST(sbrk); if ((p1 = p2 = sbrk(4096)) != (void *)-1) p2 = sbrk(-4096); EXPECT_SYSZR(has_brk, (p2 == (void *)-1) || p2 == p1); break; + CASE_TEST(brk); EXPECT_SYSZR(has_brk, brk(sbrk(0))); break; CASE_TEST(chdir_root); EXPECT_SYSZR(1, chdir("/")); chdir(getenv("PWD")); break; CASE_TEST(chdir_dot); EXPECT_SYSZR(1, chdir(".")); break; CASE_TEST(chdir_blah); EXPECT_SYSER(1, chdir("/blah"), -1, ENOENT); break; @@ -1076,19 +1133,17 @@ int run_stdlib(int min, int max) CASE_TEST(strchr_foobar_z); EXPECT_STRZR(1, strchr("foobar", 'z')); break; CASE_TEST(strrchr_foobar_o); EXPECT_STREQ(1, strrchr("foobar", 'o'), "obar"); break; CASE_TEST(strrchr_foobar_z); EXPECT_STRZR(1, strrchr("foobar", 'z')); break; -#ifdef NOLIBC - CASE_TEST(strlcat_0); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 0), buf, 3, "test"); break; - CASE_TEST(strlcat_1); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 1), buf, 4, "test"); break; - CASE_TEST(strlcat_5); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 5), buf, 7, "test"); break; - CASE_TEST(strlcat_6); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 6), buf, 7, "testb"); break; - CASE_TEST(strlcat_7); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 7), buf, 7, "testba"); break; - CASE_TEST(strlcat_8); EXPECT_STRBUFEQ(1, strlcat(buf, "bar", 8), buf, 7, "testbar"); break; - CASE_TEST(strlcpy_0); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 0), buf, 3, "test"); break; - CASE_TEST(strlcpy_1); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 1), buf, 3, ""); break; - CASE_TEST(strlcpy_2); EXPECT_STRBUFEQ(1, strlcpy(buf, 
"bar", 2), buf, 3, "b"); break; - CASE_TEST(strlcpy_3); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 3), buf, 3, "ba"); break; - CASE_TEST(strlcpy_4); EXPECT_STRBUFEQ(1, strlcpy(buf, "bar", 4), buf, 3, "bar"); break; -#endif + CASE_TEST(strlcat_0); EXPECT_STRBUFEQ(is_nolibc, strlcat(buf, "bar", 0), buf, 3, "test"); break; + CASE_TEST(strlcat_1); EXPECT_STRBUFEQ(is_nolibc, strlcat(buf, "bar", 1), buf, 4, "test"); break; + CASE_TEST(strlcat_5); EXPECT_STRBUFEQ(is_nolibc, strlcat(buf, "bar", 5), buf, 7, "test"); break; + CASE_TEST(strlcat_6); EXPECT_STRBUFEQ(is_nolibc, strlcat(buf, "bar", 6), buf, 7, "testb"); break; + CASE_TEST(strlcat_7); EXPECT_STRBUFEQ(is_nolibc, strlcat(buf, "bar", 7), buf, 7, "testba"); break; + CASE_TEST(strlcat_8); EXPECT_STRBUFEQ(is_nolibc, strlcat(buf, "bar", 8), buf, 7, "testbar"); break; + CASE_TEST(strlcpy_0); EXPECT_STRBUFEQ(is_nolibc, strlcpy(buf, "bar", 0), buf, 3, "test"); break; + CASE_TEST(strlcpy_1); EXPECT_STRBUFEQ(is_nolibc, strlcpy(buf, "bar", 1), buf, 3, ""); break; + CASE_TEST(strlcpy_2); EXPECT_STRBUFEQ(is_nolibc, strlcpy(buf, "bar", 2), buf, 3, "b"); break; + CASE_TEST(strlcpy_3); EXPECT_STRBUFEQ(is_nolibc, strlcpy(buf, "bar", 3), buf, 3, "ba"); break; + CASE_TEST(strlcpy_4); EXPECT_STRBUFEQ(is_nolibc, strlcpy(buf, "bar", 4), buf, 3, "bar"); break; CASE_TEST(memcmp_20_20); EXPECT_EQ(1, memcmp("aaa\x20", "aaa\x20", 4), 0); break; CASE_TEST(memcmp_20_60); EXPECT_LT(1, memcmp("aaa\x20", "aaa\x60", 4), 0); break; CASE_TEST(memcmp_60_20); EXPECT_GT(1, memcmp("aaa\x60", "aaa\x20", 4), 0); break; @@ -1139,6 +1194,26 @@ int run_stdlib(int min, int max) CASE_TEST(limit_ptrdiff_min); EXPECT_EQ(1, PTRDIFF_MIN, sizeof(long) == 8 ? (ptrdiff_t) 0x8000000000000000LL : (ptrdiff_t) 0x80000000); break; CASE_TEST(limit_ptrdiff_max); EXPECT_EQ(1, PTRDIFF_MAX, sizeof(long) == 8 ? (ptrdiff_t) 0x7fffffffffffffffLL : (ptrdiff_t) 0x7fffffff); break; CASE_TEST(limit_size_max); EXPECT_EQ(1, SIZE_MAX, sizeof(long) == 8 ? 
(size_t) 0xffffffffffffffffULL : (size_t) 0xffffffffU); break; + CASE_TEST(strtol_simple); EXPECT_STRTOX(1, strtol, "35", 10, 35, -1, 0); break; + CASE_TEST(strtol_positive); EXPECT_STRTOX(1, strtol, "+35", 10, 35, -1, 0); break; + CASE_TEST(strtol_negative); EXPECT_STRTOX(1, strtol, "-35", 10, -35, -1, 0); break; + CASE_TEST(strtol_hex_auto); EXPECT_STRTOX(1, strtol, "0xFF", 0, 255, -1, 0); break; + CASE_TEST(strtol_base36); EXPECT_STRTOX(1, strtol, "12yZ", 36, 50507, -1, 0); break; + CASE_TEST(strtol_cutoff); EXPECT_STRTOX(1, strtol, "1234567890", 8, 342391, 7, 0); break; + CASE_TEST(strtol_octal_auto); EXPECT_STRTOX(1, strtol, "011", 0, 9, -1, 0); break; + CASE_TEST(strtol_hex_00); EXPECT_STRTOX(1, strtol, "0x00", 16, 0, -1, 0); break; + CASE_TEST(strtol_hex_FF); EXPECT_STRTOX(1, strtol, "FF", 16, 255, -1, 0); break; + CASE_TEST(strtol_hex_ff); EXPECT_STRTOX(1, strtol, "ff", 16, 255, -1, 0); break; + CASE_TEST(strtol_hex_prefix); EXPECT_STRTOX(1, strtol, "0xFF", 16, 255, -1, 0); break; + CASE_TEST(strtol_trailer); EXPECT_STRTOX(1, strtol, "35foo", 10, 35, 2, 0); break; + CASE_TEST(strtol_overflow); EXPECT_STRTOX(1, strtol, "0x8000000000000000", 16, LONG_MAX, -1, ERANGE); break; + CASE_TEST(strtol_underflow); EXPECT_STRTOX(1, strtol, "-0x8000000000000001", 16, LONG_MIN, -1, ERANGE); break; + CASE_TEST(strtoul_negative); EXPECT_STRTOX(1, strtoul, "-0x1", 16, ULONG_MAX, 4, 0); break; + CASE_TEST(strtoul_overflow); EXPECT_STRTOX(1, strtoul, "0x10000000000000000", 16, ULONG_MAX, -1, ERANGE); break; + CASE_TEST(strerror_success); EXPECT_STREQ(is_nolibc, strerror(0), "errno=0"); break; + CASE_TEST(strerror_EINVAL); EXPECT_STREQ(is_nolibc, strerror(EINVAL), "errno=22"); break; + CASE_TEST(strerror_int_max); EXPECT_STREQ(is_nolibc, strerror(INT_MAX), "errno=2147483647"); break; + CASE_TEST(strerror_int_min); EXPECT_STREQ(is_nolibc, strerror(INT_MIN), "errno=-2147483648"); break; case __LINE__: return ret; /* must be last */ diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index c0a5a7cea9fa..0446e6326a40 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -15,9 +15,10 @@ download_location="${cache_dir}/crosstools/" build_location="$(realpath "${cache_dir}"/nolibc-tests/)" perform_download=0 test_mode=system +CFLAGS_EXTRA="-Werror" archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv s390 loongarch" -TEMP=$(getopt -o 'j:d:c:b:a:m:ph' -n "$0" -- "$@") +TEMP=$(getopt -o 'j:d:c:b:a:m:peh' -n "$0" -- "$@") eval set -- "$TEMP" unset TEMP @@ -40,6 +41,7 @@ Options: -a [ARCH] Host architecture of toolchains to use (default: ${hostarch}) -b [DIR] Build location (default: ${build_location}) -m [MODE] Test mode user/system (default: ${test_mode}) + -e Disable -Werror EOF } @@ -66,6 +68,9 @@ while true; do '-m') test_mode="$2" shift 2; continue ;; + '-e') + CFLAGS_EXTRA="" + shift; continue ;; '-h') print_usage exit 0 @@ -153,7 +158,7 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" - swallow_output "${MAKE[@]}" "$test_target" V=1 + swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" "$test_target" V=1 cp run.out run.out."${arch}" "${MAKE[@]}" report | grep passed } diff --git a/tools/testing/selftests/powerpc/flags.mk b/tools/testing/selftests/powerpc/flags.mk index b909bee3cb2a..abb9e58d95c4 100644 --- a/tools/testing/selftests/powerpc/flags.mk +++ b/tools/testing/selftests/powerpc/flags.mk @@ -5,8 +5,5 @@ GIT_VERSION := $(shell git describe --always --long --dirty || 
echo "unknown") export GIT_VERSION endif -ifeq ($(CFLAGS),) -CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(selfdir)/powerpc/include $(CFLAGS) +CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(selfdir)/powerpc/include $(USERCFLAGS) export CFLAGS -endif - diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index c7686fb6641a..55315ed695f4 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -291,11 +291,30 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param return ret; } +static bool arch_supports_noncont_cat(const struct resctrl_test *test) +{ + unsigned int eax, ebx, ecx, edx; + + /* AMD always supports non-contiguous CBM. */ + if (get_vendor() == ARCH_AMD) + return true; + + /* Intel support for non-contiguous CBM needs to be discovered. */ + if (!strcmp(test->resource, "L3")) + __cpuid_count(0x10, 1, eax, ebx, ecx, edx); + else if (!strcmp(test->resource, "L2")) + __cpuid_count(0x10, 2, eax, ebx, ecx, edx); + else + return false; + + return ((ecx >> 3) & 1); +} + static int noncont_cat_run_test(const struct resctrl_test *test, const struct user_params *uparams) { unsigned long full_cache_mask, cont_mask, noncont_mask; - unsigned int eax, ebx, ecx, edx, sparse_masks; + unsigned int sparse_masks; int bit_center, ret; char schemata[64]; @@ -304,15 +323,8 @@ static int noncont_cat_run_test(const struct resctrl_test *test, if (ret) return ret; - if (!strcmp(test->resource, "L3")) - __cpuid_count(0x10, 1, eax, ebx, ecx, edx); - else if (!strcmp(test->resource, "L2")) - __cpuid_count(0x10, 2, eax, ebx, ecx, edx); - else - return -EINVAL; - - if (sparse_masks != ((ecx >> 3) & 1)) { - ksft_print_msg("CPUID output doesn't match 'sparse_masks' file content!\n"); + if (arch_supports_noncont_cat(test) != sparse_masks) { + ksft_print_msg("Hardware and kernel differ on non-contiguous CBM support!\n"); return 1; } diff --git a/tools/testing/selftests/riscv/sigreturn/sigreturn.c b/tools/testing/selftests/riscv/sigreturn/sigreturn.c index 62397d5934f1..ed351a1cb917 100644 --- a/tools/testing/selftests/riscv/sigreturn/sigreturn.c +++ b/tools/testing/selftests/riscv/sigreturn/sigreturn.c @@ -51,7 +51,7 @@ static int vector_sigreturn(int data, void (*handler)(int, siginfo_t *, void *)) asm(".option push \n\ .option arch, +v \n\ - vsetivli x0, 1, e32, ta, ma \n\ + vsetivli x0, 1, e32, m1, ta, ma \n\ vmv.s.x v0, %1 \n\ # Generate SIGSEGV \n\ lw a0, 0(x0) \n\ diff --git a/tools/testing/selftests/timens/exec.c b/tools/testing/selftests/timens/exec.c index e40dc5be2f66..d12ff955de0d 100644 --- a/tools/testing/selftests/timens/exec.c +++ b/tools/testing/selftests/timens/exec.c @@ -30,7 +30,7 @@ int main(int argc, char *argv[]) for (i = 0; i < 2; i++) { _gettime(CLOCK_MONOTONIC, &tst, i); - if (abs(tst.tv_sec - now.tv_sec) > 5) + if (labs(tst.tv_sec - now.tv_sec) > 5) return pr_fail("%ld %ld\n", now.tv_sec, tst.tv_sec); } return 0; @@ -50,7 +50,7 @@ int main(int argc, char *argv[]) for (i = 0; i < 2; i++) { _gettime(CLOCK_MONOTONIC, &tst, i); - if (abs(tst.tv_sec - now.tv_sec) > 5) + if (labs(tst.tv_sec - now.tv_sec) > 5) return pr_fail("%ld %ld\n", now.tv_sec, tst.tv_sec); } @@ -70,7 +70,7 @@ int main(int argc, char *argv[]) /* Check that a child process is in the new timens. 
*/ for (i = 0; i < 2; i++) { _gettime(CLOCK_MONOTONIC, &tst, i); - if (abs(tst.tv_sec - now.tv_sec - OFFSET) > 5) + if (labs(tst.tv_sec - now.tv_sec - OFFSET) > 5) return pr_fail("%ld %ld\n", now.tv_sec + OFFSET, tst.tv_sec); } diff --git a/tools/testing/selftests/timens/timer.c b/tools/testing/selftests/timens/timer.c index 5e7f0051bd7b..5b939f59dfa4 100644 --- a/tools/testing/selftests/timens/timer.c +++ b/tools/testing/selftests/timens/timer.c @@ -56,7 +56,7 @@ int run_test(int clockid, struct timespec now) return pr_perror("timerfd_gettime"); elapsed = new_value.it_value.tv_sec; - if (abs(elapsed - 3600) > 60) { + if (llabs(elapsed - 3600) > 60) { ksft_test_result_fail("clockid: %d elapsed: %lld\n", clockid, elapsed); return 1; diff --git a/tools/testing/selftests/timens/timerfd.c b/tools/testing/selftests/timens/timerfd.c index 9edd43d6b2c1..a4196bbd6e33 100644 --- a/tools/testing/selftests/timens/timerfd.c +++ b/tools/testing/selftests/timens/timerfd.c @@ -61,7 +61,7 @@ int run_test(int clockid, struct timespec now) return pr_perror("timerfd_gettime(%d)", clockid); elapsed = new_value.it_value.tv_sec; - if (abs(elapsed - 3600) > 60) { + if (llabs(elapsed - 3600) > 60) { ksft_test_result_fail("clockid: %d elapsed: %lld\n", clockid, elapsed); return 1; diff --git a/tools/testing/selftests/timens/vfork_exec.c b/tools/testing/selftests/timens/vfork_exec.c index beb7614941fb..5b8907bf451d 100644 --- a/tools/testing/selftests/timens/vfork_exec.c +++ b/tools/testing/selftests/timens/vfork_exec.c @@ -32,7 +32,7 @@ static void *tcheck(void *_args) for (i = 0; i < 2; i++) { _gettime(CLOCK_MONOTONIC, &tst, i); - if (abs(tst.tv_sec - now->tv_sec) > 5) { + if (labs(tst.tv_sec - now->tv_sec) > 5) { pr_fail("%s: in-thread: unexpected value: %ld (%ld)\n", args->tst_name, tst.tv_sec, now->tv_sec); return (void *)1UL; @@ -64,7 +64,7 @@ static int check(char *tst_name, struct timespec *now) for (i = 0; i < 2; i++) { _gettime(CLOCK_MONOTONIC, &tst, i); - if (abs(tst.tv_sec - now->tv_sec) > 5) + if (labs(tst.tv_sec - now->tv_sec) > 5) return pr_fail("%s: unexpected value: %ld (%ld)\n", tst_name, tst.tv_sec, now->tv_sec); } diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index d53a4d8008f9..98d8ba2afa00 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -1,35 +1,30 @@ # SPDX-License-Identifier: GPL-2.0 -include ../lib.mk - uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu -TEST_GEN_PROGS += $(OUTPUT)/vdso_test_abi -TEST_GEN_PROGS += $(OUTPUT)/vdso_test_clock_getres +TEST_GEN_PROGS := vdso_test_gettimeofday +TEST_GEN_PROGS += vdso_test_getcpu +TEST_GEN_PROGS += vdso_test_abi +TEST_GEN_PROGS += vdso_test_clock_getres ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) -TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86 +TEST_GEN_PROGS += vdso_standalone_test_x86 endif -TEST_GEN_PROGS += $(OUTPUT)/vdso_test_correctness +TEST_GEN_PROGS += vdso_test_correctness CFLAGS := -std=gnu99 -CFLAGS_vdso_standalone_test_x86 := -nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector -LDFLAGS_vdso_test_correctness := -ldl + ifeq ($(CONFIG_X86_32),y) LDLIBS += -lgcc_s endif -all: $(TEST_GEN_PROGS) +include ../lib.mk $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_test_abi: parse_vdso.c 
vdso_test_abi.c $(OUTPUT)/vdso_test_clock_getres: vdso_test_clock_getres.c + $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c - $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \ - vdso_standalone_test_x86.c parse_vdso.c \ - -o $@ +$(OUTPUT)/vdso_standalone_test_x86: CFLAGS +=-nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector + $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c - $(CC) $(CFLAGS) \ - vdso_test_correctness.c \ - -o $@ \ - $(LDFLAGS_vdso_test_correctness) +$(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 413f75620a35..4ae417372e9e 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -55,14 +55,20 @@ static struct vdso_info ELF(Verdef) *verdef; } vdso_info; -/* Straight from the ELF specification. */ -static unsigned long elf_hash(const unsigned char *name) +/* + * Straight from the ELF specification...and then tweaked slightly, in order to + * avoid a few clang warnings. + */ +static unsigned long elf_hash(const char *name) { unsigned long h = 0, g; - while (*name) + const unsigned char *uch_name = (const unsigned char *)name; + + while (*uch_name) { - h = (h << 4) + *name++; - if (g = h & 0xf0000000) + h = (h << 4) + *uch_name++; + g = h & 0xf0000000; + if (g) h ^= g >> 24; h &= ~g; } diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c index 8a44ff973ee1..27f6fdf11969 100644 --- a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c +++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c @@ -18,7 +18,7 @@ #include "parse_vdso.h" -/* We need a libc functions... */ +/* We need some libc functions... */ int strcmp(const char *a, const char *b) { /* This implementation is buggy: it never returns -1. */ @@ -34,6 +34,20 @@ int strcmp(const char *a, const char *b) return 0; } +/* + * The clang build needs this, although gcc does not. + * Stolen from lib/string.c. + */ +void *memcpy(void *dest, const void *src, size_t count) +{ + char *tmp = dest; + const char *s = src; + + while (count--) + *tmp++ = *s++; + return dest; +} + /* ...and two syscalls. This is x86-specific. 
*/ static inline long x86_syscall3(long nr, long a0, long a1, long a2) { @@ -70,7 +84,7 @@ void to_base10(char *lastdig, time_t n) } } -__attribute__((externally_visible)) void c_main(void **stack) +void c_main(void **stack) { /* Parse the stack */ long argc = (long)*stack; diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index e95bd56b332f..35856b11c143 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -109,9 +109,9 @@ KERNEL_ARCH := x86_64 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(HOST_ARCH),$(ARCH)) -QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off -no-acpi +QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off,acpi=off else -QEMU_MACHINE := -cpu max -machine microvm -no-acpi +QEMU_MACHINE := -cpu max -machine microvm,acpi=off endif else ifeq ($(ARCH),i686) CHOST := i686-linux-musl @@ -120,9 +120,9 @@ KERNEL_ARCH := x86 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(subst x86_64,i686,$(HOST_ARCH)),$(ARCH)) -QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off -no-acpi +QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off,acpi=off else -QEMU_MACHINE := -cpu coreduo -machine microvm -no-acpi +QEMU_MACHINE := -cpu coreduo -machine microvm,acpi=off endif else ifeq ($(ARCH),mips64) CHOST := mips64-linux-musl |
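The expect_strtox() helper added to nolibc-test.c above checks three things at once: the converted value, how many characters the conversion consumed (via endptr, with -1 meaning the whole string), and the resulting errno. As a rough standalone illustration of that strtol()/strtoul() contract (a sketch for reference only, not part of the patch above; it assumes nothing beyond standard C), the same cases can be exercised directly:

/* Illustrative sketch only; mirrors the strtol_* cases in nolibc-test.c above. */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *hex = "0xFF", *trail = "35foo", *huge = "0x8000000000000000";
	char *end;
	long v;

	/* base 0 auto-detects the 0x prefix: "0xFF" parses as 255, 4 chars consumed */
	v = strtol(hex, &end, 0);
	printf("%s -> %ld (consumed %td chars)\n", hex, v, end - hex);

	/* conversion stops at the first non-digit: "35foo" yields 35, endptr at "foo" */
	v = strtol(trail, &end, 10);
	printf("%s -> %ld (trailer \"%s\")\n", trail, v, end);

	/* out-of-range input is clamped to LONG_MAX and errno is set to ERANGE */
	errno = 0;
	v = strtol(huge, &end, 16);
	printf("%s -> %ld (errno %s ERANGE)\n", huge, v, errno == ERANGE ? "==" : "!=");

	return 0;
}

With a typical libc this prints 255, 35 and LONG_MAX, which is what the strtol_hex_prefix, strtol_trailer and strtol_overflow test cases above expect.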