-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst | 9
-rw-r--r--  Documentation/security/landlock.rst | 14
-rw-r--r--  Documentation/userspace-api/landlock.rst | 90
-rw-r--r--  arch/x86/kvm/lapic.c | 29
-rw-r--r--  arch/x86/kvm/svm/sev.c | 15
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 30
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 6
-rw-r--r--  drivers/md/dm-bufio.c | 12
-rw-r--r--  drivers/md/dm-cache-background-tracker.c | 25
-rw-r--r--  drivers/md/dm-cache-background-tracker.h | 8
-rw-r--r--  drivers/md/dm-cache-target.c | 25
-rw-r--r--  drivers/vdpa/ifcvf/ifcvf_base.c | 2
-rw-r--r--  drivers/vdpa/mlx5/core/mr.c | 8
-rw-r--r--  drivers/vdpa/mlx5/net/mlx5_vnet.c | 21
-rw-r--r--  drivers/vdpa/solidrun/snet_main.c | 14
-rw-r--r--  drivers/vdpa/virtio_pci/vp_vdpa.c | 10
-rw-r--r--  drivers/virtio/virtio_pci_common.c | 24
-rw-r--r--  drivers/virtio/virtio_pci_common.h | 1
-rw-r--r--  drivers/virtio/virtio_pci_modern.c | 12
-rw-r--r--  fs/nilfs2/btnode.c | 2
-rw-r--r--  fs/nilfs2/gcinode.c | 4
-rw-r--r--  fs/nilfs2/mdt.c | 1
-rw-r--r--  fs/nilfs2/page.c | 2
-rw-r--r--  fs/ocfs2/super.c | 13
-rw-r--r--  include/linux/memcontrol.h | 12
-rw-r--r--  include/linux/vm_event_item.h | 2
-rw-r--r--  kernel/sched/core.c | 13
-rw-r--r--  kernel/sched/ext.c | 46
-rw-r--r--  kernel/sched/sched.h | 5
-rw-r--r--  mm/gup.c | 116
-rw-r--r--  mm/huge_memory.c | 4
-rw-r--r--  mm/memcontrol.c | 4
-rw-r--r--  mm/nommu.c | 2
-rw-r--r--  mm/page_alloc.c | 15
-rw-r--r--  mm/page_io.c | 16
-rw-r--r--  mm/swap.c | 14
-rw-r--r--  mm/swapfile.c | 2
-rw-r--r--  mm/vmstat.c | 2
-rw-r--r--  mm/zswap.c | 6
-rw-r--r--  net/vmw_vsock/virtio_transport_common.c | 1
-rw-r--r--  samples/landlock/sandboxer.c | 112
-rw-r--r--  security/integrity/evm/evm_main.c | 3
-rw-r--r--  security/integrity/ima/ima_template_lib.c | 14
-rw-r--r--  security/integrity/integrity.h | 4
-rw-r--r--  security/landlock/fs.c | 31
-rw-r--r--  security/landlock/net.c | 28
-rw-r--r--  security/landlock/ruleset.h | 74
-rw-r--r--  security/landlock/syscalls.c | 2
-rw-r--r--  security/landlock/task.c | 18
-rw-r--r--  tools/sched_ext/scx_show_state.py | 2
-rw-r--r--  tools/testing/selftests/kvm/Makefile | 10
-rw-r--r--  tools/testing/selftests/kvm/guest_memfd_test.c | 2
-rw-r--r--  tools/testing/selftests/kvm/lib/x86_64/vmx.c | 2
-rw-r--r--  tools/testing/selftests/kvm/memslot_perf_test.c | 2
-rw-r--r--  tools/testing/selftests/mm/hugetlb_dio.c | 7
-rw-r--r--  tools/virtio/vringh_test.c | 2
56 files changed, 592 insertions(+), 358 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 69af2173555f..6d02168d78be 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1599,6 +1599,15 @@ The following nested keys are defined.
pglazyfreed (npn)
Amount of reclaimed lazyfree pages
+ swpin_zero
+ Number of pages swapped into memory and filled with zero, where I/O
+ was optimized out because the page content was detected to be zero
+ during swapout.
+
+ swpout_zero
+ Number of zero-filled pages swapped out with I/O skipped due to the
+ content being detected as zero.
+
zswpin
Number of pages moved in to memory from zswap.
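The two counters documented above appear in each cgroup's memory.stat file as plain "key value" lines. The following is a minimal illustrative sketch, not part of this series, of reading them from user space; the cgroup path is hypothetical.

/*
 * Illustrative only -- not part of this patch series. Reads swpin_zero and
 * swpout_zero from a cgroup v2 memory.stat file.
 */
#include <stdio.h>
#include <string.h>

static int read_zero_swap_counters(const char *memory_stat_path)
{
	char key[64];
	unsigned long long value;
	FILE *f = fopen(memory_stat_path, "r");

	if (!f)
		return -1;
	/* memory.stat is a flat keyed file: one "name value" pair per line. */
	while (fscanf(f, "%63s %llu", key, &value) == 2) {
		if (!strcmp(key, "swpin_zero") || !strcmp(key, "swpout_zero"))
			printf("%s %llu\n", key, value);
	}
	fclose(f);
	return 0;
}

/* Usage: read_zero_swap_counters("/sys/fs/cgroup/example/memory.stat"); */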
diff --git a/Documentation/security/landlock.rst b/Documentation/security/landlock.rst
index 36f26501fd15..59ecdb1c0d4d 100644
--- a/Documentation/security/landlock.rst
+++ b/Documentation/security/landlock.rst
@@ -11,18 +11,18 @@ Landlock LSM: kernel documentation
Landlock's goal is to create scoped access-control (i.e. sandboxing). To
harden a whole system, this feature should be available to any process,
-including unprivileged ones. Because such process may be compromised or
+including unprivileged ones. Because such a process may be compromised or
backdoored (i.e. untrusted), Landlock's features must be safe to use from the
kernel and other processes point of view. Landlock's interface must therefore
expose a minimal attack surface.
Landlock is designed to be usable by unprivileged processes while following the
system security policy enforced by other access control mechanisms (e.g. DAC,
-LSM). Indeed, a Landlock rule shall not interfere with other access-controls
-enforced on the system, only add more restrictions.
+LSM). A Landlock rule shall not interfere with other access-controls enforced
+on the system, only add more restrictions.
Any user can enforce Landlock rulesets on their processes. They are merged and
-evaluated according to the inherited ones in a way that ensures that only more
+evaluated against inherited rulesets in a way that ensures that only more
constraints can be added.
User space documentation can be found here:
@@ -43,7 +43,7 @@ Guiding principles for safe access controls
only impact the processes requesting them.
* Resources (e.g. file descriptors) directly obtained from the kernel by a
sandboxed process shall retain their scoped accesses (at the time of resource
- acquisition) whatever process use them.
+ acquisition) whatever process uses them.
Cf. `File descriptor access rights`_.
Design choices
@@ -71,7 +71,7 @@ the same results, when they are executed under the same Landlock domain.
Taking the ``LANDLOCK_ACCESS_FS_TRUNCATE`` right as an example, it may be
allowed to open a file for writing without being allowed to
:manpage:`ftruncate` the resulting file descriptor if the related file
-hierarchy doesn't grant such access right. The following sequences of
+hierarchy doesn't grant that access right. The following sequences of
operations have the same semantic and should then have the same result:
* ``truncate(path);``
@@ -81,7 +81,7 @@ Similarly to file access modes (e.g. ``O_RDWR``), Landlock access rights
attached to file descriptors are retained even if they are passed between
processes (e.g. through a Unix domain socket). Such access rights will then be
enforced even if the receiving process is not sandboxed by Landlock. Indeed,
-this is required to keep a consistent access control over the whole system, and
+this is required to keep access controls consistent over the whole system, and
this avoids unattended bypasses through file descriptor passing (i.e. confused
deputy attack).
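The LANDLOCK_ACCESS_FS_TRUNCATE semantics described in the hunk above can be illustrated with a short sketch (not from this series): both sequences require the truncate right, and for ftruncate() the right checked is the one held when the file descriptor was opened.

/* Illustrative only -- both paths are equivalent from Landlock's point of view. */
#include <fcntl.h>
#include <unistd.h>

static int truncate_two_ways(const char *path)
{
	int fd;

	if (truncate(path, 0) < 0)	/* checked against the caller's current domain */
		return -1;

	fd = open(path, O_WRONLY);	/* truncate right is attached to the fd here */
	if (fd < 0)
		return -1;
	if (ftruncate(fd, 0) < 0) {	/* checked against rights held at open time */
		close(fd);
		return -1;
	}
	return close(fd);
}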
diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst
index c8d3e46badc5..d639c61cb472 100644
--- a/Documentation/userspace-api/landlock.rst
+++ b/Documentation/userspace-api/landlock.rst
@@ -8,13 +8,13 @@ Landlock: unprivileged access control
=====================================
:Author: Mickaël Salaün
-:Date: September 2024
+:Date: October 2024
-The goal of Landlock is to enable to restrict ambient rights (e.g. global
+The goal of Landlock is to enable restriction of ambient rights (e.g. global
filesystem or network access) for a set of processes. Because Landlock
-is a stackable LSM, it makes possible to create safe security sandboxes as new
-security layers in addition to the existing system-wide access-controls. This
-kind of sandbox is expected to help mitigate the security impact of bugs or
+is a stackable LSM, it makes it possible to create safe security sandboxes as
+new security layers in addition to the existing system-wide access-controls.
+This kind of sandbox is expected to help mitigate the security impact of bugs or
unexpected/malicious behaviors in user space applications. Landlock empowers
any process, including unprivileged ones, to securely restrict themselves.
@@ -86,8 +86,8 @@ to be explicit about the denied-by-default access rights.
LANDLOCK_SCOPE_SIGNAL,
};
-Because we may not know on which kernel version an application will be
-executed, it is safer to follow a best-effort security approach. Indeed, we
+Because we may not know which kernel version an application will be executed
+on, it is safer to follow a best-effort security approach. Indeed, we
should try to protect users as much as possible whatever the kernel they are
using.
@@ -129,7 +129,7 @@ version, and only use the available subset of access rights:
LANDLOCK_SCOPE_SIGNAL);
}
-This enables to create an inclusive ruleset that will contain our rules.
+This enables the creation of an inclusive ruleset that will contain our rules.
.. code-block:: c
@@ -219,42 +219,41 @@ If the ``landlock_restrict_self`` system call succeeds, the current thread is
now restricted and this policy will be enforced on all its subsequently created
children as well. Once a thread is landlocked, there is no way to remove its
security policy; only adding more restrictions is allowed. These threads are
-now in a new Landlock domain, merge of their parent one (if any) with the new
-ruleset.
+now in a new Landlock domain, which is a merger of their parent one (if any)
+with the new ruleset.
Full working code can be found in `samples/landlock/sandboxer.c`_.
Good practices
--------------
-It is recommended setting access rights to file hierarchy leaves as much as
+It is recommended to set access rights to file hierarchy leaves as much as
possible. For instance, it is better to be able to have ``~/doc/`` as a
read-only hierarchy and ``~/tmp/`` as a read-write hierarchy, compared to
``~/`` as a read-only hierarchy and ``~/tmp/`` as a read-write hierarchy.
Following this good practice leads to self-sufficient hierarchies that do not
depend on their location (i.e. parent directories). This is particularly
relevant when we want to allow linking or renaming. Indeed, having consistent
-access rights per directory enables to change the location of such directory
+access rights per directory enables changing the location of such directories
without relying on the destination directory access rights (except those that
are required for this operation, see ``LANDLOCK_ACCESS_FS_REFER``
documentation).
Having self-sufficient hierarchies also helps to tighten the required access
rights to the minimal set of data. This also helps avoid sinkhole directories,
-i.e. directories where data can be linked to but not linked from. However,
+i.e. directories where data can be linked to but not linked from. However,
this depends on data organization, which might not be controlled by developers.
In this case, granting read-write access to ``~/tmp/``, instead of write-only
-access, would potentially allow to move ``~/tmp/`` to a non-readable directory
+access, would potentially allow moving ``~/tmp/`` to a non-readable directory
and still keep the ability to list the content of ``~/tmp/``.
Layers of file path access rights
---------------------------------
Each time a thread enforces a ruleset on itself, it updates its Landlock domain
-with a new layer of policy. Indeed, this complementary policy is stacked with
-the potentially other rulesets already restricting this thread. A sandboxed
-thread can then safely add more constraints to itself with a new enforced
-ruleset.
+with a new layer of policy. This complementary policy is stacked with any
+other rulesets potentially already restricting this thread. A sandboxed thread
+can then safely add more constraints to itself with a new enforced ruleset.
One policy layer grants access to a file path if at least one of its rules
encountered on the path grants the access. A sandboxed thread can only access
@@ -265,7 +264,7 @@ etc.).
Bind mounts and OverlayFS
-------------------------
-Landlock enables to restrict access to file hierarchies, which means that these
+Landlock enables restricting access to file hierarchies, which means that these
access rights can be propagated with bind mounts (cf.
Documentation/filesystems/sharedsubtree.rst) but not with
Documentation/filesystems/overlayfs.rst.
@@ -278,21 +277,21 @@ access to multiple file hierarchies at the same time, whether these hierarchies
are the result of bind mounts or not.
An OverlayFS mount point consists of upper and lower layers. These layers are
-combined in a merge directory, result of the mount point. This merge hierarchy
-may include files from the upper and lower layers, but modifications performed
-on the merge hierarchy only reflects on the upper layer. From a Landlock
-policy point of view, each OverlayFS layers and merge hierarchies are
-standalone and contains their own set of files and directories, which is
-different from bind mounts. A policy restricting an OverlayFS layer will not
-restrict the resulted merged hierarchy, and vice versa. Landlock users should
-then only think about file hierarchies they want to allow access to, regardless
-of the underlying filesystem.
+combined in a merge directory, and that merged directory becomes available at
+the mount point. This merge hierarchy may include files from the upper and
+lower layers, but modifications performed on the merge hierarchy only reflect
+on the upper layer. From a Landlock policy point of view, all OverlayFS layers
+and merge hierarchies are standalone and each contains their own set of files
+and directories, which is different from bind mounts. A policy restricting an
+OverlayFS layer will not restrict the resulted merged hierarchy, and vice versa.
+Landlock users should then only think about file hierarchies they want to allow
+access to, regardless of the underlying filesystem.
Inheritance
-----------
Every new thread resulting from a :manpage:`clone(2)` inherits Landlock domain
-restrictions from its parent. This is similar to the seccomp inheritance (cf.
+restrictions from its parent. This is similar to seccomp inheritance (cf.
Documentation/userspace-api/seccomp_filter.rst) or any other LSM dealing with
task's :manpage:`credentials(7)`. For instance, one process's thread may apply
Landlock rules to itself, but they will not be automatically applied to other
@@ -311,8 +310,8 @@ Ptrace restrictions
A sandboxed process has less privileges than a non-sandboxed process and must
then be subject to additional restrictions when manipulating another process.
To be allowed to use :manpage:`ptrace(2)` and related syscalls on a target
-process, a sandboxed process should have a subset of the target process rules,
-which means the tracee must be in a sub-domain of the tracer.
+process, a sandboxed process should have a superset of the target process's
+access rights, which means the tracee must be in a sub-domain of the tracer.
IPC scoping
-----------
@@ -322,7 +321,7 @@ interactions between sandboxes. Each Landlock domain can be explicitly scoped
for a set of actions by specifying it on a ruleset. For example, if a
sandboxed process should not be able to :manpage:`connect(2)` to a
non-sandboxed process through abstract :manpage:`unix(7)` sockets, we can
-specify such restriction with ``LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET``.
+specify such a restriction with ``LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET``.
Moreover, if a sandboxed process should not be able to send a signal to a
non-sandboxed process, we can specify this restriction with
``LANDLOCK_SCOPE_SIGNAL``.
@@ -394,7 +393,7 @@ Backward and forward compatibility
Landlock is designed to be compatible with past and future versions of the
kernel. This is achieved thanks to the system call attributes and the
associated bitflags, particularly the ruleset's ``handled_access_fs``. Making
-handled access right explicit enables the kernel and user space to have a clear
+handled access rights explicit enables the kernel and user space to have a clear
contract with each other. This is required to make sure sandboxing will not
get stricter with a system update, which could break applications.
@@ -563,33 +562,34 @@ always allowed when using a kernel that only supports the first or second ABI.
Starting with the Landlock ABI version 3, it is now possible to securely control
truncation thanks to the new ``LANDLOCK_ACCESS_FS_TRUNCATE`` access right.
-Network support (ABI < 4)
--------------------------
+TCP bind and connect (ABI < 4)
+------------------------------
Starting with the Landlock ABI version 4, it is now possible to restrict TCP
bind and connect actions to only a set of allowed ports thanks to the new
``LANDLOCK_ACCESS_NET_BIND_TCP`` and ``LANDLOCK_ACCESS_NET_CONNECT_TCP``
access rights.
-IOCTL (ABI < 5)
----------------
+Device IOCTL (ABI < 5)
+----------------------
IOCTL operations could not be denied before the fifth Landlock ABI, so
:manpage:`ioctl(2)` is always allowed when using a kernel that only supports an
earlier ABI.
Starting with the Landlock ABI version 5, it is possible to restrict the use of
-:manpage:`ioctl(2)` using the new ``LANDLOCK_ACCESS_FS_IOCTL_DEV`` right.
+:manpage:`ioctl(2)` on character and block devices using the new
+``LANDLOCK_ACCESS_FS_IOCTL_DEV`` right.
-Abstract UNIX socket scoping (ABI < 6)
---------------------------------------
+Abstract UNIX socket (ABI < 6)
+------------------------------
Starting with the Landlock ABI version 6, it is possible to restrict
connections to an abstract :manpage:`unix(7)` socket by setting
``LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET`` to the ``scoped`` ruleset attribute.
-Signal scoping (ABI < 6)
-------------------------
+Signal (ABI < 6)
+----------------
Starting with the Landlock ABI version 6, it is possible to restrict
:manpage:`signal(7)` sending by setting ``LANDLOCK_SCOPE_SIGNAL`` to the
@@ -605,9 +605,9 @@ Build time configuration
Landlock was first introduced in Linux 5.13 but it must be configured at build
time with ``CONFIG_SECURITY_LANDLOCK=y``. Landlock must also be enabled at boot
-time as the other security modules. The list of security modules enabled by
+time like other security modules. The list of security modules enabled by
default is set with ``CONFIG_LSM``. The kernel configuration should then
-contains ``CONFIG_LSM=landlock,[...]`` with ``[...]`` as the list of other
+contain ``CONFIG_LSM=landlock,[...]`` with ``[...]`` as the list of other
potentially useful security modules for the running system (see the
``CONFIG_LSM`` help).
@@ -669,7 +669,7 @@ Questions and answers
What about user space sandbox managers?
---------------------------------------
-Using user space process to enforce restrictions on kernel resources can lead
+Using user space processes to enforce restrictions on kernel resources can lead
to race conditions or inconsistent evaluations (i.e. `Incorrect mirroring of
the OS code and state
<https://www.ndss-symposium.org/ndss2003/traps-and-pitfalls-practical-problems-system-call-interposition-based-security-tools/>`_).
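Tying the compatibility sections above together, the following is a minimal sketch, not from this series, of the best-effort approach the documentation describes: probe the Landlock ABI, drop handled access rights the running kernel does not know about, then enforce the ruleset. Rule population with landlock_add_rule() and most error reporting are omitted for brevity.

/* Illustrative only -- not part of this patch series. */
#include <linux/landlock.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int restrict_thread_best_effort(void)
{
	struct landlock_ruleset_attr attr = {
		.handled_access_fs = LANDLOCK_ACCESS_FS_WRITE_FILE |
				     LANDLOCK_ACCESS_FS_TRUNCATE,
	};
	int abi, ruleset_fd;

	/* Query the highest Landlock ABI supported by the running kernel. */
	abi = syscall(__NR_landlock_create_ruleset, NULL, 0,
		      LANDLOCK_CREATE_RULESET_VERSION);
	if (abi < 0)
		return -1;	/* Landlock is not built in or is disabled at boot. */
	if (abi < 3)		/* LANDLOCK_ACCESS_FS_TRUNCATE requires ABI >= 3. */
		attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_TRUNCATE;

	ruleset_fd = syscall(__NR_landlock_create_ruleset, &attr, sizeof(attr), 0);
	if (ruleset_fd < 0)
		return -1;

	/* Rules would be added here with landlock_add_rule(); omitted for brevity. */

	/* Required to enforce a ruleset without CAP_SYS_ADMIN. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
	    syscall(__NR_landlock_restrict_self, ruleset_fd, 0)) {
		close(ruleset_fd);
		return -1;
	}
	return close(ruleset_fd);
}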
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2098dc689088..95c6beb8ce27 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2629,19 +2629,26 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic->apicv_active) {
- /* irr_pending is always true when apicv is activated. */
- apic->irr_pending = true;
+ /*
+ * When APICv is enabled, KVM must always search the IRR for a pending
+ * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU
+ * isn't running. If APICv is disabled, KVM _should_ search the IRR
+ * for a pending IRQ. But KVM currently doesn't ensure *all* hardware,
+ * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching
+ * the IRR at this time could race with IRQ delivery from hardware that
+ * still sees APICv as being enabled.
+ *
+ * FIXME: Ensure other vCPUs and devices observe the change in APICv
+ * state prior to updating KVM's metadata caches, so that KVM
+ * can safely search the IRR and set irr_pending accordingly.
+ */
+ apic->irr_pending = true;
+
+ if (apic->apicv_active)
apic->isr_count = 1;
- } else {
- /*
- * Don't clear irr_pending, searching the IRR can race with
- * updates from the CPU as APICv is still active from hardware's
- * perspective. The flag will be cleared as appropriate when
- * KVM injects the interrupt.
- */
+ else
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
- }
+
apic->highest_isr_cache = -1;
}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0b851ef937f2..fb854cf20ac3 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -450,8 +450,11 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
goto e_free;
/* This needs to happen after SEV/SNP firmware initialization. */
- if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm))
- goto e_free;
+ if (vm_type == KVM_X86_SNP_VM) {
+ ret = snp_guest_req_init(kvm);
+ if (ret)
+ goto e_free;
+ }
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
@@ -2212,10 +2215,6 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (sev->snp_context)
return -EINVAL;
- sev->snp_context = snp_context_create(kvm, argp);
- if (!sev->snp_context)
- return -ENOTTY;
-
if (params.flags)
return -EINVAL;
@@ -2230,6 +2229,10 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
return -EINVAL;
+ sev->snp_context = snp_context_create(kvm, argp);
+ if (!sev->snp_context)
+ return -ENOTTY;
+
start.gctx_paddr = __psp_pa(sev->snp_context);
start.policy = params.policy;
memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index a8e7bc04d9bf..931a7361c30f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1197,11 +1197,14 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
/*
- * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
- * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
- * full TLB flush from the guest's perspective. This is required even
- * if VPID is disabled in the host as KVM may need to synchronize the
- * MMU in response to the guest TLB flush.
+ * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
+ * same VPID as the host, and so architecturally, linear and combined
+ * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM
+ * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
+ * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This
+ * is required if VPID is disabled in KVM, as a TLB flush (there are no
+ * VPIDs) still occurs from L1's perspective, and KVM may need to
+ * synchronize the MMU in response to the guest TLB flush.
*
* Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
* EPT is a special snowflake, as guest-physical mappings aren't
@@ -2315,6 +2318,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
+ /*
+ * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
+ * same VPID as the host. Emulate this behavior by using vpid01 for L2
+ * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter
+ * and VM-Exit are architecturally required to flush VPID=0, but *only*
+ * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the
+ * required flushes), but doing so would cause KVM to over-flush. E.g.
+ * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
+ * and then runs L2 X again, then KVM can and should retain TLB entries
+ * for VPID12=1.
+ */
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
@@ -5950,6 +5964,12 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ /*
+ * Always flush the effective vpid02, i.e. never flush the current VPID
+ * and never explicitly flush vpid01. INVVPID targets a VPID, not a
+ * VMCS, and so whether or not the current vmcs12 has VPID enabled is
+ * irrelevant (and there may not be a loaded vmcs12).
+ */
vpid02 = nested_get_vpid02(vcpu);
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 81ed596e4454..d28618e9277e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -217,9 +217,11 @@ module_param(ple_window_shrink, uint, 0444);
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
-/* Default is SYSTEM mode, 1 for host-guest mode */
+/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
+#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
+#endif
struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
@@ -3216,7 +3218,7 @@ void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu))
+ if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
return nested_get_vpid02(vcpu);
return to_vmx(vcpu)->vpid;
}
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index d478aafa02c9..23e0b71b991e 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -2471,7 +2471,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
int r;
unsigned int num_locks;
struct dm_bufio_client *c;
- char slab_name[27];
+ char slab_name[64];
+ static atomic_t seqno = ATOMIC_INIT(0);
if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
DMERR("%s: block size not specified or is not multiple of 512b", __func__);
@@ -2522,7 +2523,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
(block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);
- snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u", block_size);
+ snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u-%u",
+ block_size, atomic_inc_return(&seqno));
c->slab_cache = kmem_cache_create(slab_name, block_size, align,
SLAB_RECLAIM_ACCOUNT, NULL);
if (!c->slab_cache) {
@@ -2531,9 +2533,11 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
}
}
if (aux_size)
- snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", aux_size);
+ snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u-%u",
+ aux_size, atomic_inc_return(&seqno));
else
- snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer");
+ snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u",
+ atomic_inc_return(&seqno));
c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
0, SLAB_RECLAIM_ACCOUNT, NULL);
if (!c->slab_buffer) {
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
index 9c5308298cf1..f3051bd7d2df 100644
--- a/drivers/md/dm-cache-background-tracker.c
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -11,12 +11,6 @@
#define DM_MSG_PREFIX "dm-background-tracker"
-struct bt_work {
- struct list_head list;
- struct rb_node node;
- struct policy_work work;
-};
-
struct background_tracker {
unsigned int max_work;
atomic_t pending_promotes;
@@ -26,10 +20,10 @@ struct background_tracker {
struct list_head issued;
struct list_head queued;
struct rb_root pending;
-
- struct kmem_cache *work_cache;
};
+struct kmem_cache *btracker_work_cache = NULL;
+
struct background_tracker *btracker_create(unsigned int max_work)
{
struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
@@ -48,12 +42,6 @@ struct background_tracker *btracker_create(unsigned int max_work)
INIT_LIST_HEAD(&b->queued);
b->pending = RB_ROOT;
- b->work_cache = KMEM_CACHE(bt_work, 0);
- if (!b->work_cache) {
- DMERR("couldn't create mempool for background work items");
- kfree(b);
- b = NULL;
- }
return b;
}
@@ -66,10 +54,9 @@ void btracker_destroy(struct background_tracker *b)
BUG_ON(!list_empty(&b->issued));
list_for_each_entry_safe (w, tmp, &b->queued, list) {
list_del(&w->list);
- kmem_cache_free(b->work_cache, w);
+ kmem_cache_free(btracker_work_cache, w);
}
- kmem_cache_destroy(b->work_cache);
kfree(b);
}
EXPORT_SYMBOL_GPL(btracker_destroy);
@@ -180,7 +167,7 @@ static struct bt_work *alloc_work(struct background_tracker *b)
if (max_work_reached(b))
return NULL;
- return kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
+ return kmem_cache_alloc(btracker_work_cache, GFP_NOWAIT);
}
int btracker_queue(struct background_tracker *b,
@@ -203,7 +190,7 @@ int btracker_queue(struct background_tracker *b,
* There was a race, we'll just ignore this second
* bit of work for the same oblock.
*/
- kmem_cache_free(b->work_cache, w);
+ kmem_cache_free(btracker_work_cache, w);
return -EINVAL;
}
@@ -244,7 +231,7 @@ void btracker_complete(struct background_tracker *b,
update_stats(b, &w->work, -1);
rb_erase(&w->node, &b->pending);
list_del(&w->list);
- kmem_cache_free(b->work_cache, w);
+ kmem_cache_free(btracker_work_cache, w);
}
EXPORT_SYMBOL_GPL(btracker_complete);
diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h
index 5b8f5c667b81..09c8fc59f7bb 100644
--- a/drivers/md/dm-cache-background-tracker.h
+++ b/drivers/md/dm-cache-background-tracker.h
@@ -26,6 +26,14 @@
* protected with a spinlock.
*/
+struct bt_work {
+ struct list_head list;
+ struct rb_node node;
+ struct policy_work work;
+};
+
+extern struct kmem_cache *btracker_work_cache;
+
struct background_work;
struct background_tracker;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 40709310e327..849eb6333e98 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -10,6 +10,7 @@
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include "dm-io-tracker.h"
+#include "dm-cache-background-tracker.h"
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
@@ -2263,7 +2264,7 @@ static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
/*----------------------------------------------------------------*/
-static struct kmem_cache *migration_cache;
+static struct kmem_cache *migration_cache = NULL;
#define NOT_CORE_OPTION 1
@@ -3445,22 +3446,36 @@ static int __init dm_cache_init(void)
int r;
migration_cache = KMEM_CACHE(dm_cache_migration, 0);
- if (!migration_cache)
- return -ENOMEM;
+ if (!migration_cache) {
+ r = -ENOMEM;
+ goto err;
+ }
+
+ btracker_work_cache = kmem_cache_create("dm_cache_bt_work",
+ sizeof(struct bt_work), __alignof__(struct bt_work), 0, NULL);
+ if (!btracker_work_cache) {
+ r = -ENOMEM;
+ goto err;
+ }
r = dm_register_target(&cache_target);
if (r) {
- kmem_cache_destroy(migration_cache);
- return r;
+ goto err;
}
return 0;
+
+err:
+ kmem_cache_destroy(migration_cache);
+ kmem_cache_destroy(btracker_work_cache);
+ return r;
}
static void __exit dm_cache_exit(void)
{
dm_unregister_target(&cache_target);
kmem_cache_destroy(migration_cache);
+ kmem_cache_destroy(btracker_work_cache);
}
module_init(dm_cache_init);
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
index 472daa588a9d..d5507b63b6cd 100644
--- a/drivers/vdpa/ifcvf/ifcvf_base.c
+++ b/drivers/vdpa/ifcvf/ifcvf_base.c
@@ -108,7 +108,7 @@ int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev)
u32 i;
ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
- if (ret < 0) {
+ if (ret) {
IFCVF_ERR(pdev, "Failed to read PCI capability list\n");
return -EIO;
}
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 2dd21e0b399e..7d0c83b5b071 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -373,7 +373,7 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr
struct page *pg;
unsigned int nsg;
int sglen;
- u64 pa;
+ u64 pa, offset;
u64 paend;
struct scatterlist *sg;
struct device *dma = mvdev->vdev.dma_dev;
@@ -396,8 +396,10 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr
sg = mr->sg_head.sgl;
for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1);
map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) {
- paend = map->addr + maplen(map, mr);
- for (pa = map->addr; pa < paend; pa += sglen) {
+ offset = mr->start > map->start ? mr->start - map->start : 0;
+ pa = map->addr + offset;
+ paend = map->addr + offset + maplen(map, mr);
+ for (; pa < paend; pa += sglen) {
pg = pfn_to_page(__phys_to_pfn(pa));
if (!sg) {
mlx5_vdpa_warn(mvdev, "sg null. start 0x%llx, end 0x%llx\n",
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index dee019977716..5f581e71e201 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3963,28 +3963,28 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
mvdev->vdev.dma_dev = &mdev->pdev->dev;
err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
if (err)
- goto err_mpfs;
+ goto err_alloc;
err = mlx5_vdpa_init_mr_resources(mvdev);
if (err)
- goto err_res;
+ goto err_alloc;
if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
err = mlx5_vdpa_create_dma_mr(mvdev);
if (err)
- goto err_mr_res;
+ goto err_alloc;
}
err = alloc_fixed_resources(ndev);
if (err)
- goto err_mr;
+ goto err_alloc;
ndev->cvq_ent.mvdev = mvdev;
INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
if (!mvdev->wq) {
err = -ENOMEM;
- goto err_res2;
+ goto err_alloc;
}
mvdev->vdev.mdev = &mgtdev->mgtdev;
@@ -4010,17 +4010,6 @@ err_setup_vq_res:
_vdpa_unregister_device(&mvdev->vdev);
err_reg:
destroy_workqueue(mvdev->wq);
-err_res2:
- free_fixed_resources(ndev);
-err_mr:
- mlx5_vdpa_clean_mrs(mvdev);
-err_mr_res:
- mlx5_vdpa_destroy_mr_resources(mvdev);
-err_res:
- mlx5_vdpa_free_resources(&ndev->mvdev);
-err_mpfs:
- if (!is_zero_ether_addr(config->mac))
- mlx5_mpfs_del_mac(pfmdev, config->mac);
err_alloc:
put_device(&mvdev->vdev.dev);
return err;
diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c
index 99428a04068d..c8b74980dbd1 100644
--- a/drivers/vdpa/solidrun/snet_main.c
+++ b/drivers/vdpa/solidrun/snet_main.c
@@ -555,7 +555,7 @@ static const struct vdpa_config_ops snet_config_ops = {
static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
{
- char name[50];
+ char *name;
int ret, i, mask = 0;
/* We don't know which BAR will be used to communicate..
* We will map every bar with len > 0.
@@ -573,7 +573,10 @@ static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
return -ENODEV;
}
- snprintf(name, sizeof(name), "psnet[%s]-bars", pci_name(pdev));
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "psnet[%s]-bars", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
ret = pcim_iomap_regions(pdev, mask, name);
if (ret) {
SNET_ERR(pdev, "Failed to request and map PCI BARs\n");
@@ -590,10 +593,13 @@ static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
static int snet_open_vf_bar(struct pci_dev *pdev, struct snet *snet)
{
- char name[50];
+ char *name;
int ret;
- snprintf(name, sizeof(name), "snet[%s]-bar", pci_name(pdev));
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "snet[%s]-bars", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
/* Request and map BAR */
ret = pcim_iomap_regions(pdev, BIT(snet->psnet->cfg.vf_bar), name);
if (ret) {
diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c
index ac4ab22f7d8b..16380764275e 100644
--- a/drivers/vdpa/virtio_pci/vp_vdpa.c
+++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
@@ -612,7 +612,11 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto mdev_err;
}
- mdev_id = kzalloc(sizeof(struct virtio_device_id), GFP_KERNEL);
+ /*
+ * id_table should be a null terminated array, so allocate one additional
+ * entry here, see vdpa_mgmtdev_get_classes().
+ */
+ mdev_id = kcalloc(2, sizeof(struct virtio_device_id), GFP_KERNEL);
if (!mdev_id) {
err = -ENOMEM;
goto mdev_id_err;
@@ -632,8 +636,8 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto probe_err;
}
- mdev_id->device = mdev->id.device;
- mdev_id->vendor = mdev->id.vendor;
+ mdev_id[0].device = mdev->id.device;
+ mdev_id[0].vendor = mdev->id.vendor;
mgtdev->id_table = mdev_id;
mgtdev->max_supported_vqs = vp_modern_get_num_queues(mdev);
mgtdev->supported_features = vp_modern_get_features(mdev);
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index c44d8ba00c02..88074451dd61 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -24,6 +24,16 @@ MODULE_PARM_DESC(force_legacy,
"Force legacy mode for transitional virtio 1 devices");
#endif
+bool vp_is_avq(struct virtio_device *vdev, unsigned int index)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+ if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
+ return false;
+
+ return index == vp_dev->admin_vq.vq_index;
+}
+
/* wait for pending irq handlers */
void vp_synchronize_vectors(struct virtio_device *vdev)
{
@@ -234,10 +244,9 @@ out_info:
return vq;
}
-static void vp_del_vq(struct virtqueue *vq)
+static void vp_del_vq(struct virtqueue *vq, struct virtio_pci_vq_info *info)
{
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
- struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index];
unsigned long flags;
/*
@@ -258,13 +267,16 @@ static void vp_del_vq(struct virtqueue *vq)
void vp_del_vqs(struct virtio_device *vdev)
{
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ struct virtio_pci_vq_info *info;
struct virtqueue *vq, *n;
int i;
list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
- if (vp_dev->per_vq_vectors) {
- int v = vp_dev->vqs[vq->index]->msix_vector;
+ info = vp_is_avq(vdev, vq->index) ? vp_dev->admin_vq.info :
+ vp_dev->vqs[vq->index];
+ if (vp_dev->per_vq_vectors) {
+ int v = info->msix_vector;
if (v != VIRTIO_MSI_NO_VECTOR &&
!vp_is_slow_path_vector(v)) {
int irq = pci_irq_vector(vp_dev->pci_dev, v);
@@ -273,7 +285,7 @@ void vp_del_vqs(struct virtio_device *vdev)
free_irq(irq, vq);
}
}
- vp_del_vq(vq);
+ vp_del_vq(vq, info);
}
vp_dev->per_vq_vectors = false;
@@ -354,7 +366,7 @@ vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx,
vring_interrupt, 0,
vp_dev->msix_names[msix_vec], vq);
if (err) {
- vp_del_vq(vq);
+ vp_del_vq(vq, *p_info);
return ERR_PTR(err);
}
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 1d9c49947f52..8beecf23ec85 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -178,6 +178,7 @@ struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev);
#define VIRTIO_ADMIN_CMD_BITMAP 0
#endif
+bool vp_is_avq(struct virtio_device *vdev, unsigned int index);
void vp_modern_avq_done(struct virtqueue *vq);
int vp_modern_admin_cmd_exec(struct virtio_device *vdev,
struct virtio_admin_cmd *cmd);
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 9193c30d640a..4fbcbc7a9ae1 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -43,16 +43,6 @@ static int vp_avq_index(struct virtio_device *vdev, u16 *index, u16 *num)
return 0;
}
-static bool vp_is_avq(struct virtio_device *vdev, unsigned int index)
-{
- struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-
- if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
- return false;
-
- return index == vp_dev->admin_vq.vq_index;
-}
-
void vp_modern_avq_done(struct virtqueue *vq)
{
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
@@ -245,7 +235,7 @@ static void vp_modern_avq_cleanup(struct virtio_device *vdev)
if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
return;
- vq = vp_dev->vqs[vp_dev->admin_vq.vq_index]->vq;
+ vq = vp_dev->admin_vq.info->vq;
if (!vq)
return;
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 57b4af5ad646..501ad7be5174 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -68,7 +68,6 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
goto failed;
}
memset(bh->b_data, 0, i_blocksize(inode));
- bh->b_bdev = inode->i_sb->s_bdev;
bh->b_blocknr = blocknr;
set_buffer_mapped(bh);
set_buffer_uptodate(bh);
@@ -133,7 +132,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
goto found;
}
set_buffer_mapped(bh);
- bh->b_bdev = inode->i_sb->s_bdev;
bh->b_blocknr = pblocknr; /* set block address for read */
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1c9ae36a03ab..ace22253fed0 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -83,10 +83,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
goto out;
}
- if (!buffer_mapped(bh)) {
- bh->b_bdev = inode->i_sb->s_bdev;
+ if (!buffer_mapped(bh))
set_buffer_mapped(bh);
- }
bh->b_blocknr = pbn;
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index ceb7dc0b5bad..2db6350b5ac2 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -89,7 +89,6 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
if (buffer_uptodate(bh))
goto failed_bh;
- bh->b_bdev = sb->s_bdev;
err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
if (likely(!err)) {
get_bh(bh);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 10def4b55995..6dd8b854cd1f 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -39,7 +39,6 @@ static struct buffer_head *__nilfs_get_folio_block(struct folio *folio,
first_block = (unsigned long)index << (PAGE_SHIFT - blkbits);
bh = get_nth_bh(bh, block - first_block);
- touch_buffer(bh);
wait_on_buffer(bh);
return bh;
}
@@ -64,6 +63,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
folio_put(folio);
return NULL;
}
+ bh->b_bdev = inode->i_sb->s_bdev;
return bh;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3d404624bb96..c79b4291777f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2319,6 +2319,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
struct ocfs2_blockcheck_stats *stats)
{
int status = -EAGAIN;
+ u32 blksz_bits;
if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
@@ -2333,11 +2334,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
goto out;
}
status = -EINVAL;
- if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
+ /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. */
+ blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
+ if (blksz_bits < 9 || blksz_bits > 12) {
mlog(ML_ERROR, "found superblock with incorrect block "
- "size: found %u, should be %u\n",
- 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
- blksz);
+ "size bits: found %u, should be 9, 10, 11, or 12\n",
+ blksz_bits);
+ } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) {
+ mlog(ML_ERROR, "found superblock with incorrect block "
+ "size: found %u, should be %u\n", 1 << blksz_bits, blksz);
} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
OCFS2_MAJOR_REV_LEVEL ||
le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 34d2da05f2f1..e1b41554a5fb 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1760,8 +1760,9 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
-static inline void count_objcg_event(struct obj_cgroup *objcg,
- enum vm_event_item idx)
+static inline void count_objcg_events(struct obj_cgroup *objcg,
+ enum vm_event_item idx,
+ unsigned long count)
{
struct mem_cgroup *memcg;
@@ -1770,7 +1771,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- count_memcg_events(memcg, idx, 1);
+ count_memcg_events(memcg, idx, count);
rcu_read_unlock();
}
@@ -1825,8 +1826,9 @@ static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
return NULL;
}
-static inline void count_objcg_event(struct obj_cgroup *objcg,
- enum vm_event_item idx)
+static inline void count_objcg_events(struct obj_cgroup *objcg,
+ enum vm_event_item idx,
+ unsigned long count)
{
}
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index aed952d04132..f70d0958095c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -134,6 +134,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_SWAP
SWAP_RA,
SWAP_RA_HIT,
+ SWPIN_ZERO,
+ SWPOUT_ZERO,
#ifdef CONFIG_KSM
KSM_SWPIN_COPY,
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 719e0ed1e976..a1c353a62c56 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5920,12 +5920,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
#ifdef CONFIG_SCHED_CLASS_EXT
/*
- * SCX requires a balance() call before every pick_next_task() including
- * when waking up from SCHED_IDLE. If @start_class is below SCX, start
- * from SCX instead.
+ * SCX requires a balance() call before every pick_task() including when
+ * waking up from SCHED_IDLE. If @start_class is below SCX, start from
+ * SCX instead. Also, set a flag to detect missing balance() call.
*/
- if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
- start_class = &ext_sched_class;
+ if (scx_enabled()) {
+ rq->scx.flags |= SCX_RQ_BAL_PENDING;
+ if (sched_class_above(&ext_sched_class, start_class))
+ start_class = &ext_sched_class;
+ }
#endif
/*
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b5f4b1a5ae98..51b7e04879d7 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2634,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
- rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+ rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
unlikely(rq->scx.cpu_released)) {
@@ -2948,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
{
struct task_struct *prev = rq->curr;
struct task_struct *p;
+ bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ bool kick_idle = false;
/*
- * If balance_scx() is telling us to keep running @prev, replenish slice
- * if necessary and keep running @prev. Otherwise, pop the first one
- * from the local DSQ.
- *
* WORKAROUND:
*
* %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@@ -2962,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* which then ends up calling pick_task_scx() without preceding
* balance_scx().
*
- * For now, ignore cases where $prev is not on SCX. This isn't great and
- * can theoretically lead to stalls. However, for switch_all cases, this
- * happens only while a BPF scheduler is being loaded or unloaded, and,
- * for partial cases, fair will likely keep triggering this CPU.
+ * Keep running @prev if possible and avoid stalling from entering idle
+ * without balancing.
*
- * Once fair is fixed, restore WARN_ON_ONCE().
+ * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
+ * if pick_task_scx() is called without preceding balance_scx().
*/
- if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
- prev->sched_class == &ext_sched_class) {
+ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
+ if (prev_on_scx) {
+ keep_prev = true;
+ } else {
+ keep_prev = false;
+ kick_idle = true;
+ }
+ } else if (unlikely(keep_prev && !prev_on_scx)) {
+ /* only allowed during transitions */
+ WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+ keep_prev = false;
+ }
+
+ /*
+ * If balance_scx() is telling us to keep running @prev, replenish slice
+ * if necessary and keep running @prev. Otherwise, pop the first one
+ * from the local DSQ.
+ */
+ if (keep_prev) {
p = prev;
if (!p->scx.slice)
p->scx.slice = SCX_SLICE_DFL;
} else {
p = first_local_task(rq);
- if (!p)
+ if (!p) {
+ if (kick_idle)
+ scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
return NULL;
+ }
if (unlikely(!p->scx.slice)) {
if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
@@ -4979,7 +4997,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
- pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
return -EINVAL;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6c54a57275cc..c03b3d7b320e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -751,8 +751,9 @@ enum scx_rq_flags {
*/
SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1,
- SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */
- SCX_RQ_BYPASSING = 1 << 3,
+ SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
+ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
+ SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17,
diff --git a/mm/gup.c b/mm/gup.c
index 4637dab7b54f..ad0c8922dac3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2273,20 +2273,57 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */
#ifdef CONFIG_MIGRATION
+
+/*
+ * An array of either pages or folios ("pofs"). Although it may seem tempting to
+ * avoid this complication, by simply interpreting a list of folios as a list of
+ * pages, that approach won't work in the longer term, because eventually the
+ * layouts of struct page and struct folio will become completely different.
+ * Furthermore, this pof approach avoids excessive page_folio() calls.
+ */
+struct pages_or_folios {
+ union {
+ struct page **pages;
+ struct folio **folios;
+ void **entries;
+ };
+ bool has_folios;
+ long nr_entries;
+};
+
+static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
+{
+ if (pofs->has_folios)
+ return pofs->folios[i];
+ return page_folio(pofs->pages[i]);
+}
+
+static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
+{
+ pofs->entries[i] = NULL;
+}
+
+static void pofs_unpin(struct pages_or_folios *pofs)
+{
+ if (pofs->has_folios)
+ unpin_folios(pofs->folios, pofs->nr_entries);
+ else
+ unpin_user_pages(pofs->pages, pofs->nr_entries);
+}
+
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
static unsigned long collect_longterm_unpinnable_folios(
- struct list_head *movable_folio_list,
- unsigned long nr_folios,
- struct folio **folios)
+ struct list_head *movable_folio_list,
+ struct pages_or_folios *pofs)
{
unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
bool drain_allow = true;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = folios[i];
+ for (i = 0; i < pofs->nr_entries; i++) {
+ struct folio *folio = pofs_get_folio(pofs, i);
if (folio == prev_folio)
continue;
@@ -2327,16 +2364,15 @@ static unsigned long collect_longterm_unpinnable_folios(
* Returns -EAGAIN if all folios were successfully migrated or -errno for
* failure (or partial success).
*/
-static int migrate_longterm_unpinnable_folios(
- struct list_head *movable_folio_list,
- unsigned long nr_folios,
- struct folio **folios)
+static int
+migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list,
+ struct pages_or_folios *pofs)
{
int ret;
unsigned long i;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = folios[i];
+ for (i = 0; i < pofs->nr_entries; i++) {
+ struct folio *folio = pofs_get_folio(pofs, i);
if (folio_is_device_coherent(folio)) {
/*
@@ -2344,7 +2380,7 @@ static int migrate_longterm_unpinnable_folios(
* convert the pin on the source folio to a normal
* reference.
*/
- folios[i] = NULL;
+ pofs_clear_entry(pofs, i);
folio_get(folio);
gup_put_folio(folio, 1, FOLL_PIN);
@@ -2363,8 +2399,8 @@ static int migrate_longterm_unpinnable_folios(
* calling folio_isolate_lru() which takes a reference so the
* folio won't be freed if it's migrating.
*/
- unpin_folio(folios[i]);
- folios[i] = NULL;
+ unpin_folio(folio);
+ pofs_clear_entry(pofs, i);
}
if (!list_empty(movable_folio_list)) {
@@ -2387,12 +2423,26 @@ static int migrate_longterm_unpinnable_folios(
return -EAGAIN;
err:
- unpin_folios(folios, nr_folios);
+ pofs_unpin(pofs);
putback_movable_pages(movable_folio_list);
return ret;
}
+static long
+check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
+{
+ LIST_HEAD(movable_folio_list);
+ unsigned long collected;
+
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+ pofs);
+ if (!collected)
+ return 0;
+
+ return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
+}
+
/*
* Check whether all folios are *allowed* to be pinned indefinitely (long term).
* Rather confusingly, all folios in the range are required to be pinned via
@@ -2417,16 +2467,13 @@ err:
static long check_and_migrate_movable_folios(unsigned long nr_folios,
struct folio **folios)
{
- unsigned long collected;
- LIST_HEAD(movable_folio_list);
+ struct pages_or_folios pofs = {
+ .folios = folios,
+ .has_folios = true,
+ .nr_entries = nr_folios,
+ };
- collected = collect_longterm_unpinnable_folios(&movable_folio_list,
- nr_folios, folios);
- if (!collected)
- return 0;
-
- return migrate_longterm_unpinnable_folios(&movable_folio_list,
- nr_folios, folios);
+ return check_and_migrate_movable_pages_or_folios(&pofs);
}
/*
@@ -2436,22 +2483,13 @@ static long check_and_migrate_movable_folios(unsigned long nr_folios,
static long check_and_migrate_movable_pages(unsigned long nr_pages,
struct page **pages)
{
- struct folio **folios;
- long i, ret;
+ struct pages_or_folios pofs = {
+ .pages = pages,
+ .has_folios = false,
+ .nr_entries = nr_pages,
+ };
- folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL);
- if (!folios) {
- unpin_user_pages(pages, nr_pages);
- return -ENOMEM;
- }
-
- for (i = 0; i < nr_pages; i++)
- folios[i] = page_folio(pages[i]);
-
- ret = check_and_migrate_movable_folios(nr_pages, folios);
-
- kfree(folios);
- return ret;
+ return check_and_migrate_movable_pages_or_folios(&pofs);
}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 03fd4bc39ea1..5734d5d5060f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3790,7 +3790,9 @@ next:
* in the case it was underused, then consider it used and
* don't add it back to split_queue.
*/
- if (!did_split && !folio_test_partially_mapped(folio)) {
+ if (did_split) {
+ ; /* folio already removed from list */
+ } else if (!folio_test_partially_mapped(folio)) {
list_del_init(&folio->_deferred_list);
removed++;
} else {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06df2af97415..53db98d2c4a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -431,6 +431,10 @@ static const unsigned int memcg_vm_event_stat[] = {
PGDEACTIVATE,
PGLAZYFREE,
PGLAZYFREED,
+#ifdef CONFIG_SWAP
+ SWPIN_ZERO,
+ SWPOUT_ZERO,
+#endif
#ifdef CONFIG_ZSWAP
ZSWPIN,
ZSWPOUT,
diff --git a/mm/nommu.c b/mm/nommu.c
index e9b5f527ab5b..9cb6e99215e2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -573,7 +573,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);
vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
- if (vma_iter_prealloc(&vmi, vma)) {
+ if (vma_iter_prealloc(&vmi, NULL)) {
pr_warn("Allocation of vma tree for process %d failed\n",
current->pid);
return -ENOMEM;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c6c7bb3ea71b..216fbbfbedcf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1048,6 +1048,7 @@ __always_inline bool free_pages_prepare(struct page *page,
bool skip_kasan_poison = should_skip_kasan_poison(page);
bool init = want_init_on_free();
bool compound = PageCompound(page);
+ struct folio *folio = page_folio(page);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1057,6 +1058,20 @@ __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
+ /*
+ * In rare cases, when truncation or holepunching raced with
+ * munlock after VM_LOCKED was cleared, Mlocked may still be
+ * found set here. This does not indicate a problem, unless
+ * "unevictable_pgs_cleared" appears worryingly large.
+ */
+ if (unlikely(folio_test_mlocked(folio))) {
+ long nr_pages = folio_nr_pages(folio);
+
+ __folio_clear_mlocked(folio);
+ zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
+ count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
+ }
+
if (unlikely(PageHWPoison(page)) && !order) {
/* Do not let hwpoison pages hit pcplists/buddy */
reset_page_owner(page, order);
diff --git a/mm/page_io.c b/mm/page_io.c
index 69536a2b3c13..01749b99fb54 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -204,7 +204,9 @@ static bool is_folio_zero_filled(struct folio *folio)
static void swap_zeromap_folio_set(struct folio *folio)
{
+ struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ int nr_pages = folio_nr_pages(folio);
swp_entry_t entry;
unsigned int i;
@@ -212,6 +214,12 @@ static void swap_zeromap_folio_set(struct folio *folio)
entry = page_swap_entry(folio_page(folio, i));
set_bit(swp_offset(entry), sis->zeromap);
}
+
+ count_vm_events(SWPOUT_ZERO, nr_pages);
+ if (objcg) {
+ count_objcg_events(objcg, SWPOUT_ZERO, nr_pages);
+ obj_cgroup_put(objcg);
+ }
}
static void swap_zeromap_folio_clear(struct folio *folio)
@@ -503,6 +511,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
static bool swap_read_folio_zeromap(struct folio *folio)
{
int nr_pages = folio_nr_pages(folio);
+ struct obj_cgroup *objcg;
bool is_zeromap;
/*
@@ -517,6 +526,13 @@ static bool swap_read_folio_zeromap(struct folio *folio)
if (!is_zeromap)
return false;
+ objcg = get_obj_cgroup_from_folio(folio);
+ count_vm_events(SWPIN_ZERO, nr_pages);
+ if (objcg) {
+ count_objcg_events(objcg, SWPIN_ZERO, nr_pages);
+ obj_cgroup_put(objcg);
+ }
+
folio_zero_range(folio, 0, folio_size(folio));
folio_mark_uptodate(folio);
return true;
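The SWPOUT_ZERO/SWPIN_ZERO events count folios whose I/O was elided because their content is entirely zero. A standalone sketch of the zero-detection idea only, scanning a page-sized buffer word by word (illustrative, not the kernel's is_folio_zero_filled()):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

/* Return true if the page-sized buffer contains only zero bytes. */
static bool page_is_zero_filled(const void *page)
{
	const unsigned long *p = page;
	size_t i;

	for (i = 0; i < PAGE_SIZE / sizeof(*p); i++) {
		if (p[i])
			return false;
	}
	return true;
}

int main(void)
{
	unsigned long nr_skipped = 0;		/* would feed SWPOUT_ZERO */
	void *page = calloc(1, PAGE_SIZE);

	if (!page)
		return 1;
	if (page_is_zero_filled(page))
		nr_skipped++;			/* skip the write, set the zeromap bit */
	printf("zero pages skipped: %lu\n", nr_skipped);
	free(page);
	return 0;
}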
diff --git a/mm/swap.c b/mm/swap.c
index b8e3259ea2c4..59f30a981c6f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,20 +78,6 @@ static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
lruvec_del_folio(*lruvecp, folio);
__folio_clear_lru_flags(folio);
}
-
- /*
- * In rare cases, when truncation or holepunching raced with
- * munlock after VM_LOCKED was cleared, Mlocked may still be
- * found set here. This does not indicate a problem, unless
- * "unevictable_pgs_cleared" appears worryingly large.
- */
- if (unlikely(folio_test_mlocked(folio))) {
- long nr_pages = folio_nr_pages(folio);
-
- __folio_clear_mlocked(folio);
- zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
- count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
- }
}
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 46bd4b1a3c07..9c85bd46ab7f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -929,7 +929,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
si->highest_bit = 0;
del_from_avail_list(si);
- if (vm_swap_full())
+ if (si->cluster_info && vm_swap_full())
schedule_work(&si->reclaim_work);
}
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b5a4cea423e1..ac6a5aa34eab 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1415,6 +1415,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_SWAP
"swap_ra",
"swap_ra_hit",
+ "swpin_zero",
+ "swpout_zero",
#ifdef CONFIG_KSM
"ksm_swpin_copy",
#endif
diff --git a/mm/zswap.c b/mm/zswap.c
index 162013952074..0030ce8fecfc 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1053,7 +1053,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
count_vm_event(ZSWPWB);
if (entry->objcg)
- count_objcg_event(entry->objcg, ZSWPWB);
+ count_objcg_events(entry->objcg, ZSWPWB, 1);
zswap_entry_free(entry);
@@ -1483,7 +1483,7 @@ bool zswap_store(struct folio *folio)
if (objcg) {
obj_cgroup_charge_zswap(objcg, entry->length);
- count_objcg_event(objcg, ZSWPOUT);
+ count_objcg_events(objcg, ZSWPOUT, 1);
}
/*
@@ -1577,7 +1577,7 @@ bool zswap_load(struct folio *folio)
count_vm_event(ZSWPIN);
if (entry->objcg)
- count_objcg_event(entry->objcg, ZSWPIN);
+ count_objcg_events(entry->objcg, ZSWPIN, 1);
if (swapcache) {
zswap_entry_free(entry);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index ccbd2bc0d210..fc5666c8298f 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1109,6 +1109,7 @@ void virtio_transport_destruct(struct vsock_sock *vsk)
struct virtio_vsock_sock *vvs = vsk->trans;
kfree(vvs);
+ vsk->trans = NULL;
}
EXPORT_SYMBOL_GPL(virtio_transport_destruct);
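Clearing vsk->trans after freeing it turns a later stale access into a NULL dereference rather than a use-after-free, and makes a repeated destruct harmless. A generic userspace analogue of the free-then-NULL pattern (hypothetical types, for illustration only):

#include <stdlib.h>

struct sock_priv {
	char buf[64];
};

struct sock_ctx {
	struct sock_priv *trans;
};

static void ctx_destruct(struct sock_ctx *ctx)
{
	free(ctx->trans);
	ctx->trans = NULL;	/* stale users now fault on NULL, not freed memory */
}

int main(void)
{
	struct sock_ctx ctx = { .trans = calloc(1, sizeof(*ctx.trans)) };

	ctx_destruct(&ctx);
	/* A second destruct is now harmless: free(NULL) is a no-op. */
	ctx_destruct(&ctx);
	return 0;
}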
diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c
index f847e832ba14..57565dfd74a2 100644
--- a/samples/landlock/sandboxer.c
+++ b/samples/landlock/sandboxer.c
@@ -60,6 +60,25 @@ static inline int landlock_restrict_self(const int ruleset_fd,
#define ENV_SCOPED_NAME "LL_SCOPED"
#define ENV_DELIMITER ":"
+static int str2num(const char *numstr, __u64 *num_dst)
+{
+ char *endptr = NULL;
+ int err = 0;
+ __u64 num;
+
+ errno = 0;
+ num = strtoull(numstr, &endptr, 10);
+ if (errno != 0)
+ err = errno;
+ /* Was the string empty, or not entirely parsed successfully? */
+ else if ((*numstr == '\0') || (*endptr != '\0'))
+ err = EINVAL;
+ else
+ *num_dst = num;
+
+ return err;
+}
+
static int parse_path(char *env_path, const char ***const path_list)
{
int i, num_paths = 0;
@@ -160,7 +179,6 @@ static int populate_ruleset_net(const char *const env_var, const int ruleset_fd,
char *env_port_name, *env_port_name_next, *strport;
struct landlock_net_port_attr net_port = {
.allowed_access = allowed_access,
- .port = 0,
};
env_port_name = getenv(env_var);
@@ -171,7 +189,17 @@ static int populate_ruleset_net(const char *const env_var, const int ruleset_fd,
env_port_name_next = env_port_name;
while ((strport = strsep(&env_port_name_next, ENV_DELIMITER))) {
- net_port.port = atoi(strport);
+ __u64 port;
+
+ if (strcmp(strport, "") == 0)
+ continue;
+
+ if (str2num(strport, &port)) {
+ fprintf(stderr, "Failed to parse port at \"%s\"\n",
+ strport);
+ goto out_free_name;
+ }
+ net_port.port = port;
if (landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT,
&net_port, 0)) {
fprintf(stderr,
@@ -262,6 +290,44 @@ out_unset:
#define LANDLOCK_ABI_LAST 6
+#define XSTR(s) #s
+#define STR(s) XSTR(s)
+
+/* clang-format off */
+
+static const char help[] =
+ "usage: " ENV_FS_RO_NAME "=\"...\" " ENV_FS_RW_NAME "=\"...\" "
+ "[other environment variables] %1$s <cmd> [args]...\n"
+ "\n"
+ "Execute the given command in a restricted environment.\n"
+ "Multi-valued settings (lists of ports, paths, scopes) are colon-delimited.\n"
+ "\n"
+ "Mandatory settings:\n"
+ "* " ENV_FS_RO_NAME ": paths allowed to be used in a read-only way\n"
+ "* " ENV_FS_RW_NAME ": paths allowed to be used in a read-write way\n"
+ "\n"
+ "Optional settings (when not set, their associated access check "
+ "is always allowed, which is different from an empty string which "
+ "means an empty list):\n"
+ "* " ENV_TCP_BIND_NAME ": ports allowed to bind (server)\n"
+ "* " ENV_TCP_CONNECT_NAME ": ports allowed to connect (client)\n"
+ "* " ENV_SCOPED_NAME ": actions denied on the outside of the landlock domain\n"
+ " - \"a\" to restrict opening abstract unix sockets\n"
+ " - \"s\" to restrict sending signals\n"
+ "\n"
+ "Example:\n"
+ ENV_FS_RO_NAME "=\"${PATH}:/lib:/usr:/proc:/etc:/dev/urandom\" "
+ ENV_FS_RW_NAME "=\"/dev/null:/dev/full:/dev/zero:/dev/pts:/tmp\" "
+ ENV_TCP_BIND_NAME "=\"9418\" "
+ ENV_TCP_CONNECT_NAME "=\"80:443\" "
+ ENV_SCOPED_NAME "=\"a:s\" "
+ "%1$s bash -i\n"
+ "\n"
+ "This sandboxer can use Landlock features up to ABI version "
+ STR(LANDLOCK_ABI_LAST) ".\n";
+
+/* clang-format on */
+
int main(const int argc, char *const argv[], char *const *const envp)
{
const char *cmd_path;
@@ -280,47 +346,7 @@ int main(const int argc, char *const argv[], char *const *const envp)
};
if (argc < 2) {
- fprintf(stderr,
- "usage: %s=\"...\" %s=\"...\" %s=\"...\" %s=\"...\" %s=\"...\" %s "
- "<cmd> [args]...\n\n",
- ENV_FS_RO_NAME, ENV_FS_RW_NAME, ENV_TCP_BIND_NAME,
- ENV_TCP_CONNECT_NAME, ENV_SCOPED_NAME, argv[0]);
- fprintf(stderr,
- "Execute a command in a restricted environment.\n\n");
- fprintf(stderr,
- "Environment variables containing paths and ports "
- "each separated by a colon:\n");
- fprintf(stderr,
- "* %s: list of paths allowed to be used in a read-only way.\n",
- ENV_FS_RO_NAME);
- fprintf(stderr,
- "* %s: list of paths allowed to be used in a read-write way.\n\n",
- ENV_FS_RW_NAME);
- fprintf(stderr,
- "Environment variables containing ports are optional "
- "and could be skipped.\n");
- fprintf(stderr,
- "* %s: list of ports allowed to bind (server).\n",
- ENV_TCP_BIND_NAME);
- fprintf(stderr,
- "* %s: list of ports allowed to connect (client).\n",
- ENV_TCP_CONNECT_NAME);
- fprintf(stderr, "* %s: list of scoped IPCs.\n",
- ENV_SCOPED_NAME);
- fprintf(stderr,
- "\nexample:\n"
- "%s=\"${PATH}:/lib:/usr:/proc:/etc:/dev/urandom\" "
- "%s=\"/dev/null:/dev/full:/dev/zero:/dev/pts:/tmp\" "
- "%s=\"9418\" "
- "%s=\"80:443\" "
- "%s=\"a:s\" "
- "%s bash -i\n\n",
- ENV_FS_RO_NAME, ENV_FS_RW_NAME, ENV_TCP_BIND_NAME,
- ENV_TCP_CONNECT_NAME, ENV_SCOPED_NAME, argv[0]);
- fprintf(stderr,
- "This sandboxer can use Landlock features "
- "up to ABI version %d.\n",
- LANDLOCK_ABI_LAST);
+ fprintf(stderr, help, argv[0]);
return 1;
}
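The sandboxer now rejects malformed port strings instead of letting atoi() silently turn them into 0. A standalone illustration of the same strtoull() error-checking pattern, using a hypothetical parse_port() helper that additionally bounds the value to the TCP port range:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse an unsigned decimal port, rejecting empty strings, trailing garbage
 * and out-of-range values.  Returns 0 on success, an errno value otherwise.
 */
static int parse_port(const char *s, unsigned long long *out)
{
	char *end = NULL;
	unsigned long long val;

	errno = 0;
	val = strtoull(s, &end, 10);
	if (errno)
		return errno;		/* e.g. ERANGE on overflow */
	if (*s == '\0' || *end != '\0')
		return EINVAL;		/* empty or not entirely parsed */
	if (val > 65535)
		return ERANGE;		/* not a valid TCP port */
	*out = val;
	return 0;
}

int main(int argc, char *argv[])
{
	unsigned long long port;
	int i;

	for (i = 1; i < argc; i++) {
		if (parse_port(argv[i], &port))
			fprintf(stderr, "bad port: \"%s\"\n", argv[i]);
		else
			printf("port %llu ok\n", port);
	}
	return 0;
}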
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 6924ed508ebd..377e57e9084f 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -1084,7 +1084,8 @@ static void evm_file_release(struct file *file)
if (!S_ISREG(inode->i_mode) || !(mode & FMODE_WRITE))
return;
- if (iint && atomic_read(&inode->i_writecount) == 1)
+ if (iint && iint->flags & EVM_NEW_FILE &&
+ atomic_read(&inode->i_writecount) == 1)
iint->flags &= ~EVM_NEW_FILE;
}
diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c
index 4183956c53af..0e627eac9c33 100644
--- a/security/integrity/ima/ima_template_lib.c
+++ b/security/integrity/ima/ima_template_lib.c
@@ -318,15 +318,21 @@ static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize,
hash_algo_name[hash_algo]);
}
- if (digest)
+ if (digest) {
memcpy(buffer + offset, digest, digestsize);
- else
+ } else {
/*
* If digest is NULL, the event being recorded is a violation.
* Make room for the digest by increasing the offset by the
- * hash algorithm digest size.
+ * hash algorithm digest size. If the hash algorithm is not
+ * specified, increase the offset by IMA_DIGEST_SIZE, which
+ * fits SHA1 or MD5.
*/
- offset += hash_digest_size[hash_algo];
+ if (hash_algo < HASH_ALGO__LAST)
+ offset += hash_digest_size[hash_algo];
+ else
+ offset += IMA_DIGEST_SIZE;
+ }
return ima_write_template_field_data(buffer, offset + digestsize,
fmt, field_data);
diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h
index 660f76cb69d3..c2c2da691123 100644
--- a/security/integrity/integrity.h
+++ b/security/integrity/integrity.h
@@ -37,6 +37,8 @@ struct evm_ima_xattr_data {
);
u8 data[];
} __packed;
+static_assert(offsetof(struct evm_ima_xattr_data, data) == sizeof(struct evm_ima_xattr_data_hdr),
+ "struct member likely outside of __struct_group()");
/* Only used in the EVM HMAC code. */
struct evm_xattr {
@@ -65,6 +67,8 @@ struct ima_digest_data {
);
u8 digest[];
} __packed;
+static_assert(offsetof(struct ima_digest_data, digest) == sizeof(struct ima_digest_data_hdr),
+ "struct member likely outside of __struct_group()");
/*
* Instead of wrapping the ima_digest_data struct inside a local structure
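The new static_asserts catch a member being added outside the __struct_group()-defined header, which would desynchronize the header-only and full views of the structure. A standalone C11 sketch of the same offsetof()-based layout check, using an anonymous struct in place of __struct_group():

#include <assert.h>	/* static_assert (C11) */
#include <stddef.h>	/* offsetof */
#include <stdint.h>

/* Header part, mirrored inside the full struct. */
struct xattr_hdr {
	uint8_t type;
};

struct xattr_data {
	struct {
		uint8_t type;
	};			/* anonymous struct with the same members as xattr_hdr */
	uint8_t data[];		/* payload must start right after the header */
};

/* Fails to compile if someone adds a member outside the grouped header. */
static_assert(offsetof(struct xattr_data, data) == sizeof(struct xattr_hdr),
	      "struct member likely outside of the grouped header");

int main(void)
{
	return 0;
}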
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index 7d79fc8abe21..e31b97a9f175 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -389,37 +389,21 @@ static bool is_nouser_or_private(const struct dentry *dentry)
}
static access_mask_t
-get_raw_handled_fs_accesses(const struct landlock_ruleset *const domain)
-{
- access_mask_t access_dom = 0;
- size_t layer_level;
-
- for (layer_level = 0; layer_level < domain->num_layers; layer_level++)
- access_dom |=
- landlock_get_raw_fs_access_mask(domain, layer_level);
- return access_dom;
-}
-
-static access_mask_t
get_handled_fs_accesses(const struct landlock_ruleset *const domain)
{
/* Handles all initially denied by default access rights. */
- return get_raw_handled_fs_accesses(domain) |
+ return landlock_union_access_masks(domain).fs |
LANDLOCK_ACCESS_FS_INITIALLY_DENIED;
}
-static const struct landlock_ruleset *
-get_fs_domain(const struct landlock_ruleset *const domain)
-{
- if (!domain || !get_raw_handled_fs_accesses(domain))
- return NULL;
-
- return domain;
-}
+static const struct access_masks any_fs = {
+ .fs = ~0,
+};
static const struct landlock_ruleset *get_current_fs_domain(void)
{
- return get_fs_domain(landlock_get_current_domain());
+ return landlock_get_applicable_domain(landlock_get_current_domain(),
+ any_fs);
}
/*
@@ -1517,7 +1501,8 @@ static int hook_file_open(struct file *const file)
access_mask_t open_access_request, full_access_request, allowed_access,
optional_access;
const struct landlock_ruleset *const dom =
- get_fs_domain(landlock_cred(file->f_cred)->domain);
+ landlock_get_applicable_domain(
+ landlock_cred(file->f_cred)->domain, any_fs);
if (!dom)
return 0;
diff --git a/security/landlock/net.c b/security/landlock/net.c
index c8bcd29bde09..d5dcc4407a19 100644
--- a/security/landlock/net.c
+++ b/security/landlock/net.c
@@ -39,27 +39,9 @@ int landlock_append_net_rule(struct landlock_ruleset *const ruleset,
return err;
}
-static access_mask_t
-get_raw_handled_net_accesses(const struct landlock_ruleset *const domain)
-{
- access_mask_t access_dom = 0;
- size_t layer_level;
-
- for (layer_level = 0; layer_level < domain->num_layers; layer_level++)
- access_dom |= landlock_get_net_access_mask(domain, layer_level);
- return access_dom;
-}
-
-static const struct landlock_ruleset *get_current_net_domain(void)
-{
- const struct landlock_ruleset *const dom =
- landlock_get_current_domain();
-
- if (!dom || !get_raw_handled_net_accesses(dom))
- return NULL;
-
- return dom;
-}
+static const struct access_masks any_net = {
+ .net = ~0,
+};
static int current_check_access_socket(struct socket *const sock,
struct sockaddr *const address,
@@ -72,7 +54,9 @@ static int current_check_access_socket(struct socket *const sock,
struct landlock_id id = {
.type = LANDLOCK_KEY_NET_PORT,
};
- const struct landlock_ruleset *const dom = get_current_net_domain();
+ const struct landlock_ruleset *const dom =
+ landlock_get_applicable_domain(landlock_get_current_domain(),
+ any_net);
if (!dom)
return 0;
diff --git a/security/landlock/ruleset.h b/security/landlock/ruleset.h
index 61bdbc550172..631e24d4ffe9 100644
--- a/security/landlock/ruleset.h
+++ b/security/landlock/ruleset.h
@@ -11,6 +11,7 @@
#include <linux/bitops.h>
#include <linux/build_bug.h>
+#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/refcount.h>
@@ -47,6 +48,15 @@ struct access_masks {
access_mask_t scope : LANDLOCK_NUM_SCOPE;
};
+union access_masks_all {
+ struct access_masks masks;
+ u32 all;
+};
+
+/* Makes sure all fields are covered. */
+static_assert(sizeof(typeof_member(union access_masks_all, masks)) ==
+ sizeof(typeof_member(union access_masks_all, all)));
+
typedef u16 layer_mask_t;
/* Makes sure all layers can be checked. */
static_assert(BITS_PER_TYPE(layer_mask_t) >= LANDLOCK_MAX_NUM_LAYERS);
@@ -260,6 +270,61 @@ static inline void landlock_get_ruleset(struct landlock_ruleset *const ruleset)
refcount_inc(&ruleset->usage);
}
+/**
+ * landlock_union_access_masks - Return all access rights handled in the
+ * domain
+ *
+ * @domain: Landlock ruleset (used as a domain)
+ *
+ * Returns: an access_masks result of the OR of all the domain's access masks.
+ */
+static inline struct access_masks
+landlock_union_access_masks(const struct landlock_ruleset *const domain)
+{
+ union access_masks_all matches = {};
+ size_t layer_level;
+
+ for (layer_level = 0; layer_level < domain->num_layers; layer_level++) {
+ union access_masks_all layer = {
+ .masks = domain->access_masks[layer_level],
+ };
+
+ matches.all |= layer.all;
+ }
+
+ return matches.masks;
+}
+
+/**
+ * landlock_get_applicable_domain - Return @domain if it applies to (handles)
+ * at least one of the access rights specified
+ * in @masks
+ *
+ * @domain: Landlock ruleset (used as a domain)
+ * @masks: access masks
+ *
+ * Returns: @domain if any access right specified in @masks is handled, or
+ * NULL otherwise.
+ */
+static inline const struct landlock_ruleset *
+landlock_get_applicable_domain(const struct landlock_ruleset *const domain,
+ const struct access_masks masks)
+{
+ const union access_masks_all masks_all = {
+ .masks = masks,
+ };
+ union access_masks_all merge = {};
+
+ if (!domain)
+ return NULL;
+
+ merge.masks = landlock_union_access_masks(domain);
+ if (merge.all & masks_all.all)
+ return domain;
+
+ return NULL;
+}
+
static inline void
landlock_add_fs_access_mask(struct landlock_ruleset *const ruleset,
const access_mask_t fs_access_mask,
@@ -296,18 +361,11 @@ landlock_add_scope_mask(struct landlock_ruleset *const ruleset,
}
static inline access_mask_t
-landlock_get_raw_fs_access_mask(const struct landlock_ruleset *const ruleset,
- const u16 layer_level)
-{
- return ruleset->access_masks[layer_level].fs;
-}
-
-static inline access_mask_t
landlock_get_fs_access_mask(const struct landlock_ruleset *const ruleset,
const u16 layer_level)
{
/* Handles all initially denied by default access rights. */
- return landlock_get_raw_fs_access_mask(ruleset, layer_level) |
+ return ruleset->access_masks[layer_level].fs |
LANDLOCK_ACCESS_FS_INITIALLY_DENIED;
}
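landlock_union_access_masks() relies on overlaying the access_masks bitfields with a plain u32 so that a whole layer can be merged with a single OR. A standalone sketch of that union trick (field widths are illustrative, not the actual LANDLOCK_NUM_* values):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct masks {
	uint32_t fs : 16;
	uint32_t net : 2;
	uint32_t scope : 2;
};

union masks_all {
	struct masks masks;
	uint32_t all;
};

/* The trick only works if the bitfields fit the plain-integer view. */
static_assert(sizeof(struct masks) == sizeof(uint32_t), "masks must fit in u32");

int main(void)
{
	const struct masks layers[] = {
		{ .fs = 0x00ff },
		{ .net = 0x3 },
		{ .scope = 0x1 },
	};
	union masks_all merged = { .all = 0 };
	size_t i;

	for (i = 0; i < sizeof(layers) / sizeof(layers[0]); i++) {
		union masks_all layer = { .masks = layers[i] };

		merged.all |= layer.all;	/* one OR merges fs, net and scope */
	}
	printf("fs=%#x net=%#x scope=%#x\n",
	       merged.masks.fs, merged.masks.net, merged.masks.scope);
	return 0;
}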
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index f5a0e7182ec0..c097d356fa45 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -329,7 +329,7 @@ static int add_rule_path_beneath(struct landlock_ruleset *const ruleset,
return -ENOMSG;
/* Checks that allowed_access matches the @ruleset constraints. */
- mask = landlock_get_raw_fs_access_mask(ruleset, 0);
+ mask = ruleset->access_masks[0].fs;
if ((path_beneath_attr.allowed_access | mask) != mask)
return -EINVAL;
diff --git a/security/landlock/task.c b/security/landlock/task.c
index 4acbd7c40eee..dc7dab78392e 100644
--- a/security/landlock/task.c
+++ b/security/landlock/task.c
@@ -204,12 +204,17 @@ static bool is_abstract_socket(struct sock *const sock)
return false;
}
+static const struct access_masks unix_scope = {
+ .scope = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET,
+};
+
static int hook_unix_stream_connect(struct sock *const sock,
struct sock *const other,
struct sock *const newsk)
{
const struct landlock_ruleset *const dom =
- landlock_get_current_domain();
+ landlock_get_applicable_domain(landlock_get_current_domain(),
+ unix_scope);
/* Quick return for non-landlocked tasks. */
if (!dom)
@@ -225,7 +230,8 @@ static int hook_unix_may_send(struct socket *const sock,
struct socket *const other)
{
const struct landlock_ruleset *const dom =
- landlock_get_current_domain();
+ landlock_get_applicable_domain(landlock_get_current_domain(),
+ unix_scope);
if (!dom)
return 0;
@@ -243,6 +249,10 @@ static int hook_unix_may_send(struct socket *const sock,
return 0;
}
+static const struct access_masks signal_scope = {
+ .scope = LANDLOCK_SCOPE_SIGNAL,
+};
+
static int hook_task_kill(struct task_struct *const p,
struct kernel_siginfo *const info, const int sig,
const struct cred *const cred)
@@ -256,6 +266,7 @@ static int hook_task_kill(struct task_struct *const p,
} else {
dom = landlock_get_current_domain();
}
+ dom = landlock_get_applicable_domain(dom, signal_scope);
/* Quick return for non-landlocked tasks. */
if (!dom)
@@ -279,7 +290,8 @@ static int hook_file_send_sigiotask(struct task_struct *tsk,
/* Lock already held by send_sigio() and send_sigurg(). */
lockdep_assert_held(&fown->lock);
- dom = landlock_file(fown->file)->fown_domain;
+ dom = landlock_get_applicable_domain(
+ landlock_file(fown->file)->fown_domain, signal_scope);
/* Quick return for unowned socket. */
if (!dom)
diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
index 8bc626ede1c4..c4b3fdda9a0b 100644
--- a/tools/sched_ext/scx_show_state.py
+++ b/tools/sched_ext/scx_show_state.py
@@ -35,6 +35,6 @@ print(f'enabled : {read_static_key("__scx_ops_enabled")}')
print(f'switching_all : {read_int("scx_switching_all")}')
print(f'switched_all : {read_static_key("__scx_switched_all")}')
print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
-print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
+print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}')
print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
print(f'enable_seq : {read_atomic("scx_enable_seq")}')
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 156fbfae940f..48645a2e29da 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -241,16 +241,18 @@ CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
-Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \
-fno-builtin-memcmp -fno-builtin-memcpy \
-fno-builtin-memset -fno-builtin-strnlen \
- -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
- -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
- -I$(<D) -Iinclude/$(ARCH_DIR) -I ../rseq -I.. $(EXTRA_CFLAGS) \
- $(KHDR_INCLUDES)
+ -fno-stack-protector -fno-PIE -fno-strict-aliasing \
+ -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_TOOL_ARCH_INCLUDE) \
+ -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(ARCH_DIR) \
+ -I ../rseq -I.. $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
ifeq ($(ARCH),s390)
CFLAGS += -march=z10
endif
ifeq ($(ARCH),x86)
+ifeq ($(shell echo "void foo(void) { }" | $(CC) -march=x86-64-v2 -x c - -c -o /dev/null 2>/dev/null; echo "$$?"),0)
CFLAGS += -march=x86-64-v2
endif
+endif
ifeq ($(ARCH),arm64)
tools_dir := $(top_srcdir)/tools
arm64_tools_dir := $(tools_dir)/arch/arm64/tools/
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
index ba0c8e996035..ce687f8d248f 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -134,7 +134,7 @@ static void test_create_guest_memfd_invalid(struct kvm_vm *vm)
size);
}
- for (flag = 0; flag; flag <<= 1) {
+ for (flag = BIT(0); flag; flag <<= 1) {
fd = __vm_create_guest_memfd(vm, page_size, flag);
TEST_ASSERT(fd == -1 && errno == EINVAL,
"guest_memfd() with flag '0x%lx' should fail with EINVAL",
diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
index 089b8925b6b2..d7ac122820bf 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
@@ -200,7 +200,7 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx)
if (vmx->eptp_gpa) {
uint64_t ept_paddr;
struct eptPageTablePointer eptp = {
- .memory_type = VMX_BASIC_MEM_TYPE_WB,
+ .memory_type = X86_MEMTYPE_WB,
.page_walk_length = 3, /* + 1 */
.ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS),
.address = vmx->eptp_gpa >> PAGE_SHIFT_4K,
diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index 989ffe0d047f..e3711beff7f3 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -417,7 +417,7 @@ static bool _guest_should_exit(void)
*/
static noinline void host_perform_sync(struct sync_area *sync)
{
- alarm(2);
+ alarm(10);
atomic_store_explicit(&sync->sync_flag, true, memory_order_release);
while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire))
diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c
index 60001c142ce9..432d5af15e66 100644
--- a/tools/testing/selftests/mm/hugetlb_dio.c
+++ b/tools/testing/selftests/mm/hugetlb_dio.c
@@ -44,6 +44,13 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
if (fd < 0)
ksft_exit_fail_perror("Error opening file\n");
+ /* Get the free huge pages before allocation */
+ free_hpage_b = get_free_hugepages();
+ if (free_hpage_b == 0) {
+ close(fd);
+ ksft_exit_skip("No free hugepage, exiting!\n");
+ }
+
/* Allocate a hugetlb page */
orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0);
if (orig_buffer == MAP_FAILED) {
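The selftest now skips early when no huge pages are free. get_free_hugepages() itself is not shown in this hunk; a minimal standalone equivalent (assuming the usual /proc/meminfo format) could look like:

#include <stdio.h>

/* Return the number of free default-size huge pages, or -1 on error. */
static long free_hugepages(void)
{
	char line[256];
	long nr = -1;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "HugePages_Free: %ld", &nr) == 1)
			break;
	}
	fclose(f);
	return nr;
}

int main(void)
{
	long nr = free_hugepages();

	if (nr <= 0) {
		fprintf(stderr, "no free huge pages, skipping\n");
		return 4;	/* KSFT_SKIP-like exit code */
	}
	printf("free huge pages: %ld\n", nr);
	return 0;
}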
diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c
index 43d3a6aa1dcf..b9591223437a 100644
--- a/tools/virtio/vringh_test.c
+++ b/tools/virtio/vringh_test.c
@@ -519,7 +519,7 @@ int main(int argc, char *argv[])
errx(1, "virtqueue_add_sgs: %i", err);
__kmalloc_fake = NULL;
- /* Host retreives it. */
+ /* Host retrieves it. */
vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));